def build_model(self, **kwargs):
    ti = kwargs['ti']
    streams = ti.xcom_pull(task_ids='load_stream')
    users = ti.xcom_pull(task_ids='load_users')
    ab_user = ti.xcom_pull(task_ids='label')

    # Preprocessing: default every user to "normal", impute empty birth years
    # with the most frequent value, then label-encode the categoricals.
    users['abnormal'] = [0] * users.shape[0]
    users.loc[users['birth_year'] == '', 'birth_year'] = \
        users['birth_year'].value_counts().index[0]
    users['birth_year'] = users['birth_year'].astype('int')
    for col in ['access', 'gender']:
        le = preprocessing.LabelEncoder()
        users[col] = le.fit_transform(users[col])

    # Feature engineering: treat each user's listening history as a
    # "document" of track ids, TF-IDF weight it, and factorize with ALS.
    vectorizer = CountVectorizer()
    streams = streams.groupby('user_id')['track_id'].apply(
        lambda x: ' '.join(x))
    counts = vectorizer.fit_transform(streams)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(counts)
    als = implicit.als.AlternatingLeastSquares(factors=50)
    als.fit(tfidf.T)  # ALS expects an item-user matrix
    user_factors = pd.DataFrame(data=als.user_factors)

    # Align the latent user factors with the user table by user_id order.
    users = users.sort_values(by='user_id')
    users.reset_index(drop=True, inplace=True)
    user_factors.reset_index(drop=True, inplace=True)
    users = pd.concat([users, user_factors], axis=1, ignore_index=True)
    users.rename(columns={
        0: 'access',
        1: 'birth_year',
        2: 'country',
        3: 'gender',
        4: 'user_id',
        5: 'abnormal'
    }, inplace=True)

    # Set X, y and hold out 30% for testing.
    y = users['abnormal']
    X = users.drop(['abnormal', 'user_id'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.3, random_state=42)

    # Training
    clf = RandomForestClassifier(n_jobs=10)
    clf.fit(X_train, y_train)
    clf.score(X_test, y_test)

    # Testing: persist the confusion matrix to Cloud Storage.
    y_pred = clf.predict(X_test)
    cf = pd.DataFrame(data=confusion_matrix(y_test, y_pred))
    gcs.Bucket('ru_test').item('cf.csv').write_to(cf.to_csv(index=False),
                                                  'text/csv')
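# Hedged wiring sketch: `build_model` pulls its inputs over XCom, so the task
# ids below must match the xcom_pull calls above. The DAG name, schedule,
# pipeline class, and the `label` callable are assumptions; only the task ids
# come from the source.
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

pipeline = AbnormalUserPipeline()  # hypothetical class holding the methods

with DAG('abnormal_user_detection', start_date=datetime(2020, 1, 1),
         schedule_interval=None) as dag:
    load_stream = PythonOperator(task_id='load_stream', provide_context=True,
                                 python_callable=pipeline.load_stream)
    load_users = PythonOperator(task_id='load_users', provide_context=True,
                                python_callable=pipeline.load_users)
    label = PythonOperator(task_id='label', provide_context=True,
                           python_callable=pipeline.label)  # not shown here
    build_model = PythonOperator(task_id='build_model', provide_context=True,
                                 python_callable=pipeline.build_model)

    [load_stream, load_users] >> label >> build_model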
class TabsMR(object):
    def __init__(self):
        self.rawdata = None
        self.access_granted = False  # used to verify API access
        self.access_message = None
        self.bucket_name = None
        self.registered_rawdata = False

    def credentials(self, certificate):
        """
        certificate = {
            "account_type": "tabulation",
            "client": "clientname",
            "project_id": "project_id",
            "private_key_id": "a4a9f3c7600081ea9bad46ece1b158e2f16454e2",
            "user_email": "*****@*****.**",
            "user_id": "101670599119528512817",
        }
        """
        self.certificate = certificate
        conn = psycopg2.connect(
            host='localhost',
            port=54320,
            dbname='my_database',
            user='******',
        )
        cur = conn.cursor()
        # Look up the stored private key for this account with a
        # parameterized query.
        cur.execute("""
            SELECT private_key_id FROM public.credentials
            WHERE account_type = %s
              AND project_id = %s
              AND user_email = %s
              AND user_id = %s;""", (
            certificate["account_type"],
            certificate["project_id"],
            certificate["user_email"],
            certificate["user_id"],
        ))
        rec = cur.fetchone()
        if rec is not None and rec[0] == certificate["private_key_id"]:
            self.access_granted = True
            self.access_message = "Your access credentials are granted!"
        else:
            self.access_message = ("Your access credentials were denied; "
                                   "please recheck your key!")

    def do_register_rawdata(self, csv_path):
        """Register the raw data CSV to the Google Storage bucket."""
        if not self.registered_rawdata:
            a = pd.read_csv(csv_path)
            gcs.Bucket(self.bucket_name) \
                .item(self.certificate["client"] + "/dataset_"
                      + self.certificate["project_id"] + ".csv") \
                .write_to(a.to_csv(), 'text/csv')
            self.registered_rawdata = True
        else:
            print("Your raw data is already registered")
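# Usage sketch: the certificate values, bucket name, and CSV path below are
# placeholders for illustration, not real credentials from the source.
tabs = TabsMR()
tabs.bucket_name = 'my-tabulation-bucket'
tabs.credentials({
    "account_type": "tabulation",
    "client": "clientname",
    "project_id": "project_id",
    "private_key_id": "a4a9f3c7600081ea9bad46ece1b158e2f16454e2",
    "user_email": "user@example.com",
    "user_id": "101670599119528512817",
})
print(tabs.access_message)
if tabs.access_granted:
    tabs.do_register_rawdata('data/raw.csv')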
def update_record(station, weather):
    print(f"Updating {station['address']}")
    # Addresses containing "/" would break the object path, so that station's
    # records live under a fixed file name.
    if "/" in station['address']:
        data = pd.read_csv(
            'gs://dbikes-planner.appspot.com/station_records/Princes Street.csv'
        )
    else:
        data = pd.read_csv(
            f"gs://dbikes-planner.appspot.com/station_records/{station['address']}.csv"
        )

    epoch_time = station['last_update']
    entry_datetime = datetime.fromtimestamp(epoch_time / 1000)

    # Skip readings taken in the 03:30-05:00 window.
    if is_time_between(time(3, 30), time(5, 0), entry_datetime.time()):
        return

    # Skip entries already appended to the record file.
    last_line = data.tail(3).to_csv()
    if entry_datetime.isoformat() in last_line:
        return

    # type_of_day: 0 for weekdays, 10 for weekends.
    day_index = entry_datetime.weekday()
    if day_index <= 4:
        day_type = 0
    else:
        day_type = 10

    new_row = {
        'available_bikes': station['available_bikes'],
        'available_bike_stands': station['available_bike_stands'],
        'time_of_day': datetime_to_seconds(entry_datetime),
        'type_of_day': day_type,
        'day_of_year': entry_datetime.timetuple().tm_yday,
        'iso_date': entry_datetime.isoformat(),
        'temperature': weather['temperature'],
        'relative_humidity': weather['humidity'],
        'wind_speed': weather['wind_speed'],
        'rain': weather['rain'],
        'visibility': weather['visibility'],
        'bike_availability': category(station['available_bikes']),
        'bike_stand_availability': category(station['available_bike_stands']),
        'unix_timestamp': entry_datetime.timestamp() // 3600
    }
    new_row_dataframe = pd.DataFrame(new_row, index=[0])
    combined_df = pd.concat([data, new_row_dataframe], ignore_index=True)
    gcs.Bucket('dbikes-planner.appspot.com') \
        .item(f'station_records/{station["address"]}.csv') \
        .write_to(combined_df.to_csv(index=False), 'text/csv')
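# `update_record` relies on three helpers that are not shown
# (`is_time_between`, `datetime_to_seconds`, `category`). These are minimal
# sketches of what they plausibly do, inferred from the call sites; the
# cut-offs in `category` are assumptions, not values from the source.
from datetime import datetime, time

def is_time_between(begin: time, end: time, check: time) -> bool:
    # True when `check` falls inside [begin, end], handling midnight wrap.
    if begin <= end:
        return begin <= check <= end
    return check >= begin or check <= end

def datetime_to_seconds(dt: datetime) -> int:
    # Seconds elapsed since midnight of the same day.
    return dt.hour * 3600 + dt.minute * 60 + dt.second

def category(count: int) -> int:
    # Bucket raw availability counts into coarse classes (thresholds assumed).
    if count == 0:
        return 0
    return 1 if count < 5 else 2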
def load_stream(self, **kwargs):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = \
        '/disk/ru/My First Project-d1196e9f3e13.json'
    s = str(gcs.Bucket('ru_test').item('streams').read_from(), 'utf-8')
    streams = pd.read_json(StringIO(s), lines=True)
    streams['timestamp'] = pd.to_datetime(streams['timestamp'],
                                          format='%Y-%m-%d %H:%M:%S')
    streams.sort_values(by=['timestamp'], inplace=True)
    streams.reset_index(inplace=True, drop=True)
    # Seconds elapsed since each user's previous stream event.
    streams['delta'] = streams.groupby(
        ['user_id'])['timestamp'].diff().dt.total_seconds()
    return streams
from datalab.context import Context
import datalab.storage as storage
import datalab.bigquery as bq
import pandas as pd

# Dataframe to write
simple_dataframe = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]],
                                columns=['a', 'b', 'c'])

sample_bucket_name = Context.default().project_id + '-datalab-example'
sample_bucket_path = 'gs://' + sample_bucket_name
sample_bucket_object = sample_bucket_path + '/Hello.txt'
bigquery_dataset_name = 'TestDataSet'
bigquery_table_name = 'TestTable'

# Define storage bucket
sample_bucket = storage.Bucket(sample_bucket_name)

# Create storage bucket if it does not exist
if not sample_bucket.exists():
    sample_bucket.create()

# Define BigQuery dataset and table
dataset = bq.Dataset(bigquery_dataset_name)
table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)

# Create BigQuery dataset
if not dataset.exists():
    dataset.create()

# Create or overwrite the existing table if it exists
table_schema = bq.Schema.from_dataframe(simple_dataframe)
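# The snippet above stops after deriving the schema. A plausible continuation
# using the same datalab API, hedged: it creates the table and streams the
# DataFrame rows in (streamed rows can take a moment to become queryable).
table.create(schema=table_schema, overwrite=True)
table.insert_data(simple_dataframe)

# The bucket item defined above can hold the same data as CSV text.
sample_bucket.item('Hello.txt').write_to(simple_dataframe.to_csv(index=False),
                                         'text/csv')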
def write_to_storage(bucket_name, df_to_write, file_name_in_bucket):
    """Serialize a DataFrame to CSV and upload it to the given bucket item."""
    gcs.Bucket(bucket_name).item(file_name_in_bucket).write_to(
        df_to_write.to_csv(), 'text/csv')
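# Usage sketch; the bucket and object names here are placeholders, not names
# taken from the source.
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
write_to_storage('my-example-bucket', df, 'exports/example.csv')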
        print(Gini(train_label[valid_index], cv_train[valid_index]))
        cv_pred += model.predict(x=X_test, batch_size=512, verbose=0)[:, 0]

    print(Gini(train_label, cv_train))
    result = pd.DataFrame({'id': test_id, 'target': cv_pred * 1. / NFOLDS})
    return result


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir',
                        help='GCS or local path to training and testing data',
                        required=True)
    parser.add_argument('--output-name',
                        help='File name for the result CSV written to GCS',
                        required=True)
    args = parser.parse_args()
    data_dir = args.data_dir
    output_name = args.output_name

    outputpath = "Potro/output/" + str(output_name) + '.csv'
    result = main(data_dir)
    gcs.Bucket('stevenwho').item(outputpath).write_to(result.to_csv(),
                                                      'text/csv')
def train(self):
    self.train_model = Model(self.max_size)  # to be changed
    batchsize = 275
    self.train_model.build_model()
    saver = tf.train.Saver(max_to_keep=10)
    summary_proto = tf.Summary()

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())
        sess.run(init)
        writer_file_path = os.path.join(FLAGS.output_dir,
                                        FLAGS.experiment_name,
                                        'improved_graph')
        checkpoint_file = os.path.join(FLAGS.output_dir,
                                       FLAGS.experiment_name, 'checkpoints')
        writer = tf.summary.FileWriter(writer_file_path, sess.graph)

        for epoch in range(0, self.epochs):
            print("Epoch number " + str(epoch))
            batch_idx = 0
            training_loss = 0.0
            for batch in self.shuffle_batches(self.X_train, self.y_train,
                                              batchsize):
                inputs, targets = batch
                feed_dict = {self.train_model.x: inputs,
                             self.train_model.y: targets,
                             self.train_model.is_training: True}
                global_step, summary_train, accuracy, network_loss, _ = sess.run(
                    [self.train_model.global_step,
                     self.train_model.summary_ops,
                     self.train_model.acc_op,
                     self.train_model.loss,
                     self.train_model.train_op],
                    feed_dict=feed_dict)
                training_loss += network_loss
                batch_idx += 1
                writer.add_summary(summary_train, global_step=global_step)
                print('Epoch ', epoch, ' and Batch ', batch_idx,
                      ' | training loss is ', training_loss / batch_idx)
                # if batch_idx % 10 == 0:
                #     saver.save(sess, checkpoint_file, global_step=global_step)
                #     summary_proto.ParseFromString(summary_train)

            num_of_training_batches = batch_idx

            # Validation on the held-out split.
            validation_feed = {self.train_model.x: self.X_test,
                               self.train_model.y: self.y_test,
                               self.train_model.is_training: False}
            [predicted_classes] = sess.run([self.train_model.pred_classes],
                                           feed_dict=validation_feed)
            print(predicted_classes)
            test_acc = f1_score(np.array(self.y_test), predicted_classes,
                                average='micro')
            print('Epoch ', epoch, ' got score of ', test_acc)

            # Final testing: predict on the unlabeled test input.
            testing_feed = {self.train_model.x: self.test_input,
                            self.train_model.is_training: False}
            [predicted_classes] = sess.run([self.train_model.pred_classes],
                                           feed_dict=testing_feed)
            predicted_output_df = pd.DataFrame(data={'y': predicted_classes})

            if epoch % 5 == 0:
                print('Epoch ', epoch, ' saved a new estimated file')
                gcs.Bucket('aml-project3').item('output_deep/data.csv') \
                    .write_to(predicted_output_df.to_csv(index_label='id'),
                              'text/csv')
def read_csv(file, nrows, usecols=None):
    """Read a CSV item from the project-bagandata bucket into a DataFrame."""
    stream = dlb_storage.Bucket('project-bagandata').item(file).read_from()
    data = pd.read_csv(BytesIO(stream),
                       engine='c',
                       nrows=nrows,
                       low_memory=False,
                       header='infer',
                       usecols=usecols)
    return data
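# Example call; the file name and column list are illustrative, not taken
# from the source.
df = read_csv('events.csv', nrows=1000, usecols=['user_id', 'timestamp'])
print(df.shape)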
def load_users(self, **kwargs):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = \
        '/disk/ru/My First Project-d1196e9f3e13.json'
    s = str(gcs.Bucket('ru_test').item('users').read_from(), 'utf-8')
    users = pd.read_json(StringIO(s), lines=True)
    return users