def get_size(data_dir):
    if not exists(os.path.join(data_dir, "train_parquet")) or \
            not exists(os.path.join(data_dir, "test_parquet")):
        raise Exception("train_parquet and test_parquet are not found under " + data_dir)
    else:
        train_tbl = FeatureTable.read_parquet(
            os.path.join(data_dir, "train_parquet"))
        test_tbl = FeatureTable.read_parquet(
            os.path.join(data_dir, "test_parquet"))

    # get cat sizes
    with tempfile.TemporaryDirectory() as local_path:
        get_remote_file_to_local(
            os.path.join(data_dir, "meta/categorical_sizes.pkl"),
            os.path.join(local_path, "categorical_sizes.pkl"))
        with open(os.path.join(local_path, "categorical_sizes.pkl"), 'rb') as f:
            cat_sizes_dic = pickle.load(f)

    indicator_sizes = [cat_sizes_dic[c] for c in indicator_cols]
    print("indicator sizes: ", indicator_sizes)
    embedding_sizes = [cat_sizes_dic[c] for c in embedding_cols]
    print("embedding sizes: ", embedding_sizes)
    cross_sizes = [cat_sizes_dic[c] for c in cross_cols]

    return train_tbl, test_tbl, indicator_sizes, embedding_sizes, cross_sizes
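# --- Hypothetical usage sketch (not part of the original script) -------------
# Shows how the tables and size lists returned by get_size would typically be
# consumed; the data_dir default and the printed summary are illustrative only.
def _demo_get_size(data_dir="hdfs://path/to/preprocessed"):
    train_tbl, test_tbl, indicator_sizes, embedding_sizes, cross_sizes = \
        get_size(data_dir)
    # Each size is the cardinality of a categorical column and is used to set
    # the width of the corresponding one-hot / embedding / cross layer downstream.
    print("indicator cols:", len(indicator_sizes),
          "embedding cols:", len(embedding_sizes),
          "cross cols:", len(cross_sizes))
    return train_tbl, test_tbl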
def test_exists_s3(self):
    access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    if access_key_id and secret_access_key:
        file_path = "s3://analytics-zoo-data/nyc_taxi.csv"
        assert exists(file_path)
        file_path = "s3://analytics-zoo-data/abc.csv"
        assert not exists(file_path)
def test_mkdirs_local_2(self):
    temp = tempfile.mkdtemp()
    path = os.path.join(temp, "dir1")
    makedirs("file://" + path)
    assert exists("file://" + path)
    path = os.path.join(temp, "dir2/dir3")
    makedirs("file://" + path)
    assert exists("file://" + path)
    shutil.rmtree(temp)
def test_mkdirs_s3(self):
    access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    if access_key_id and secret_access_key:
        file_path = "s3://analytics-zoo-data/temp/abc/"
        makedirs(file_path)
        assert exists(file_path)
        import boto3
        s3_client = boto3.Session(
            aws_access_key_id=access_key_id,
            aws_secret_access_key=secret_access_key).client('s3', verify=False)
        s3_client.delete_object(Bucket='analytics-zoo-data', Key='temp/abc/')
for i in range(len(list_cols)):
    cat_sizes_dict[list_cols[i]] = indexes[i].size()
cat_sizes_dict['engaged_with_user_id'] = user_index.size()
cat_sizes_dict['enaging_user_id'] = user_index.size()

cross_sizes_dict = dict(
    zip(["_".join(cross_names) for cross_names in cross_cols], args.cross_sizes))
cat_sizes_dict.update(cross_sizes_dict)

count_sizes_dict = dict(zip(count_cols, [len(bins)] * len(count_cols)))
cat_sizes_dict.update(count_sizes_dict)
print("cat size dict: ", cat_sizes_dict)

if not exists(os.path.join(args.output_folder, "meta")):
    makedirs(os.path.join(args.output_folder, "meta"))
with tempfile.TemporaryDirectory() as local_path:
    with open(os.path.join(local_path, "categorical_sizes.pkl"), 'wb') as f:
        pickle.dump(cat_sizes_dict, f)
    put_local_file_to_remote(
        os.path.join(local_path, "categorical_sizes.pkl"),
        os.path.join(args.output_folder, "meta/categorical_sizes.pkl"),
        over_write=True)

end = time()
print("Preprocessing and save time: ", end - start)
"intra_op_parallelism": 40 } est = Estimator.from_keras(model_creator=model_creator, verbose=True, config=config, backend="tf2") train_count = train_tbl.size() print("train size: ", train_count) steps = math.ceil(train_count / options.batch_size) test_count = test_tbl.size() print("test size: ", test_count) val_steps = math.ceil(test_count / options.batch_size) if not exists(options.model_dir): makedirs(options.model_dir) callbacks = [] # early stopping earlystopping = options.early_stopping if earlystopping: from tensorflow.keras.callbacks import EarlyStopping callbacks.append( EarlyStopping(monitor='val_auc', mode='max', verbose=1, patience=earlystopping))
def test_exists_local(self):
    file_path = os.path.join(self.resource_path, "orca/data/random.npy")
    assert exists("file://" + file_path)
    file_path = os.path.join(self.resource_path, "orca/data/abc.npy")
    assert not exists("file://" + file_path)
train_data, test_data, n_uid, n_mid, n_cat = load_dien_data(args.data_dir)

model = build_model(args.model_type, n_uid, n_mid, n_cat, args.lr, args.data_type)
[inputs, feature_cols] = align_input_features(model)

estimator = Estimator.from_graph(inputs=inputs, outputs=[model.y_hat],
                                 labels=[model.target_ph], loss=model.loss,
                                 optimizer=model.optim,
                                 model_dir=args.model_dir,
                                 metrics={'loss': model.loss,
                                          'accuracy': model.accuracy})

estimator.fit(train_data.df, epochs=args.epochs, batch_size=args.batch_size,
              feature_cols=feature_cols, label_cols=['label'],
              validation_data=test_data.df)

ckpts_dir = os.path.join(args.model_dir, 'ckpts/')
if not exists(ckpts_dir):
    makedirs(ckpts_dir)
snapshot_path = ckpts_dir + "ckpt_" + args.model_type
estimator.save_tf_checkpoint(snapshot_path)
time_train = time.time()
print(f"perf training time: {(time_train - time_start):.2f}")

result = estimator.evaluate(test_data.df, args.batch_size,
                            feature_cols=feature_cols,
                            label_cols=['label'])

evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label_t",
                                          metricName="areaUnderROC")

prediction_df = estimator.predict(test_data.df, feature_cols=feature_cols)
prediction_df.cache()
transform_label = udf(lambda x: int(x[1]), "int")
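# --- Hedged sketch (the continuation is an assumption, not the original code) -
# Illustrates how the transform_label udf and the BinaryClassificationEvaluator
# defined above could be combined to compute AUC on the prediction DataFrame;
# the "label_t" column name matches the evaluator's labelCol.
from pyspark.sql.functions import col

def _demo_auc(prediction_df, evaluator, transform_label):
    # Extract the positive-class label from the "label" array column, then let
    # the evaluator compare it against the raw "prediction" column.
    labeled_df = prediction_df.withColumn("label_t", transform_label(col("label")))
    auc = evaluator.evaluate(labeled_df)
    print("AUC: ", auc)
    return auc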
def prepare_features(train_tbl, test_tbl, reindex_tbls):
    def add_ratio_features(tbl):
        cal_ratio = (lambda x: x[1] / x[0] if x[0] > 0 else 0.0)
        tbl = tbl.apply(["engaged_with_user_follower_count",
                         "engaged_with_user_following_count"],
                        "engaged_with_user_follower_following_ratio",
                        cal_ratio, "float")\
            .apply(["enaging_user_follower_count", "enaging_user_following_count"],
                   "enaging_user_follower_following_ratio", cal_ratio, "float")
        return tbl

    def organize_cols(tbl):
        tbl = tbl.select(
            array("enaging_user_follower_count",
                  "enaging_user_following_count",
                  "enaging_user_follower_following_ratio").alias("user_num"),
            array("len_hashtags", "len_domains", "len_links",
                  "engaged_with_user_follower_count",
                  "engaged_with_user_following_count",
                  "engaged_with_user_follower_following_ratio").alias("item_num"),
            *cat_cols, *embed_cols, "label")
        return tbl

    print("reindexing embedding cols")
    train_tbl = train_tbl.reindex(embed_cols, reindex_tbls)
    test_tbl = test_tbl.reindex(embed_cols, reindex_tbls)
    embed_in_dims = {}
    for i, c in enumerate(embed_cols):
        embed_in_dims[c] = max(
            reindex_tbls[i].df.agg({c + "_new": "max"}).collect()[0])

    print("add ratio features")
    train_tbl = add_ratio_features(train_tbl)
    test_tbl = add_ratio_features(test_tbl)

    print("scale numerical features")
    train_tbl, min_max_dic = train_tbl.min_max_scale(num_cols + ratio_cols)
    test_tbl = test_tbl.transform_min_max_scale(num_cols + ratio_cols, min_max_dic)

    stats_dir = os.path.join(args.model_dir, 'stats')
    if not exists(stats_dir):
        makedirs(stats_dir)
    with open(os.path.join(stats_dir, "min_max.pkl"), 'wb') as f:
        pickle.dump(min_max_dic, f)

    user_col_info = ColumnInfoTower(
        indicator_cols=["enaging_user_is_verified"],
        indicator_dims=[2],
        embed_cols=["enaging_user_id"],
        embed_in_dims=[embed_in_dims["enaging_user_id"]],
        embed_out_dims=[16],
        numerical_cols=["user_num"],
        numerical_dims=[3],
        name="user")
    item_col_info = ColumnInfoTower(
        indicator_cols=["engaged_with_user_is_verified", "present_media",
                        "tweet_type", "language"],
        indicator_dims=[2, 13, 3, 67],  # max + 1
        embed_cols=["engaged_with_user_id", "hashtags", "present_links",
                    "present_domains"],
        embed_in_dims=[embed_in_dims["engaged_with_user_id"],
                       embed_in_dims["hashtags"],
                       embed_in_dims["present_links"],
                       embed_in_dims["present_domains"]],
        embed_out_dims=[16, 16, 16, 16],
        numerical_cols=["item_num"],
        numerical_dims=[6],
        name="item")

    print("organize columns and specify user_col_info and item_col_info")
    train_tbl = organize_cols(train_tbl)
    test_tbl = organize_cols(test_tbl)

    return train_tbl, test_tbl, user_col_info, item_col_info
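# --- Hedged usage sketch (not from the original two-tower script) ------------
# Illustrates how prepare_features would typically be called; the parquet paths
# are placeholders and gen_reindex_mapping is assumed to be the FeatureTable
# helper that produces the per-column reindex tables consumed above.
def _demo_prepare_features(data_dir):
    train_tbl = FeatureTable.read_parquet(os.path.join(data_dir, "train_parquet"))
    test_tbl = FeatureTable.read_parquet(os.path.join(data_dir, "test_parquet"))
    reindex_tbls = train_tbl.gen_reindex_mapping(embed_cols)  # assumed helper
    return prepare_features(train_tbl, test_tbl, reindex_tbls)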