Example #1
def get_size(data_dir):
    if not exists(os.path.join(data_dir, "train_parquet")) or \
            not exists(os.path.join(data_dir, "test_parquet")):
        raise Exception("train_parquet or test_parquet not found under data_dir")
    else:
        train_tbl = FeatureTable.read_parquet(
            os.path.join(data_dir, "train_parquet"))
        test_tbl = FeatureTable.read_parquet(
            os.path.join(data_dir, "test_parquet"))

    # get cat sizes
    with tempfile.TemporaryDirectory() as local_path:
        get_remote_file_to_local(
            os.path.join(data_dir, "meta/categorical_sizes.pkl"),
            os.path.join(local_path, "categorical_sizes.pkl"))
        with open(os.path.join(local_path, "categorical_sizes.pkl"),
                  'rb') as f:
            cat_sizes_dic = pickle.load(f)

    indicator_sizes = [cat_sizes_dic[c] for c in indicator_cols]
    print("indicator sizes: ", indicator_sizes)
    embedding_sizes = [cat_sizes_dic[c] for c in embedding_cols]
    print("embedding sizes: ", embedding_sizes)
    cross_sizes = [cat_sizes_dic[c] for c in cross_cols]

    return train_tbl, test_tbl, indicator_sizes, embedding_sizes, cross_sizes
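
The snippet above relies on helpers that it does not import. A minimal header sketch, assuming the Analytics Zoo / Friesian module layout (the exact import paths are an assumption and differ between releases, e.g. the bigdl.* namespace in newer versions):

import os
import pickle
import tempfile

# Assumed module paths; adjust to the installed release.
from zoo.orca.data.file import exists, get_remote_file_to_local
from zoo.friesian.feature import FeatureTable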
Example #2
def test_exists_s3(self):
    access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    if access_key_id and secret_access_key:
        file_path = "s3://analytics-zoo-data/nyc_taxi.csv"
        assert exists(file_path)
        file_path = "s3://analytics-zoo-data/abc.csv"
        assert not exists(file_path)
Example #3
def test_mkdirs_local_2(self):
    temp = tempfile.mkdtemp()
    path = os.path.join(temp, "dir1")
    makedirs("file://" + path)
    assert exists("file://" + path)
    path = os.path.join(temp, "dir2/dir3")
    makedirs("file://" + path)
    assert exists("file://" + path)
    shutil.rmtree(temp)
Example #4
def test_mkdirs_s3(self):
    access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    if access_key_id and secret_access_key:
        file_path = "s3://analytics-zoo-data/temp/abc/"
        makedirs(file_path)
        assert exists(file_path)
        import boto3
        s3_client = boto3.Session(
            aws_access_key_id=access_key_id,
            aws_secret_access_key=secret_access_key).client('s3', verify=False)
        s3_client.delete_object(Bucket='analytics-zoo-data', Key='temp/abc/')
Example #5
    for i in range(len(list_cols)):
        cat_sizes_dict[list_cols[i]] = indexes[i].size()
    cat_sizes_dict['engaged_with_user_id'] = user_index.size()
    cat_sizes_dict['enaging_user_id'] = user_index.size()

    cross_sizes_dict = dict(
        zip(["_".join(cross_names) for cross_names in cross_cols],
            args.cross_sizes))

    cat_sizes_dict.update(cross_sizes_dict)

    count_sizes_dict = dict(zip(count_cols, [len(bins)] * len(count_cols)))
    cat_sizes_dict.update(count_sizes_dict)
    print("cat size dict: ", cat_sizes_dict)

    if not exists(os.path.join(args.output_folder, "meta")):
        makedirs(os.path.join(args.output_folder, "meta"))

    with tempfile.TemporaryDirectory() as local_path:
        with open(os.path.join(local_path, "categorical_sizes.pkl"),
                  'wb') as f:
            pickle.dump(cat_sizes_dict, f)
        put_local_file_to_remote(os.path.join(local_path,
                                              "categorical_sizes.pkl"),
                                 os.path.join(args.output_folder,
                                              "meta/categorical_sizes.pkl"),
                                 over_write=True)

    end = time()
    print("Preprocessing and save time: ", end - start)
Example #6
        "intra_op_parallelism": 40
    }

    est = Estimator.from_keras(model_creator=model_creator,
                               verbose=True,
                               config=config,
                               backend="tf2")

    train_count = train_tbl.size()
    print("train size: ", train_count)
    steps = math.ceil(train_count / options.batch_size)
    test_count = test_tbl.size()
    print("test size: ", test_count)
    val_steps = math.ceil(test_count / options.batch_size)

    if not exists(options.model_dir):
        makedirs(options.model_dir)

    callbacks = []

    # early stopping
    earlystopping = options.early_stopping
    if earlystopping:
        from tensorflow.keras.callbacks import EarlyStopping

        callbacks.append(
            EarlyStopping(monitor='val_auc',
                          mode='max',
                          verbose=1,
                          patience=earlystopping))
Example #7
def test_exists_local(self):
    file_path = os.path.join(self.resource_path, "orca/data/random.npy")
    assert exists("file://" + file_path)
    file_path = os.path.join(self.resource_path, "orca/data/abc.npy")
    assert not exists("file://" + file_path)
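
Examples #2–#4 and #7 exercise the same pair of helpers against local paths and S3 URIs. A small standalone sketch of that usage, assuming exists and makedirs are imported from the Orca data.file module (the import path and the bucket name below are assumptions):

import os
import tempfile

from zoo.orca.data.file import exists, makedirs  # bigdl.orca.data.file in newer releases

tmp = tempfile.mkdtemp()
nested = os.path.join(tmp, "a/b/c")

# Local directories can be addressed through the file:// scheme.
makedirs("file://" + nested)
assert exists("file://" + nested)

# S3 paths use the s3:// scheme and require AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY to be set in the environment.
# makedirs("s3://my-bucket/temp/abc/")        # hypothetical bucket
# assert exists("s3://my-bucket/temp/abc/")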
Example #8
    train_data, test_data, n_uid, n_mid, n_cat = load_dien_data(args.data_dir)

    model = build_model(args.model_type, n_uid, n_mid, n_cat, args.lr, args.data_type)
    [inputs, feature_cols] = align_input_features(model)

    estimator = Estimator.from_graph(inputs=inputs, outputs=[model.y_hat],
                                     labels=[model.target_ph], loss=model.loss,
                                     optimizer=model.optim, model_dir=args.model_dir,
                                     metrics={'loss': model.loss, 'accuracy': model.accuracy})

    estimator.fit(train_data.df, epochs=args.epochs, batch_size=args.batch_size,
                  feature_cols=feature_cols, label_cols=['label'], validation_data=test_data.df)

    ckpts_dir = os.path.join(args.model_dir, 'ckpts/')
    if not exists(ckpts_dir):
        makedirs(ckpts_dir)
    snapshot_path = ckpts_dir + "ckpt_" + args.model_type
    estimator.save_tf_checkpoint(snapshot_path)
    time_train = time.time()
    print(f"perf training time: {(time_train - time_start):.2f}")

    result = estimator.evaluate(test_data.df, args.batch_size, feature_cols=feature_cols,
                                label_cols=['label'])

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                              labelCol="label_t",
                                              metricName="areaUnderROC")
    prediction_df = estimator.predict(test_data.df, feature_cols=feature_cols)
    prediction_df.cache()
    transform_label = udf(lambda x: int(x[1]), "int")
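
Example #8 stops right after transform_label is defined. A plausible continuation (a sketch, not the original code) would cast the label into the label_t column the evaluator expects and compute the AUC:

from pyspark.sql.functions import col

# "label" is assumed to hold a two-element one-hot target, so x[1] is the
# positive-class flag that transform_label extracts.
prediction_df = prediction_df.withColumn("label_t", transform_label(col("label")))
auc = evaluator.evaluate(prediction_df)
print("test AUC: ", auc)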
Example #9
def prepare_features(train_tbl, test_tbl, reindex_tbls):
    def add_ratio_features(tbl):
        cal_ratio = (lambda x: x[1] / x[0] if x[0] > 0 else 0.0)
        tbl = tbl.apply(["engaged_with_user_follower_count", "engaged_with_user_following_count"],
                        "engaged_with_user_follower_following_ratio", cal_ratio, "float")\
            .apply(["enaging_user_follower_count", "enaging_user_following_count"],
                   "enaging_user_follower_following_ratio", cal_ratio, "float")
        return tbl

    def organize_cols(tbl):
        tbl = tbl.select(
            array("enaging_user_follower_count",
                  "enaging_user_following_count",
                  "enaging_user_follower_following_ratio").alias("user_num"),
            array("len_hashtags", "len_domains", "len_links",
                  "engaged_with_user_follower_count",
                  "engaged_with_user_following_count",
                  "engaged_with_user_follower_following_ratio").alias(
                      "item_num"), *cat_cols, *embed_cols, "label")
        return tbl

    print("reindexing embedding cols")
    train_tbl = train_tbl.reindex(embed_cols, reindex_tbls)
    test_tbl = test_tbl.reindex(embed_cols, reindex_tbls)
    embed_in_dims = {}
    for i, c in enumerate(embed_cols):
        embed_in_dims[c] = max(reindex_tbls[i].df.agg({
            c + "_new": "max"
        }).collect()[0])

    print("add ratio features")
    train_tbl = add_ratio_features(train_tbl)
    test_tbl = add_ratio_features(test_tbl)

    print("scale numerical features")
    train_tbl, min_max_dic = train_tbl.min_max_scale(num_cols + ratio_cols)
    test_tbl = test_tbl.transform_min_max_scale(num_cols + ratio_cols,
                                                min_max_dic)

    stats_dir = os.path.join(args.model_dir, 'stats')
    if not exists(stats_dir):
        makedirs(stats_dir)
    with open(os.path.join(stats_dir, "min_max.pkl"), 'wb') as f:
        pickle.dump(min_max_dic, f)

    user_col_info = ColumnInfoTower(
        indicator_cols=["enaging_user_is_verified"],
        indicator_dims=[2],
        embed_cols=["enaging_user_id"],
        embed_in_dims=[embed_in_dims["enaging_user_id"]],
        embed_out_dims=[16],
        numerical_cols=["user_num"],
        numerical_dims=[3],
        name="user")
    item_col_info = ColumnInfoTower(
        indicator_cols=[
            "engaged_with_user_is_verified", "present_media", "tweet_type",
            "language"
        ],
        indicator_dims=[2, 13, 3, 67],  # max + 1
        embed_cols=[
            "engaged_with_user_id", "hashtags", "present_links",
            "present_domains"
        ],
        embed_in_dims=[
            embed_in_dims["engaged_with_user_id"], embed_in_dims["hashtags"],
            embed_in_dims["present_links"], embed_in_dims["present_domains"]
        ],
        embed_out_dims=[16, 16, 16, 16],
        numerical_cols=["item_num"],
        numerical_dims=[6],
        name="item")

    print("organize columns and specify user_col_info and item_col_info")
    train_tbl = organize_cols(train_tbl)
    test_tbl = organize_cols(test_tbl)

    return train_tbl, test_tbl, user_col_info, item_col_info