Example #1
def get_size(data_dir):
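    """Load the preprocessed train/test FeatureTables and the saved categorical feature sizes."""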
    if not exists(os.path.join(data_dir, "train_parquet")) or \
            not exists(os.path.join(data_dir, "test_parquet")):
        raise Exception("train_parquet or test_parquet not found under data_dir")
    else:
        train_tbl = FeatureTable.read_parquet(
            os.path.join(data_dir, "train_parquet"))
        test_tbl = FeatureTable.read_parquet(
            os.path.join(data_dir, "test_parquet"))

    # get cat sizes
    with tempfile.TemporaryDirectory() as local_path:
        get_remote_file_to_local(
            os.path.join(data_dir, "meta/categorical_sizes.pkl"),
            os.path.join(local_path, "categorical_sizes.pkl"))
        with open(os.path.join(local_path, "categorical_sizes.pkl"),
                  'rb') as f:
            cat_sizes_dic = pickle.load(f)

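    # Look up the size of each categorical column used for indicator, embedding and cross features.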
    indicator_sizes = [cat_sizes_dic[c] for c in indicator_cols]
    print("indicator sizes: ", indicator_sizes)
    embedding_sizes = [cat_sizes_dic[c] for c in embedding_cols]
    print("embedding sizes: ", embedding_sizes)
    cross_sizes = [cat_sizes_dic[c] for c in cross_cols]

    return train_tbl, test_tbl, indicator_sizes, embedding_sizes, cross_sizes
Example #2
def load_dien_data(data_dir):
    tbl = FeatureTable.read_parquet(data_dir + "/data")
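    # Keep only the most recent interaction per user (rank 1 when ordered by time descending).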
    windowSpec1 = Window.partitionBy("user").orderBy(desc("time"))
    tbl = tbl.append_column("rank1", rank().over(windowSpec1))
    tbl = tbl.filter(col('rank1') == 1)
    train_data, test_data = tbl.split([0.8, 0.2], seed=1)
    usertbl = FeatureTable.read_parquet(data_dir + "/user_index/*")
    itemtbl = FeatureTable.read_parquet(data_dir + "/item_index/*")
    cattbl = FeatureTable.read_parquet(data_dir + "/category_index/*")
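    # Vocabulary sizes are the max index plus one, since indices start at 0.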
    n_uid = usertbl.get_stats("id", "max")["id"] + 1
    n_mid = itemtbl.get_stats("id", "max")["id"] + 1
    n_cat = cattbl.get_stats("id", "max")["id"] + 1
    train_data.show()
    print("train size: ", train_data.size())
    print("test size: ", test_data.size())
    print("user size: ", n_uid)
    print("item size: ", n_mid)
    return train_data, test_data, n_uid, n_mid, n_cat
Example #3
                          conf=conf)
    elif args.cluster_mode == "yarn":
        init_orca_context("yarn-client",
                          cores=args.executor_cores,
                          num_nodes=args.num_executor,
                          memory=args.executor_memory,
                          driver_cores=args.driver_cores,
                          driver_memory=args.driver_memory,
                          conf=conf)

    start = time()
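    # Read only the training part files selected by args.train_files.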
    train_paths = [
        os.path.join(args.input_train_folder, 'part-%05d.parquet' % i)
        for i in args.train_files
    ]
    train_tbl = FeatureTable.read_parquet(train_paths)
    train_tbl.df.printSchema()

    test_tbl = FeatureTable.read_parquet(args.input_test_folder)

    train_tbl = preprocess(train_tbl)
    test_tbl = preprocess(test_tbl)

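    # Encode the language column on the train set and reuse the same mapping for the test set.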
    train_tbl, language_idx = train_tbl.category_encode("language")
    test_tbl = test_tbl.encode_string("language", language_idx)

    user_index = train_tbl.gen_string_idx({
        'src_cols': ['engaged_with_user_id', 'enaging_user_id'],
        'col_name': 'user_id'
    })
Example #4
                               driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                               init_ray_on_spark=True)
    elif args.cluster_mode == "yarn":
        sc = init_orca_context("yarn-client", cores=args.executor_cores,
                               num_nodes=args.num_executor, memory=args.executor_memory,
                               driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                               object_store_memory="10g",
                               init_ray_on_spark=True)
    elif args.cluster_mode == "spark-submit":
        sc = init_orca_context("spark-submit")

    movielens_data = movielens.get_id_ratings("/tmp/movielens/")
    pddf = pd.DataFrame(movielens_data, columns=["user", "item", "label"])
    num_users, num_items = pddf["user"].max() + 1, pddf["item"].max() + 1

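    # MovieLens ratings are 1-5; shift them to 0-4 so they can serve as sparse class labels.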
    full = FeatureTable.from_pandas(pddf)\
        .apply("label", "label", lambda x: x - 1, 'int')
    train, test = full.random_split([0.8, 0.2], seed=1)

    config = {"lr": 1e-3, "inter_op_parallelism": 4, "intra_op_parallelism": args.executor_cores}

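    # model_creator is called on each worker to build and compile its own copy of the Keras model.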
    def model_creator(config):
        model = build_model(num_users, num_items, 5)
        print(model.summary())
        optimizer = tf.keras.optimizers.Adam(config["lr"])
        model.compile(optimizer=optimizer,
                      loss='sparse_categorical_crossentropy',
                      metrics=['sparse_categorical_crossentropy', 'accuracy'])
        return model

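    # Number of batches needed to cover the train and test tables once per epoch.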
    steps_per_epoch = math.ceil(train.size() / args.batch_size)
    val_steps = math.ceil(test.size() / args.batch_size)
Example #5
                          memory=args.executor_memory)
    elif args.cluster_mode == "standalone":
        init_orca_context("standalone",
                          master=args.master,
                          cores=args.executor_cores,
                          num_nodes=args.num_executor,
                          memory=args.executor_memory,
                          driver_cores=args.driver_cores,
                          driver_memory=args.driver_memory,
                          conf=conf)
    elif args.cluster_mode == "yarn":
        init_orca_context("yarn-client",
                          cores=args.executor_cores,
                          num_nodes=args.num_executor,
                          memory=args.executor_memory,
                          driver_cores=args.driver_cores,
                          driver_memory=args.driver_memory,
                          conf=conf)

    start = time()
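    # Read the \x01-delimited Recsys text data with the predefined column names and dtypes, then save it as Parquet.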
    val_tbl = FeatureTable.read_csv(args.input_file,
                                    delimiter="\x01",
                                    names=RecsysSchema().toColumns(),
                                    dtype=RecsysSchema().toDtype())
    val_tbl.df.printSchema()
    val_tbl.write_parquet(args.output_folder)

    end = time()
    print("Convert to parquet time: ", end - start)
    stop_orca_context()
Example #6
                          cores=args.cores,
                          num_nodes=args.num_nodes,
                          memory=args.memory,
                          driver_cores=args.driver_cores,
                          driver_memory=args.driver_memory,
                          conf=conf)
    elif args.cluster_mode == "spark-submit":
        init_orca_context("spark-submit")

    time_start = time()

    paths = [
        os.path.join(args.input_folder, "day_%d.parquet" % i)
        for i in args.day_range
    ]
    tbl = FeatureTable.read_parquet(paths)
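    # Build a string-to-index mapping for each categorical column (freq_limit controls rare-value filtering).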
    idx_list = tbl.gen_string_idx(CAT_COLS, freq_limit=args.frequency_limit)

    if args.days == 24:  # Full Criteo dataset
        # Exclude the last path (day_23.parquet) from training, since the first half of day_23 is held out as the test set.
        train_data = FeatureTable.read_parquet(paths[:-1])
        train_preprocessed = preprocess_and_save(train_data, idx_list, "train",
                                                 args.output_folder)
        test_data = FeatureTable.read_parquet(
            os.path.join(args.input_folder, "day_23_test.parquet"))
        test_preprocessed = preprocess_and_save(test_data, idx_list, "test",
                                                args.output_folder)
    else:
        train_data = FeatureTable.read_parquet(paths)
        train_preprocessed = preprocess_and_save(train_data, idx_list, "train",
                                                 args.output_folder)
Example #7
        "len_links", "hashtags", "present_links", "present_domains"
    ]
    cat_cols = [
        "engaged_with_user_is_verified", "enaging_user_is_verified",
        "present_media", "tweet_type", "language"
    ]
    ratio_cols = [
        "engaged_with_user_follower_following_ratio",
        "enaging_user_follower_following_ratio"
    ]
    embed_cols = [
        "enaging_user_id", "engaged_with_user_id", "hashtags", "present_links",
        "present_domains"
    ]
    useful_cols = num_cols + cat_cols + embed_cols
    train_tbl = FeatureTable.read_parquet(args.data_dir + "/train_parquet")
    test_tbl = FeatureTable.read_parquet(args.data_dir + "/test_parquet")
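    # Concatenate train and test so the reindex mapping covers ids appearing in either split.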
    full_tbl = train_tbl.concat(test_tbl, "outer")
    reindex_tbls = full_tbl.gen_reindex_mapping(
        embed_cols, freq_limit=args.frequency_limit)
    train_tbl, test_tbl, user_info, item_info = prepare_features(
        train_tbl, test_tbl, reindex_tbls)

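    # Save the reindex mapping of each embedding column to its own parquet directory.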
    output_dir = args.data_dir + "/embed_reindex"
    for i, c in enumerate(embed_cols):
        reindex_tbls[i].write_parquet(output_dir + "_" + c)

    train_config = {
        "lr": 1e-3,
        "user_col_info": user_info,
        "item_col_info": item_info,
Example #8
                          driver_memory=args.driver_memory,
                          conf=conf)
    elif args.cluster_mode == "yarn":
        init_orca_context("yarn-client",
                          cores=args.executor_cores,
                          num_nodes=args.num_executor,
                          memory=args.executor_memory,
                          driver_cores=args.driver_cores,
                          driver_memory=args.driver_memory,
                          conf=conf)
    elif args.cluster_mode == "spark-submit":
        init_orca_context("spark-submit")

    begin = time.time()
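    # Load the review transactions, keeping (reviewerID, asin, unixReviewTime) renamed to (user, item, time).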
    transaction_tbl = FeatureTable.read_json(args.input_transaction).select(
        ['reviewerID', 'asin', 'unixReviewTime']) \
        .rename({'reviewerID': 'user', 'asin': 'item', 'unixReviewTime': 'time'}) \
        .dropna(columns=['user', 'item'])
    transaction_tbl.cache()
    print("transaction_tbl, ", transaction_tbl.size())

    item_tbl = FeatureTable.read_csv(args.input_meta, delimiter="\t", names=['item', 'category'])\
        .apply("category", "category", lambda x: x.lower() if x is not None else "default")
    item_tbl.cache()
    print("item_tbl, ", item_tbl.size())

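    # Map user, item and category strings to contiguous integer ids.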
    user_index = transaction_tbl.gen_string_idx('user', freq_limit=1)
    item_cat_indices = item_tbl.gen_string_idx(["item", "category"],
                                               freq_limit=1)
    item_size = item_cat_indices[0].size()

    item_tbl = item_tbl\