Example #1
    def __rshift__(self, operator):
        if callable(operator) and not (isinstance(operator, type)
                                       and issubclass(operator, Operator)):
            # implicit lambdaop conversion.
            operator = LambdaOp(operator)

        return super().__rshift__(operator)
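With the override above, a plain callable on the right-hand side of >> is wrapped in a LambdaOp automatically. A minimal sketch of the equivalence, assuming the usual nvtabular import locations and an illustrative "geo_location" column:

from nvtabular import ColumnGroup
from nvtabular.ops import LambdaOp

# the bare lambda is converted to a LambdaOp by __rshift__, so these two
# lines build the same graph node
country = ColumnGroup(["geo_location"]) >> (lambda col: col.str.slice(0, 2))
country = ColumnGroup(["geo_location"]) >> LambdaOp(lambda col: col.str.slice(0, 2))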
Example #2
    def __rshift__(self, operator):
        """Transforms this ColumnGroup by applying an Operator

        Parameters
        ----------
        operator : Operator or callable

        Returns
        -------
        ColumnGroup
        """
        if isinstance(operator, type) and issubclass(operator, Operator):
            # handle case where an operator class is passed
            operator = operator()
        elif callable(operator):
            # implicit lambdaop conversion.
            operator = LambdaOp(operator)

        if not isinstance(operator, Operator):
            raise ValueError(
                f"Expected operator or callable, got {operator.__class__}")

        child = ColumnGroup(operator.output_column_names(self.columns))
        child.parents = [self]
        self.children.append(child)
        child.op = operator

        dependencies = operator.dependencies()
        if dependencies:
            child.dependencies = set()
            if not isinstance(dependencies, collections.abc.Sequence):
                dependencies = [dependencies]

            for dependency in dependencies:
                if not isinstance(dependency, ColumnGroup):
                    dependency = ColumnGroup(dependency)
                dependency.children.append(child)
                child.parents.append(dependency)
                child.dependencies.add(dependency)

        return child
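Each >> call therefore returns a new child ColumnGroup wired into a small DAG: the child stores the applied operator in op and links back to its inputs through parents (plus any extra dependencies). A short usage sketch, assuming the usual nvtabular import locations:

from nvtabular import ColumnGroup
from nvtabular.ops import LambdaOp, Rename

geo = ColumnGroup(["geo_location"])
country = (geo >> LambdaOp(lambda col: col.str.slice(0, 2)) >>
           Rename(postfix="_country"))
# country is the leaf node: country.op is the Rename instance, and walking
# country.parents leads back through the LambdaOp node to geo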
Example #3
def test_nested_workflow_node():
    df = dispatch._make_df({
        "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
        "user": ["User_A", "User_A", "User_A", "User_B"],
    })
    dataset = Dataset(df)

    geo_selector = ColumnSelector(["geo"])
    country = (geo_selector >> LambdaOp(lambda col: col.str.slice(0, 2)) >>
               Rename(postfix="_country"))
    # country1 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country1")
    # country2 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country2")
    user = "******"
    # user2 = "user2"

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = country + user + [country + user] >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    workflow.fit_schema(dataset.infer_schema())

    df_out = workflow.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = ([[country + "user"] + country + "user"] >>
                Categorify(encode_type="combo"))
Example #4
def create_workflow(data_bucket_folder, output_bucket_folder, hash_spec,
                    devices, local_directory):
    rmm.reinitialize(managed_memory=False)
    documents_categories_path = os.path.join(data_bucket_folder,
                                             'documents_categories.csv')
    documents_topics_path = os.path.join(data_bucket_folder,
                                         'documents_topics.csv')
    documents_entities_path = os.path.join(data_bucket_folder,
                                           'documents_entities.csv')

    documents_categories_cudf = cudf.read_csv(documents_categories_path)
    documents_topics_cudf = cudf.read_csv(documents_topics_path)
    documents_entities_cudf = cudf.read_csv(documents_entities_path)
    documents_entities_cudf['entity_id'] = documents_entities_cudf[
        'entity_id'].astype('category').cat.codes

    categories = _df_to_coo(documents_categories_cudf, col='category_id')
    topics = _df_to_coo(documents_topics_cudf, col='topic_id')
    entities = _df_to_coo(documents_entities_cudf, col='entity_id')

    del documents_categories_cudf, documents_topics_cudf, documents_entities_cudf
    ctr_thresh = {
        'ad_id': 5,
        'source_id_promo': 10,
        'publisher_id_promo': 10,
        'advertiser_id': 10,
        'campaign_id': 10,
        'document_id_promo': 5,
    }

    client = create_client(devices=devices, local_directory=local_directory)

    workflow = nvt.Workflow(cat_names=CATEGORICAL_COLUMNS,
                            cont_names=CONTINUOUS_COLUMNS,
                            label_name=['clicked'],
                            client=client)

    workflow.add_feature([
        LambdaOp(op_name='country',
                 f=lambda col, gdf: col.str.slice(0, 2),
                 columns=['geo_location'],
                 replace=False),
        LambdaOp(op_name='state',
                 f=lambda col, gdf: col.str.slice(0, 5),
                 columns=['geo_location'],
                 replace=False),
        LambdaOp(op_name='days_since_published',
                 f=_calculate_delta,
                 columns=['publish_time', 'publish_time_promo'],
                 replace=False),
        FillMedian(columns=[
            'publish_time_days_since_published',
            'publish_time_promo_days_since_published'
        ]),
        JoinGroupby(columns=[
            'ad_id', 'source_id_promo', 'document_id_promo',
            'publisher_id_promo', 'advertiser_id', 'campaign_id'
        ],
                    cont_names=['clicked'],
                    out_path=output_bucket_folder,
                    stats=['sum', 'count']),
        LambdaOp(op_name='ctr',
                 f=lambda col, gdf:
                 ((col) /
                  (gdf[col.name.replace('_clicked_sum', '_count')])).where(
                      gdf[col.name.replace('_clicked_sum', '_count')] >=
                      ctr_thresh[col.name.replace('_clicked_sum', '')], 0),
                 columns=[
                     'ad_id_clicked_sum', 'source_id_promo_clicked_sum',
                     'document_id_promo_clicked_sum',
                     'publisher_id_promo_clicked_sum',
                     'advertiser_id_clicked_sum', 'campaign_id_clicked_sum'
                 ],
                 replace=False),
        FillMissing(columns=groupby_columns + ctr_columns),
        LogOp(columns=groupby_columns + [
            'publish_time_days_since_published',
            'publish_time_promo_days_since_published'
        ]),
        Normalize(columns=groupby_columns),
        ColumnSimilarity('doc_event_doc_ad_sim_categories',
                         'document_id',
                         categories,
                         'document_id_promo',
                         metric='tfidf',
                         on_device=False),
        ColumnSimilarity('doc_event_doc_ad_sim_topics',
                         'document_id',
                         topics,
                         'document_id_promo',
                         metric='tfidf',
                         on_device=False),
        ColumnSimilarity('doc_event_doc_ad_sim_entities',
                         'document_id',
                         entities,
                         'document_id_promo',
                         metric='tfidf',
                         on_device=False)
    ])

    workflow.add_cat_preprocess([HashBucket(hash_spec)])
    workflow.finalize()

    return workflow
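A hedged sketch of how the workflow returned above might be driven with this legacy API; the paths, train_paths, and the HASH_BUCKET_SIZES dict are illustrative assumptions rather than values from the source:

import nvtabular as nvt

workflow = create_workflow(
    data_bucket_folder="/data/outbrain",          # assumed input location
    output_bucket_folder="/data/outbrain/stats",  # assumed stats output location
    hash_spec=HASH_BUCKET_SIZES,                  # assumed dict: column -> bucket count
    devices=[0],
    local_directory="/tmp/dask-workspace",
)
train_paths = ["/data/outbrain/train.parquet"]    # hypothetical file list
train_ds = nvt.Dataset(train_paths, engine="parquet")
workflow.apply(train_ds, record_stats=True, output_path="/data/outbrain/train")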
Example #5
def create_workflow(data_bucket_folder, hash_spec, devices, local_directory,
                    dask):
    rmm.reinitialize(managed_memory=False)
    documents_categories_path = os.path.join(data_bucket_folder,
                                             "documents_categories.csv")
    documents_topics_path = os.path.join(data_bucket_folder,
                                         "documents_topics.csv")
    documents_entities_path = os.path.join(data_bucket_folder,
                                           "documents_entities.csv")

    documents_categories_cudf = cudf.read_csv(documents_categories_path)
    documents_topics_cudf = cudf.read_csv(documents_topics_path)
    documents_entities_cudf = cudf.read_csv(documents_entities_path)
    documents_entities_cudf["entity_id"] = (
        documents_entities_cudf["entity_id"].astype("category").cat.codes)

    categories = _df_to_coo(documents_categories_cudf, col="category_id")
    topics = _df_to_coo(documents_topics_cudf, col="topic_id")
    entities = _df_to_coo(documents_entities_cudf, col="entity_id")

    del documents_categories_cudf, documents_topics_cudf, documents_entities_cudf
    ctr_thresh = {
        "ad_id": 5,
        "source_id_promo": 10,
        "publisher_id_promo": 10,
        "advertiser_id": 10,
        "campaign_id": 10,
        "document_id_promo": 5,
    }

    ctr_inputs = ColumnGroup(CTR_INPUTS)
    cat_cols = ColumnGroup(CATEGORICAL_COLUMNS)

    geo_location = ColumnGroup(["geo_location"])
    country = (geo_location >>
               (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country"))
    state = (geo_location >>
             (lambda col: col.str.slice(0, 5)) >> Rename(postfix="_state"))
    geo_features = geo_location + country + state

    dates = ["publish_time", "publish_time_promo"]
    date_features = dates >> DaysSincePublished() >> FillMedian() >> LogOp

    stat_cols = ctr_inputs >> JoinGroupby(cont_cols=["clicked"],
                                          stats=["sum", "count"])
    ctr_cols = (
        stat_cols - [column + "_count" for column in ctr_inputs.flattened_columns]
        >> LambdaOp(
            f=lambda col, gdf:
            (col / gdf[col.name.replace("_clicked_sum", "_count")]).where(
                gdf[col.name.replace("_clicked_sum", "_count")] >=
                ctr_thresh[col.name.replace("_clicked_sum", "")],
                0,
            ),
            # keep the matching "_count" columns available to the lambda above
            dependency=stat_cols -
            [column + "_clicked_sum" for column in ctr_inputs.flattened_columns],
        ) >> Rename(f=lambda x: x.replace("_clicked_sum", "_ctr")))

    stat_cols = stat_cols >> FillMissing() >> LogOp() >> Normalize()
    ctr_cols = ctr_cols >> FillMissing()

    cat_cols = cat_cols + geo_features >> HashBucket(hash_spec)

    features = (date_features + ctr_cols + stat_cols + cat_cols +
                ["clicked", "display_id"])
    sim_features_categ = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(categories, metric="tfidf", on_device=False)
        >> Rename(postfix="_categories"))
    sim_features_topics = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(topics, metric="tfidf", on_device=False)
        >> Rename(postfix="_topics"))
    sim_features_entities = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(entities, metric="tfidf", on_device=False)
        >> Rename(postfix="_entities"))
    sim_features = sim_features_categ + sim_features_topics + sim_features_entities

    client = create_client(devices=devices,
                           local_directory=local_directory) if dask else None

    workflow = nvt.Workflow(column_group=features + sim_features,
                            client=client)

    return workflow
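With the newer ColumnGroup-based API above, fitting the statistics and writing out transformed data would look roughly like this; the paths and train_paths are illustrative assumptions:

import nvtabular as nvt

workflow = create_workflow(
    data_bucket_folder="/data/outbrain",
    hash_spec=HASH_BUCKET_SIZES,                  # assumed dict: column -> bucket count
    devices=[0],
    local_directory="/tmp/dask-workspace",
    dask=True,
)
train_paths = ["/data/outbrain/train.parquet"]    # hypothetical file list
dataset = nvt.Dataset(train_paths, engine="parquet")
workflow.fit(dataset)
workflow.transform(dataset).to_parquet("/data/outbrain/train")
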
def preprocess_criteo_parquet(
    input_path: str,
    output_path: str,
    client,
    frequency_threshold: int,
):
    train_days = [str(x) for x in CRITEO_TRAIN_DAYS]
    train_files = [
        os.path.join(input_path, x) for x in os.listdir(input_path)
        if x.startswith("day") and x.split(".")[0].split("_")[-1] in train_days
    ]
    valid_file = os.path.join(input_path, "day_23.part2.parquet")
    test_file = os.path.join(input_path, "day_23.part1.parquet")

    all_set = train_files + [valid_file] + [test_file]

    print(all_set, train_files, valid_file, test_file)
    print("Creating Workflow Object")

    workflow = Workflow(cat_names=CRITEO_CATEGORICAL_COLUMNS,
                        cont_names=CRITEO_CONTINUOUS_COLUMNS,
                        label_name=CRITEO_CLICK_COLUMNS)

    # We want missing values to end up as 0 and present values x as log(x + 3):
    # fill missing with -2, add 2, then apply log(1 + x), so a missing value
    # becomes log(1 + (-2) + 2) = log(1) = 0
    workflow.add_cont_feature([
        FillMissing(fill_val=-2.0),
        LambdaOp(op_name='Add3ButMinusOneCauseLogAddsOne',
                 f=lambda col, _: col.add(2.0)),
        LogOp(),  # Log(1+x)
    ])

    workflow.add_cat_preprocess(
        Categorify(freq_threshold=frequency_threshold, out_path=output_path))

    workflow.finalize()

    print("Creating Dataset Iterator")
    all_ds = Dataset(all_set,
                     engine="parquet",
                     part_mem_fraction=ALL_DS_MEM_FRAC)
    trains_ds = Dataset(train_files,
                        engine="parquet",
                        part_mem_fraction=TRAIN_DS_MEM_FRAC)
    valid_ds = Dataset(valid_file,
                       engine="parquet",
                       part_mem_fraction=TEST_DS_MEM_FRAC)
    test_ds = Dataset(test_file,
                      engine="parquet",
                      part_mem_fraction=VALID_DS_MEM_FRAC)

    print("Running apply")
    out_train = os.path.join(output_path, "train")
    out_valid = os.path.join(output_path, "validation")
    out_test = os.path.join(output_path, "test")

    start = time()
    workflow.update_stats(all_ds)
    print(f"Gathering statistics time: {time() - start}")

    start = time()
    workflow.apply(trains_ds, record_stats=False, output_path=out_train)
    print(f"train preprocess time: {time() - start}")

    start = time()
    workflow.apply(valid_ds, record_stats=False, output_path=out_valid)
    print(f"valid preprocess time: {time() - start}")

    start = time()
    workflow.apply(test_ds, record_stats=False, output_path=out_test)
    print(f"test preprocess time: {time() - start}")

    save_model_size_config(workflow, output_path)
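A possible invocation of the Criteo preprocessing above; the paths, the Dask client, and the threshold value are illustrative assumptions (note that the function accepts client but, as written, does not pass it to the Workflow):

from dask.distributed import Client

client = Client()  # optional Dask client; unused by the Workflow as written above
preprocess_criteo_parquet(
    input_path="/data/criteo/parquet",
    output_path="/data/criteo/preprocessed",
    client=client,
    frequency_threshold=15,
)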