Example #1
def transformer(
    brokers: str,
    topic_request: str,
    topic_response: str,
    feature_set: str,
) -> mt.Transformer:
    """Collect feature vectors for the provided query/page pairs

    Parameters
    ----------
    brokers :
        Comma separated list of kafka hosts to bootstrap from.
    topic_request :
        Kafka topic to send feature vector requests on.
    topic_response :
        Kafka topic to receive feature vector responses on.
    feature_set :
        A named elasticsearch ltr featureset to collect features from.

    Returns
    -------
    A Transformer accepting mt.QueryPage and returning mt.FeatureVectors.
    """
    kafka_config = ClientConfig(brokers, topic_request, topic_response,
                                mjolnir.kafka.TOPIC_COMPLETE)
    return mt.seq_transform([
        # TODO: Rename cols upstream in mjolnir
        mt.temp_rename_col('page_id', 'hit_page_id',
                           collect_features(kafka_config, feature_set)),
        lambda df: df.select('wikiid', 'query', 'page_id', 'features'),
        # TODO: Should write_partition also correctly partition all
        # our datasets by (wikiid, query)? This would help joins against
        # the data not require a shuffle.
        lambda df: df.repartition(200, 'wikiid', 'query')
    ])
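
A minimal usage sketch, not taken from the source: the broker, topic, and featureset values below are hypothetical, and `df_query_page` stands for a DataFrame matching the mt.QueryPage schema. As Example #5 shows, the returned mt.Transformer is applied as a plain callable on a DataFrame.

# Hypothetical configuration; replace with real brokers/topics/featureset.
collect = transformer(
    brokers='kafka1001:9092,kafka1002:9092',
    topic_request='mjolnir.feature-requests',
    topic_response='mjolnir.feature-responses',
    feature_set='enwiki_features')
df_features = collect(df_query_page)  # DataFrame shaped like mt.FeatureVectors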
Example #2
def transformer(df_cluster: DataFrame) -> mt.Transformer:
    mt.check_schema(df_cluster, mt.QueryClustering)
    return mt.seq_transform([
        # Attach cluster ids to search queries
        # TODO: Stats about # of queries that didn't have a cluster id and were discarded
        mt.join_cluster_by_query(df_cluster),
        # Massage into dbn expected format
        with_exploded_hits,
        # Run the labeling process
        mt.temp_rename_col(
            'cluster_id', 'norm_query_id',
            as_labeled_clusters({
                'MIN_DOCS_PER_QUERY': 10,
                'MAX_DOCS_PER_QUERY': 20,
                'DEFAULT_REL': 0.5,
                'MAX_ITERATIONS': 40,
                'GAMMA': 0.9,
            })),
        # labeling gave results per cluster_id, transform into results per
        # query.  This ends up labeling all queries in df_cluster, rather than
        # the dataframe being transformed. For standard usage where the
        # dataframe being transformed is the same as the dataframe that was
        # input to clustering this is acceptable, but if the join filters a lot
        # of things out the results might be unexpected.
        mt.join_cluster_by_cluster_id(df_cluster),
        # Rename things to match our output tables
        # TODO: Rename upstream in mjolnir
        lambda df: df.select(
            'wikiid', 'query', F.col('hit_page_id').alias('page_id'),
            'label', 'cluster_id'),
        # TODO: Any interesting metadata to attach to `label` column? The config passed
        # to as_labeled_clusters?
        lambda df: df.repartition(200, 'wikiid', 'query')
    ])
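
A usage sketch under stated assumptions: `df_cluster` conforms to mt.QueryClustering (checked above), and `df_sessions` is a hypothetical name for the click/session DataFrame being labeled.

label = transformer(df_cluster)
df_labeled = label(df_sessions)  # columns: wikiid, query, page_id, label, cluster_id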
Example #3
def convert_mllib_to_svmrank_and_xgboost(path_format: str,
                                         fold_col: Optional[str],
                                         num_folds: int) -> mt.Transformer:
    return mt.seq_transform([
        convert_mllib_to_svmrank(path_format, fold_col, num_folds),
        convert_svmrank_to_xgboost,
    ])
def transformer(
    df_label: DataFrame,
    temp_dir: str,
    wikis: List[str],
    num_features: int
) -> mt.Transformer:
    mt.check_schema(df_label, mt.LabeledQueryPage)

    # Hack to transfer metadata between transformations. It is populated in
    # time because `select_features` computes the features directly when it
    # runs, rather than deferring the work.
    metadata = cast(Dict, {'wiki_features': {}})

    return mt.seq_transform([
        mt.restrict_wikis(wikis),
        mt.join_labels(df_label),
        explode_features(metadata),
        mt.cache_to_disk(temp_dir, partition_by='wikiid'),
        mt.for_each_item('wikiid', wikis, lambda wiki: select_features(
            wiki, num_features, metadata)),
        attach_feature_metadata(metadata),
        # While we used the labels for selecting features, they are not part of the feature vectors.
        # Allow them to be joined with any other label set for export to training.
        lambda df: df.drop('cluster_id', 'label'),
        lambda df: df.repartition(200, 'wikiid', 'query'),
    ])
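
A usage sketch with hypothetical inputs: `df_label` must match mt.LabeledQueryPage, and `df_features` stands for the feature-vector DataFrame to select from; the path, wiki list, and feature count are placeholders.

select = transformer(
    df_label,
    temp_dir='hdfs:///tmp/mjolnir_feature_selection',  # hypothetical path
    wikis=['enwiki', 'dewiki'],                        # hypothetical wiki list
    num_features=50)                                   # hypothetical target count
df_selected = select(df_features)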
Example #5
def transformer(df_label: DataFrame, wiki: str, output_path: str,
                num_folds: int) -> mt.Transformer:
    mt.check_schema(df_label, mt.LabeledQueryPage)

    fold_col = 'fold'
    # Format for individual training files. The first %s is the split name,
    # the second is the fold id (a 0-indexed number, or `x` for unfolded).
    path_format = os.path.join(output_path, wiki + '.%s.f%s')

    # This pool should be sized to run all possible tasks, currently two
    # (folded and unfolded). We shouldn't be limiting any concurrency here.
    task_pool = multiprocessing.dummy.Pool(2)
    restrict_wiki = mt.restrict_wikis([wiki])

    return mt.seq_transform([
        restrict_wiki,
        mt.assert_not_empty,
        mt.join_labels(restrict_wiki(df_label)),
        # TODO: hardcoded assumption about DBN, labels
        # could be on a variety of scales. Maybe metadata could
        # be attached to label col to inform this?
        frac_label_to_int('label'),
        # hardcoded assumption that labels with same cluster_id
        # are not independent and must be in the same fold.
        attach_fold_col(num_folds, fold_col),
        partition_and_order_for_output,
        # This may recalculate the above per output, but folds were
        # calculated on the driver ensuring those will stay constant.
        # Everything else is preferable to recalculate rather than
        # having many executors cache it in memory while 1 executor
        # spends 20 minutes writing out datasets.
        mt.par_transform(
            [
                # Write out folded dataset
                convert_mllib_to_svmrank_and_xgboost(path_format, fold_col,
                                                     num_folds),
                # Write out unfolded "all" dataset. The resulting rows
                # are distinguished from above with `split_name` of all.
                convert_mllib_to_svmrank_and_xgboost(
                    path_format, fold_col=None, num_folds=1),
            ],
            mapper=task_pool.imap_unordered),
        lambda df: df.withColumn('wikiid', F.lit(wiki)),
        # After the above we have a row per training file, with most
        # data represented externally via hdfs paths. No need for
        # multiple partitions.
        lambda df: df.repartition(1),
    ])
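
A usage sketch with hypothetical arguments: `df_label` must match mt.LabeledQueryPage, and `df_vectors` stands for the feature-vector DataFrame being folded and written out.

fold_and_write = transformer(
    df_label,
    wiki='enwiki',                               # hypothetical wiki
    output_path='hdfs:///tmp/mjolnir_training',  # hypothetical output dir
    num_folds=5)                                 # hypothetical fold count
df_files = fold_and_write(df_vectors)  # one row per training file written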
Example #6
def transformer(
    brokers: str, topic_request: str, topic_response: str,
    top_n: int, min_sessions_per_query: int
) -> mt.Transformer:
    kafka_config = ClientConfig(
        brokers, topic_request, topic_response,
        mjolnir.kafka.TOPIC_COMPLETE)
    return mt.seq_transform([
        with_norm_query,
        filter_min_sessions_per_norm_query(min_sessions_per_query),
        as_unique_queries,
        with_hit_page_ids(kafka_config, top_n),
        cluster_within_norm_query_groups,
        with_unique_cluster_id,
        lambda df: df.repartition(200, 'wikiid', 'query')
    ])
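
A usage sketch with hypothetical Kafka settings, applied to `df_query_clicks`, a placeholder name for the DataFrame of search queries and clicks being clustered.

cluster = transformer(
    brokers='kafka1001:9092',                    # hypothetical
    topic_request='mjolnir.msearch-requests',    # hypothetical
    topic_response='mjolnir.msearch-responses',  # hypothetical
    top_n=20,
    min_sessions_per_query=10)
df_clusters = cluster(df_query_clicks)  # unique queries with cluster_id attached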
Example #7
def transformer(max_q_by_day: int) -> mt.Transformer:
    return mt.seq_transform([
        filter_high_volume_ip(max_q_by_day),
        with_page_ids,
        lambda df: df.repartition(200, 'wikiid', 'query')
    ])
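
A usage sketch, assuming `df_raw_queries` (hypothetical name) holds the raw query log being filtered and annotated with page ids; the cap value is a placeholder.

clean = transformer(max_q_by_day=1000)  # hypothetical per-IP daily query cap
df_queries = clean(df_raw_queries)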