def transformer(
    brokers: str,
    topic_request: str,
    topic_response: str,
    feature_set: str,
) -> mt.Transformer:
    """Build a transformer that collects feature vectors over kafka.

    Parameters
    ----------
    brokers :
        Comma separated list of kafka hosts to bootstrap from.
    topic_request :
        Kafka topic to send feature vector requests on.
    topic_response :
        Kafka topic to receive feature vector responses on.
    feature_set :
        A named elasticsearch ltr featureset to collect features from.

    Returns
    -------
    A Transformer accepting mt.QueryPage and returning mt.FeatureVectors.
    """
    client_config = ClientConfig(
        brokers, topic_request, topic_response,
        mjolnir.kafka.TOPIC_COMPLETE)

    stages = [
        # TODO: Rename cols upstream in mjolnir
        mt.temp_rename_col(
            'page_id', 'hit_page_id',
            collect_features(client_config, feature_set)),
        lambda df: df.select('wikiid', 'query', 'page_id', 'features'),
        # TODO: Should write_partition also correctly partition all
        # our datasets by (wikiid, query)? This would help joins against
        # the data not require a shuffle.
        lambda df: df.repartition(200, 'wikiid', 'query'),
    ]
    return mt.seq_transform(stages)
def transformer(df_cluster: DataFrame) -> mt.Transformer:
    """Build a transformer that labels query/page pairs via DBN.

    Joins the incoming dataframe against the provided query clustering,
    runs the DBN labeling process per cluster, and emits one label per
    (wikiid, query, page_id).

    Parameters
    ----------
    df_cluster :
        Dataframe matching the mt.QueryClustering schema that assigns a
        cluster_id to each (wikiid, query).

    Returns
    -------
    A Transformer producing labeled query/page rows.
    """
    mt.check_schema(df_cluster, mt.QueryClustering)

    # Tuning knobs for the DBN labeling run.
    dbn_config = {
        'MIN_DOCS_PER_QUERY': 10,
        'MAX_DOCS_PER_QUERY': 20,
        'DEFAULT_REL': 0.5,
        'MAX_ITERATIONS': 40,
        'GAMMA': 0.9,
    }

    stages = [
        # Attach cluster id's to search queries
        # TODO: Stats about # of queries that didn't have a cluster id and were discarded
        mt.join_cluster_by_query(df_cluster),
        # Massage into dbn expected format
        with_exploded_hits,
        # Run the labeling process
        mt.temp_rename_col(
            'cluster_id', 'norm_query_id',
            as_labeled_clusters(dbn_config)),
        # labeling gave results per cluster_id, transform into results per
        # query. This ends up labeling all queries in df_cluster, rather than
        # the dataframe being transformed. For standard usage where the
        # dataframe being transformed is the same as the dataframe that was
        # input to clustering this is acceptable, but if the join filters a lot
        # of things out the results might be unexpected.
        mt.join_cluster_by_cluster_id(df_cluster),
        # Rename things to match our output tables
        # TODO: Rename upstream in mjolnir
        lambda df: df.select(
            'wikiid', 'query',
            F.col('hit_page_id').alias('page_id'),
            'label', 'cluster_id'),
        # TODO: Any interesting metadata to attach to `label` column? The config passed
        # to as_labeled_clusters?
        lambda df: df.repartition(200, 'wikiid', 'query'),
    ]
    return mt.seq_transform(stages)
def convert_mllib_to_svmrank_and_xgboost(
    path_format: str, fold_col: Optional[str], num_folds: int
) -> mt.Transformer:
    """Write out svmrank formatted data, then derive xgboost data from it.

    Parameters
    ----------
    path_format :
        Format string for output file paths.
    fold_col :
        Column holding fold assignments, or None for an unfolded dataset.
    num_folds :
        Number of folds to emit.

    Returns
    -------
    A Transformer chaining the svmrank conversion into the xgboost one.
    """
    to_svmrank = convert_mllib_to_svmrank(path_format, fold_col, num_folds)
    return mt.seq_transform([to_svmrank, convert_svmrank_to_xgboost])
def transformer(
    df_label: DataFrame,
    temp_dir: str,
    wikis: List[str],
    num_features: int
) -> mt.Transformer:
    """Build a transformer that selects the top features per wiki.

    Parameters
    ----------
    df_label :
        Labeled query/page pairs matching mt.LabeledQueryPage.
    temp_dir :
        Directory used to cache intermediate results to disk.
    wikis :
        Wikis to process; all others are dropped.
    num_features :
        Number of features to retain per wiki.

    Returns
    -------
    A Transformer emitting feature vectors restricted to selected features.
    """
    mt.check_schema(df_label, mt.LabeledQueryPage)

    # Hack to transfer metadata between transformations. This is populated in
    # time since `select_features` does direct computation of the features.
    shared_meta = cast(Dict, {'wiki_features': {}})

    stages = [
        mt.restrict_wikis(wikis),
        mt.join_labels(df_label),
        explode_features(shared_meta),
        mt.cache_to_disk(temp_dir, partition_by='wikiid'),
        mt.for_each_item(
            'wikiid', wikis,
            lambda wiki: select_features(wiki, num_features, shared_meta)),
        attach_feature_metadata(shared_meta),
        # While we used the labels for selecting features, they are not part of the feature vectors.
        # Allow them to be joined with any other label set for export to training.
        lambda df: df.drop('cluster_id', 'label'),
        lambda df: df.repartition(200, 'wikiid', 'query'),
    ]
    return mt.seq_transform(stages)
def transformer(df_label: DataFrame, wiki: str, output_path: str, num_folds: int) -> mt.Transformer:
    """Build a transformer that writes training files for a single wiki.

    Produces both a folded dataset (for cross validation) and an unfolded
    "all" dataset, in svmrank and xgboost formats.

    Parameters
    ----------
    df_label :
        Labeled query/page pairs matching mt.LabeledQueryPage.
    wiki :
        The single wiki to export.
    output_path :
        Directory training files are written under.
    num_folds :
        Number of cross validation folds to emit.

    Returns
    -------
    A Transformer yielding one row per training file written.
    """
    mt.check_schema(df_label, mt.LabeledQueryPage)
    fold_col = 'fold'
    # Format for individual training files. First %s is split name, second
    # is the fold id (a 0 indexed number, or `x` for un-folded)
    path_format = os.path.join(output_path, wiki + '.%s.f%s')

    # This pool should be sized to run all possible tasks, currently two
    # (folded and unfolded). We shouldn't be limiting any concurrency here.
    task_pool = multiprocessing.dummy.Pool(2)
    restrict_wiki = mt.restrict_wikis([wiki])

    # Write out folded dataset
    write_folded = convert_mllib_to_svmrank_and_xgboost(
        path_format, fold_col, num_folds)
    # Write out unfolded "all" dataset. The resulting rows are
    # distinguished from the folded ones with `split_name` of all.
    write_unfolded = convert_mllib_to_svmrank_and_xgboost(
        path_format, fold_col=None, num_folds=1)

    stages = [
        restrict_wiki,
        mt.assert_not_empty,
        mt.join_labels(restrict_wiki(df_label)),
        # TODO: hardcoded assumption about DBN, labels
        # could be on a variety of scales. Maybe metadata could
        # be attached to label col to inform this?
        frac_label_to_int('label'),
        # hardcoded assumption that labels with same cluster_id
        # are not independent and must be in the same fold.
        attach_fold_col(num_folds, fold_col),
        partition_and_order_for_output,
        # This may recalculate the above per output, but folds were
        # calculated on the driver ensuring those will stay constant.
        # Everything else is preferable to recalculate rather than
        # having many executors cache it in memory while 1 executor
        # spends 20 minutes writing out datasets.
        mt.par_transform(
            [write_folded, write_unfolded],
            mapper=task_pool.imap_unordered),
        lambda df: df.withColumn('wikiid', F.lit(wiki)),
        # After the above we have a row per training file, with most
        # data represented externally via hdfs paths. No need for
        # multiple partitions.
        lambda df: df.repartition(1),
    ]
    return mt.seq_transform(stages)
def transformer(
    brokers: str,
    topic_request: str,
    topic_response: str,
    top_n: int,
    min_sessions_per_query: int
) -> mt.Transformer:
    """Build a transformer that clusters queries by their top hits.

    Parameters
    ----------
    brokers :
        Comma separated list of kafka hosts to bootstrap from.
    topic_request :
        Kafka topic to send requests on.
    topic_response :
        Kafka topic responses arrive on.
    top_n :
        Number of hit page ids to fetch per query.
    min_sessions_per_query :
        Minimum sessions a normalized query needs to be retained.

    Returns
    -------
    A Transformer assigning a unique cluster id per query group.
    """
    client_config = ClientConfig(
        brokers, topic_request, topic_response,
        mjolnir.kafka.TOPIC_COMPLETE)

    stages = [
        with_norm_query,
        filter_min_sessions_per_norm_query(min_sessions_per_query),
        as_unique_queries,
        with_hit_page_ids(client_config, top_n),
        cluster_within_norm_query_groups,
        with_unique_cluster_id,
        lambda df: df.repartition(200, 'wikiid', 'query'),
    ]
    return mt.seq_transform(stages)
def transformer(max_q_by_day: int) -> mt.Transformer:
    """Build a transformer that filters high volume IPs and resolves page ids.

    Parameters
    ----------
    max_q_by_day :
        Maximum queries per day allowed from a single IP before it is
        considered automated traffic and dropped.

    Returns
    -------
    A Transformer over query logs partitioned by (wikiid, query).
    """
    stages = [
        filter_high_volume_ip(max_q_by_day),
        with_page_ids,
        lambda df: df.repartition(200, 'wikiid', 'query'),
    ]
    return mt.seq_transform(stages)