def _event_count(frame, alias):
    """Count events per event type.

    Returns a pandas frame indexed by "event" with the count stored under
    ``alias`` — the index lets callers .join() it onto a base frame.
    """
    return (frame.groupBy("event").count()
            .select(F.col("event"), F.col("count").alias(alias))
            .toPandas()
            .set_index("event"))


def _distinct_count_by_event(frame, column, alias):
    """Count distinct ``column`` values per event type.

    Returns a pandas frame indexed by "event" with the distinct count
    stored under ``alias``.
    """
    return (frame.select(F.col(column), F.col("event"))
            .distinct()
            .groupBy("event").count()
            .select(F.col("event"), F.col("count").alias(alias))
            .toPandas()
            .set_index("event"))


def _distinct_event_pairs(frame_left, frame_right, key):
    """Collect counts of (event_left, event_right) pairs sharing a ``key``.

    For every pair of event types, counts how many distinct ``key`` values
    (users or items) occur with the left event in ``frame_left`` AND with
    the right event in ``frame_right``.  Returns the collected Spark rows.
    """
    left = (frame_left
            .select(F.col(key).alias("shared_key"),
                    F.col("event").alias("event_left"))
            .distinct())
    right = (frame_right
             .select(F.col(key).alias("shared_key"),
                     F.col("event").alias("event_right"))
             .distinct())
    return (left.join(right, on="shared_key", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())


def _report_intersection(reporter, pair_counts, columns, title,
                         **matrix_kwargs):
    """Build an intersection matrix and write it through the reporter.

    ``matrix_kwargs`` (e.g. horizontal_suffix / vertical_suffix) are passed
    straight to mk_intersection_matrix.
    """
    matrix = mk_intersection_matrix(pair_counts, columns, **matrix_kwargs)
    # matrix is assumed to be a pandas DataFrame (it exposes .columns.values,
    # .index and .items) -- TODO confirm against mk_intersection_matrix.
    reporter.report([''] + list(matrix.columns.values),
                    [matrix.index.tolist()] +
                    [column for _, column in matrix.items()],
                    title=title)


def split(intersections, csv_report):
    """Split the source event log into train/test sets and report statistics.

    Reads the JSON event log at ``cfg.splitting.source_file`` through Spark,
    splits it with ``split_data`` and writes the two parts back as JSON to
    ``cfg.splitting.train_file`` / ``cfg.splitting.test_file``.  Per-event
    statistics (event counts and unique user/item counts for the full, train
    and test sets) are assembled into one pandas frame and written through
    the reporter; optionally, user/item intersection matrices between the
    train and test sets are reported as well.

    Args:
        intersections: when truthy, also compute and report the train/train
            and train/test user and item intersection matrices.
        csv_report: when truthy, report to CSV files in
            ``cfg.reporting.csv_dir`` (optionally suffixed with a fresh
            UUID), otherwise to the Excel file ``cfg.reporting.file``.
    """
    logging.info('Splitting started')
    if csv_report:
        # Optional UUID keeps repeated CSV runs from overwriting each other.
        report_id = uuid4() if cfg.reporting.use_uuid else None
        reporter = CSVReport(cfg.reporting.csv_dir, report_id)
    else:
        reporter = ExcelReport(cfg.reporting.file)

    logging.info('Spark initialization')
    sc = SparkContext(cfg.spark.master, 'map_test: split')
    sqlContext = SQLContext(sc)

    logging.info('Source file reading')
    df = sqlContext.read.json(cfg.splitting.source_file)
    df = df.withColumn("Date", F.from_utc_timestamp("eventTime", "UTC"))
    users_with_event_count = df.groupBy(
        F.col("entityId").alias("user")).count()

    logging.info('Filter users with small number of events')
    min_events = 10
    users_with_few_events = (users_with_event_count
                             .filter("count < %d" % min_events)
                             .select(F.col("user")
                                     .alias("user_with_few_events")))
    ndf = df.join(users_with_few_events,
                  F.col("entityId") == F.col("user_with_few_events"),
                  how="left_outer")
    df1 = ndf.filter("user_with_few_events is NULL").drop(
        "user_with_few_events")
    # NOTE(review): df1 (events of users with >= min_events events) is
    # computed but never used -- split_data() below still receives the
    # unfiltered df, so the few-events filter has no effect.  This looks
    # unintentional; confirm whether split_data(df1) was meant before
    # changing it.

    logging.info('Split data into train and test')
    train_df, test_df = split_data(df)
    train_df.write.json(cfg.splitting.train_file, mode="overwrite")
    test_df.write.json(cfg.splitting.test_file, mode="overwrite")
    # Only these three columns are needed below; cache since each frame is
    # scanned several times for the statistics.
    train_df = train_df.select("entityId", "event", "targetEntityId").cache()
    test_df = test_df.select("entityId", "event", "targetEntityId").cache()

    logging.info('Calculation of different stat metrics of datasets')
    # Base frame keeps "event" as a regular column so the later joins can
    # use on="event" against the indexed helper frames.
    events_by_type = (df.groupBy("event").count()
                      .select(F.col("event"),
                              F.col("count").alias("count_total"))
                      .toPandas())
    events_by_type_train = _event_count(train_df, "count_train")
    events_by_type_test = _event_count(test_df, "count_test")
    unique_users_by_event = _distinct_count_by_event(
        df, "entityId", "unique_users_total")
    unique_users_by_event_train = _distinct_count_by_event(
        train_df, "entityId", "unique_users_train")
    unique_users_by_event_test = _distinct_count_by_event(
        test_df, "entityId", "unique_users_test")
    unique_items_by_event = _distinct_count_by_event(
        df, "targetEntityId", "unique_items_total")
    unique_items_by_event_train = _distinct_count_by_event(
        train_df, "targetEntityId", "unique_items_train")
    unique_items_by_event_test = _distinct_count_by_event(
        test_df, "targetEntityId", "unique_items_test")

    logging.info('Calculate total counts')
    events = df.count()
    events_train = train_df.count()
    events_test = test_df.count()
    unique_users = df.select("entityId").distinct().count()
    unique_users_train = train_df.select("entityId").distinct().count()
    unique_users_test = test_df.select("entityId").distinct().count()
    unique_items = df.select(F.col("targetEntityId")).distinct().count()
    unique_items_train = train_df.select(
        F.col("targetEntityId")).distinct().count()
    unique_items_test = test_df.select(
        F.col("targetEntityId")).distinct().count()

    # Join order defines the final column order; it must stay in sync with
    # both the totals row and the header row written below.
    info_df = events_by_type
    dfs = [
        unique_users_by_event, unique_items_by_event, events_by_type_train,
        events_by_type_test, unique_users_by_event_train,
        unique_users_by_event_test, unique_items_by_event_train,
        unique_items_by_event_test
    ]
    for data_frame in dfs:
        info_df = info_df.join(data_frame, on="event")
    n_rows, n_cols = info_df.shape
    # Totals row aggregated over all event types.
    info_df.loc[n_rows] = [
        'ANY EVENT', events, unique_users, unique_items, events_train,
        events_test, unique_users_train, unique_users_test,
        unique_items_train, unique_items_test
    ]
    # Derived ratio columns.  .iloc replaces the original .ix accessor,
    # which was removed from pandas in 1.0 and crashed at runtime.
    info_df.insert(4, 'events per user',
                   info_df.iloc[:, 1] / info_df.iloc[:, 2])
    info_df.insert(5, 'events per item',
                   info_df.iloc[:, 1] / info_df.iloc[:, 3])

    logging.info('Create event stat worksheet')
    reporter.start_new_sheet('Events stat')
    reporter.report(
        [
            'event', 'event count', 'unique users', 'unique items',
            'events per user', 'events per item', 'event count train',
            'event count test', 'unique users train', 'unique users test',
            'unique items train', 'unique items test'
        ],
        # .items() replaces the removed DataFrame.iteritems().
        [column.tolist() for _, column in info_df.items()],
        selected_rows=[
            # Highlight the row of the configured primary event (looked up
            # in the first column, the event names).
            next(iter(info_df.items()))[1].tolist().index(
                cfg.testing.primary_event)
        ],
        cfg=cfg)
    reporter.finish_sheet()

    if intersections:
        logging.info('Start intersections calculation')
        reporter.start_new_sheet('Intersections')
        columns_for_matrix = cfg.testing.events

        logging.info('Process train / train user intersection')
        _report_intersection(
            reporter,
            _distinct_event_pairs(train_df, train_df, "entityId"),
            columns_for_matrix,
            'Train / train user intersection')

        logging.info('Process train / test user intersection')
        _report_intersection(
            reporter,
            _distinct_event_pairs(train_df, test_df, "entityId"),
            columns_for_matrix,
            'Train / test user intersection',
            horizontal_suffix=" train",
            vertical_suffix=" test")

        logging.info('Process train / train item intersection')
        _report_intersection(
            reporter,
            _distinct_event_pairs(train_df, train_df, "targetEntityId"),
            columns_for_matrix,
            'Train / train item intersection')

        logging.info('Process train / test item intersection')
        _report_intersection(
            reporter,
            _distinct_event_pairs(train_df, test_df, "targetEntityId"),
            columns_for_matrix,
            'Train / test item intersection',
            horizontal_suffix=" train",
            vertical_suffix=" test")

    reporter.report_config(cfg)
    reporter.finish_document()
    logging.info('Splitting finished successfully')
def test(csv_report, all, dummy_test, separate_test, all_but_test,
         primary_pairs_test, custom_combos_test, non_zero_users_from_file):
    """Run the configured MAP@K test suites over the test split and report.

    Reads the test (and, when needed, train) splits produced by ``split``
    through Spark, runs the selected MAP benchmarks and writes one worksheet
    per suite through the reporter.

    Args:
        csv_report: report to CSV files in ``cfg.reporting.csv_dir``
            (optionally suffixed with a fresh UUID) instead of the Excel
            file ``cfg.reporting.file``.
        all: run every suite regardless of the individual flags.
            (NOTE: shadows the ``all`` builtin, but is part of the public
            signature and kept for caller compatibility.)
        dummy_test: run the dummy (random / popularity / top-N) benchmarks.
        separate_test: run MAP per individual event type.
        all_but_test: run MAP with each event type left out in turn.
        primary_pairs_test: run MAP for (primary event, other event) pairs.
        custom_combos_test: run MAP for the event groups listed in
            ``cfg.testing.custom_combos.event_groups``.
        non_zero_users_from_file: read the non-zero-score user set from
            ``cfg.testing.non_zero_users_file`` instead of recomputing (and
            persisting) it.
    """
    logging.info('Testing started')
    if csv_report:
        # Optional UUID keeps repeated CSV runs from overwriting each other.
        report_id = uuid4() if cfg.reporting.use_uuid else None
        reporter = CSVReport(cfg.reporting.csv_dir, report_id)
    else:
        reporter = ExcelReport(cfg.reporting.file)

    logging.info('Spark context initialization')
    sc = SparkContext(cfg.spark.master, 'map_test: test')
    sqlContext = SQLContext(sc)

    logging.info('Test data reading')
    test_df = sqlContext.read.json(cfg.splitting.test_file).select(
        "entityId", "event", "targetEntityId").cache()
    test_data = test_df.filter(
        "event = '%s'" % cfg.testing.primary_event).collect()

    if all or dummy_test:
        logging.info('Train data reading')
        train_df = sqlContext.read.json(cfg.splitting.train_file).select(
            "entityId", "event", "targetEntityId").cache()
        counts = train_df.filter(
            "event = '%s'" % cfg.testing.primary_event).groupBy(
                "targetEntityId").count().collect()
        # Items ordered by train-set popularity, most frequent first.
        sorted_rating = sorted(
            [(row.asDict()['count'], row.asDict()['targetEntityId'])
             for row in counts],
            reverse=True)
        elements = np.array([item for cnt, item in sorted_rating])
        probs = np.array([cnt for cnt, item in sorted_rating])
        probs = 1.0 * probs / probs.sum()

        logging.info('Process dummy test')
        # Baseline 1: items sampled uniformly at random.
        dummy_uniform_res = run_map_test_dummy(test_data, items=elements,
                                               probs=probs, uniform=True,
                                               top=False,
                                               K=cfg.testing.map_k)
        # Baseline 2: items sampled according to their distribution in the
        # training data.
        dummy_res = run_map_test_dummy(test_data, items=elements,
                                       probs=probs, uniform=False,
                                       top=False, K=cfg.testing.map_k)
        # Baseline 3: always recommend the top-N most popular train items.
        dummy_top_res = run_map_test_dummy(test_data, items=elements,
                                           probs=probs, uniform=True,
                                           top=True, K=cfg.testing.map_k)
        reporter.start_new_sheet('Dummy MAP benchmark')
        reporter.report(
            ['', 'Random uniform', 'Random sampled from train', 'Top - N'],
            [[('MAP @ %d' % i) for i in range(1, len(dummy_res) + 1)]] +
            [dummy_uniform_res, dummy_res, dummy_top_res],
            cfg=cfg)
        reporter.finish_sheet()

        logging.info('Process top 20 dummy test')
        # MAP@1 of the popularity baseline with the i most popular items
        # removed -- shows how much the head of the ranking contributes.
        scores = [
            run_map_test_dummy(test_data, items=elements[i:], uniform=True,
                               top=True, K=1, no_progress=True)[0]
            for i in range(20)
        ]
        # NOTE(review): sheet title keeps the historical 'perfomance'
        # spelling; renaming could break consumers keyed on sheet names.
        reporter.start_new_sheet('Top-20 perfomance')
        reporter.report(['Rank', 'MAP@1'], [list(range(1, 21)), scores],
                        bold_first_column=False, cfg=cfg)
        reporter.finish_sheet()

    if (all or separate_test or all_but_test or primary_pairs_test
            or custom_combos_test):
        logging.info('Non zero users')
        if non_zero_users_from_file:
            # "users_file" instead of the original "input", which shadowed
            # the builtin of the same name.
            with open(cfg.testing.non_zero_users_file) as users_file:
                non_zero_users = set(users_file.read().split(','))
        else:
            _, r_data, _ = run_map_test(test_data,
                                        [cfg.testing.primary_event],
                                        test=False)
            non_zero_users = get_nonzero(r_data)
            # Persist for later runs with non_zero_users_from_file=True.
            with open(cfg.testing.non_zero_users_file, 'w') as users_file:
                users_file.write(','.join(non_zero_users))

    if all or separate_test:
        logging.info('Process "map separate events" test')
        columns = []
        for ev in cfg.testing.events:
            (r_scores, r_data, ipu) = run_map_test(test_data, [ev],
                                                   users=non_zero_users,
                                                   test=False)
            columns.append(r_scores + [len(non_zero_users)])
        # NOTE(review): assumes cfg.testing.events is non-empty; columns[0]
        # raises IndexError otherwise.
        first_column = [('MAP @ %d' % i)
                        for i in range(1, len(columns[0]))
                        ] + ['non-zero users']
        reporter.start_new_sheet('MAP separate events')
        reporter.report(
            ['event'] + cfg.testing.events, [first_column] + columns,
            selected_columns=[
                cfg.testing.events.index(cfg.testing.primary_event) + 1
            ],
            cfg=cfg)
        reporter.finish_sheet()

    if all or all_but_test:
        logging.info('Process "map all but..." test')
        events_scores = []
        for ev in cfg.testing.events:
            # Leave one event type out per run.
            evs = list(cfg.testing.events)
            evs.remove(ev)
            (r_scores, r_data, ipu) = run_map_test(test_data, evs,
                                                   users=non_zero_users,
                                                   test=False)
            events_scores.append(r_scores + [len(non_zero_users)])
        # Reference run with ALL event types included.
        evl = cfg.testing.events
        all_scores, r_data, ipu = run_map_test(test_data, evl,
                                               users=non_zero_users,
                                               test=False)
        all_scores.append(len(non_zero_users))
        first_column = [('MAP @ %d' % i)
                        for i in range(1, len(all_scores))
                        ] + ['non-zero users']
        reporter.start_new_sheet('MAP all but...')
        reporter.report(
            ['event'] + cfg.testing.events + ['All'],
            [first_column] + events_scores + [all_scores],
            selected_columns=[
                cfg.testing.events.index(cfg.testing.primary_event) + 1
            ],
            cfg=cfg)
        reporter.finish_sheet()

    if all or primary_pairs_test:
        logging.info('Process "map pairs with primary" test')
        columns = []
        events_without_primary = [
            event for event in cfg.testing.events
            if event != cfg.testing.primary_event
        ]
        for event in events_without_primary:
            (r_scores, r_data, ipu) = run_map_test(
                test_data, [cfg.testing.primary_event, event],
                users=non_zero_users, test=False)
            columns.append(r_scores + [len(non_zero_users)])
        # NOTE(review): assumes at least one non-primary event exists;
        # columns[0] raises IndexError otherwise.
        first_column = [('MAP @ %d' % i)
                        for i in range(1, len(columns[0]))
                        ] + ['non-zero users']
        reporter.start_new_sheet('MAP pairs with primary')
        reporter.report(['event'] + events_without_primary,
                        [first_column] + columns, cfg=cfg)
        reporter.finish_sheet()

    if all or custom_combos_test:
        logging.info('Process "custom combos" test')
        columns = []
        for event_group in cfg.testing.custom_combos.event_groups:
            # Skip groups whose report is already produced by another suite.
            # logging.warning replaces the deprecated logging.warn alias.
            if (len(event_group) == 2
                    and cfg.testing.primary_event in event_group
                    and primary_pairs_test):
                logging.warning(
                    "Report for group %s already generated in "
                    "'MAP pairs with primary'" % str(event_group))
                continue
            if len(event_group) == 1 and separate_test:
                logging.warning(
                    "Report for group %s already generated in "
                    "'MAP separate events'" % str(event_group))
                continue
            if (len(event_group) >= len(cfg.testing.events) - 1
                    and all_but_test):
                logging.warning(
                    "Report for group %s already generated in "
                    "'All but...'" % str(event_group))
                continue
            (r_scores, r_data, ipu) = run_map_test(test_data, event_group,
                                                   users=non_zero_users,
                                                   test=False)
            columns.append(r_scores + [len(non_zero_users)])
        first_column = [('MAP @ %d' % i)
                        for i in range(1, len(columns[0]))
                        ] + ['non-zero users']
        reporter.start_new_sheet('Custom combos')
        # NOTE(review): Python-2-era label formatting -- under Python 3 this
        # renders groups as "[b'buy', b'view']"; kept byte-for-byte for
        # output stability.
        reporter.report(['event'] + [
            str([s.encode('utf-8') for s in group])
            for group in cfg.testing.custom_combos.event_groups
        ], [first_column] + columns, cfg=cfg)
        reporter.finish_sheet()

    reporter.finish_document()
    logging.info('Testing finished successfully')