Example #1
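# Assumed module-level context (not shown in this snippet): logging, uuid4,
# SparkContext, SQLContext, pyspark.sql.functions as F, the project's cfg object,
# the CSVReport / ExcelReport reporters, and the split_data / mk_intersection_matrix helpers.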
def split(intersections, csv_report):
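    """Split the source event log into train and test sets and report dataset statistics.

    intersections -- if true, also report event-intersection matrices for users and items
    csv_report    -- if true, write a CSV report; otherwise write an Excel workbook
    """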
    logging.info('Splitting started')

    if csv_report:
        if cfg.reporting.use_uuid:
            uuid = uuid4()
            reporter = CSVReport(cfg.reporting.csv_dir, uuid)
        else:
            reporter = CSVReport(cfg.reporting.csv_dir, None)
    else:
        reporter = ExcelReport(cfg.reporting.file)

    logging.info('Spark initialization')
    sc = SparkContext(cfg.spark.master, 'map_test: split')
    sqlContext = SQLContext(sc)

    logging.info('Source file reading')
    df = sqlContext.read.json(cfg.splitting.source_file)
    df = df.withColumn("Date", F.from_utc_timestamp("eventTime", "UTC"))

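    # Count how many events each user has generated; used below to find low-activity users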
    users_with_event_count = df.groupBy(F.col("entityId").alias("user")).count()


    logging.info('Filter users with small number of events')
    min_events = 10
    users_with_few_events = (users_with_event_count
                             .filter("count < %d" % (min_events))
                             .select(F.col("user").alias("user_with_few_events")))
    ndf = df.join(users_with_few_events,
                  F.col("entityId")==F.col("user_with_few_events"),
                  how="left_outer")
    df1 = ndf.filter("user_with_few_events is NULL").drop("user_with_few_events")
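    # NOTE: df1 keeps only users with at least min_events events, but the split below still uses the unfiltered df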


    logging.info('Split data into train and test')
    train_df, test_df = split_data(df)
    train_df.write.json(cfg.splitting.train_file, mode="overwrite")
    test_df.write.json(cfg.splitting.test_file, mode="overwrite")


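    # Keep only the columns needed for the statistics and cache them for the repeated aggregations below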
    train_df = train_df.select("entityId", "event", "targetEntityId").cache()
    test_df = test_df.select("entityId", "event", "targetEntityId").cache()


    logging.info('Calculation of different stat metrics of datasets')
    events_by_type = (df
                      .groupBy("event")
                      .count()
                      .select(F.col("event"), F.col("count").alias("count_total"))
                      .toPandas())

    events_by_type_test = (test_df
                           .groupBy("event")
                           .count()
                           .select(F.col("event"), F.col("count").alias("count_test"))
                           .toPandas()
                           .set_index("event"))

    events_by_type_train = (train_df
                            .groupBy("event")
                            .count()
                            .select(F.col("event"), F.col("count").alias("count_train"))
                            .toPandas()
                            .set_index("event"))

    unique_users_by_event = (df
                             .select(F.col("entityId"), F.col("event"))
                             .distinct()
                             .groupBy("event")
                             .count()
                             .select(F.col("event"), F.col("count").alias("unique_users_total"))
                             .toPandas()
                             .set_index("event"))

    unique_users_by_event_train = (train_df
                                   .select(F.col("entityId"), F.col("event"))
                                   .distinct()
                                   .groupBy("event")
                                   .count()
                                   .select(F.col("event"), F.col("count").alias("unique_users_train"))
                                   .toPandas()
                                   .set_index("event"))

    unique_users_by_event_test = (test_df
                                  .select(F.col("entityId"), F.col("event"))
                                  .distinct()
                                  .groupBy("event")
                                  .count()
                                  .select(F.col("event"), F.col("count").alias("unique_users_test"))
                                  .toPandas()
                                  .set_index("event"))

    unique_items_by_event = (df
                             .select(F.col("targetEntityId"), F.col("event"))
                             .distinct()
                             .groupBy("event")
                             .count()
                             .select(F.col("event"), F.col("count").alias("unique_items_total"))
                             .toPandas()
                             .set_index("event"))

    unique_items_by_event_train = (train_df
                                   .select(F.col("targetEntityId"), F.col("event"))
                                   .distinct()
                                   .groupBy("event")
                                   .count()
                                   .select(F.col("event"), F.col("count").alias("unique_items_train"))
                                   .toPandas()
                                   .set_index("event"))

    unique_items_by_event_test = (test_df
                                  .select(F.col("targetEntityId"), F.col("event"))
                                  .distinct()
                                  .groupBy("event")
                                  .count()
                                  .select(F.col("event"), F.col("count").alias("unique_items_test"))
                                  .toPandas()
                                  .set_index("event"))

    logging.info('Calculate total counts')
    events = df.count()
    events_train = train_df.count()
    events_test = test_df.count()

    unique_users = df.select("entityId").distinct().count()
    unique_users_train = train_df.select("entityId").distinct().count()
    unique_users_test = test_df.select("entityId").distinct().count()

    unique_items = df.select(F.col("targetEntityId")).distinct().count()
    unique_items_train = train_df.select(F.col("targetEntityId")).distinct().count()
    unique_items_test = test_df.select(F.col("targetEntityId")).distinct().count()

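    # Assemble all per-event statistics into a single pandas frame keyed by "event"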
    info_df = events_by_type
    dfs = [unique_users_by_event, unique_items_by_event,
            events_by_type_train, events_by_type_test,
            unique_users_by_event_train, unique_users_by_event_test,
            unique_items_by_event_train, unique_items_by_event_test]

    for data_frame in dfs:
        info_df = info_df.join(data_frame, on="event")

    n_rows, n_cols = info_df.shape

    # totals
    info_df.loc[n_rows] = ['ANY EVENT', events, unique_users, unique_items,
                        events_train, events_test,
                        unique_users_train, unique_users_test,
                        unique_items_train, unique_items_test]

    info_df.insert(4, 'events per user', info_df.ix[:, 1] / info_df.ix[:, 2])
    info_df.insert(5, 'events per item', info_df.ix[:, 1] / info_df.ix[:, 3])

    logging.info('Create event stat worksheet')
    reporter.start_new_sheet('Events stat')
    reporter.report(
        ['event', 'event count', 'unique users', 'unique items',
         'events per user', 'events per item',
         'event count train', 'event count test',
         'unique users train', 'unique users test',
         'unique items train', 'unique items test'],
        [column.tolist() for _, column in info_df.iteritems()],
        selected_rows=[next(info_df.iteritems())[1].tolist().index(cfg.testing.primary_event)],
        cfg=cfg)
    reporter.finish_sheet()

    if intersections:
        logging.info('Start intersections calculation')

        reporter.start_new_sheet('Intersections')

        columns_for_matrix = cfg.testing.events
        logging.info('Process train / train user intersection')
        train_train_users = (
            train_df
            .select(F.col("entityId").alias("user"), F.col("event").alias("event_left"))
            .distinct()
            .join(train_df.select(F.col("entityId").alias("user"), F.col("event").alias("event_right")).distinct(),
               on="user", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())

        trtru = mk_intersection_matrix(train_train_users, columns_for_matrix)
        reporter.report(
            [''] + list(trtru.columns.values),
            [trtru.index.tolist()] + [column for _, column in trtru.iteritems()],
            title='Train / train user intersection')

        logging.info('Process train / test user intersection')
        train_test_users = (
            train_df
            .select(F.col("entityId").alias("user"), F.col("event").alias("event_left"))
            .distinct()
            .join(test_df.select(F.col("entityId").alias("user"), F.col("event").alias("event_right")).distinct(),
               on="user", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())

        trtsu = mk_intersection_matrix(train_test_users, columns_for_matrix,
                                       horizontal_suffix=" train", vertical_suffix=" test")
        reporter.report(
            [''] + list(trtsu.columns.values),
            [trtsu.index.tolist()] + [column for _, column in trtsu.iteritems()],
            title='Train / test user intersection')

        logging.info('Process train / train item intersection')
        train_train_items = (
            train_df
            .select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_left"))
            .distinct()
            .join(train_df.select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_right")).distinct(),
               on="item", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())

        trtri = mk_intersection_matrix(train_train_items, columns_for_matrix)
        reporter.report(
            [''] + list(trtri.columns.values),
            [trtri.index.tolist()] + [column for _, column in trtri.iteritems()],
            title='Train / train item intersection'
        )

        logging.info('Process train / test item intersection')
        train_test_items = (
            train_df
            .select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_left"))
            .distinct()
            .join(test_df.select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_right")).distinct(),
               on="item", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())

        trtsi = mk_intersection_matrix(train_test_items, columns_for_matrix,
                                       horizontal_suffix=" train", vertical_suffix=" test")
        reporter.report(
            [''] + list(trtsi.columns.values),
            [trtsi.index.tolist()] + [column for _, column in trtsi.iteritems()],
            title='Train / test item intersection'
        )

        reporter.report_config(cfg)

    reporter.finish_document()
    logging.info('Splitting finished successfully')
Example #2
def split(intersections, csv_report):
    logging.info('Splitting started')

    if csv_report:
        if cfg.reporting.use_uuid:
            uuid = uuid4()
            reporter = CSVReport(cfg.reporting.csv_dir, uuid)
        else:
            reporter = CSVReport(cfg.reporting.csv_dir, None)
    else:
        reporter = ExcelReport(cfg.reporting.file)

    logging.info('Spark initialization')
    sc = SparkContext(cfg.spark.master, 'map_test: split')
    sqlContext = SQLContext(sc)

    logging.info('Source file reading')
    df = sqlContext.read.json(cfg.splitting.source_file)
    df = df.withColumn("Date", F.from_utc_timestamp("eventTime", "UTC"))

    users_with_event_count = df.groupBy(
        F.col("entityId").alias("user")).count()

    logging.info('Filter users with small number of events')
    min_events = 10
    users_with_few_events = (users_with_event_count.filter(
        "count < %d" % (min_events)).select(
            F.col("user").alias("user_with_few_events")))
    ndf = df.join(users_with_few_events,
                  F.col("entityId") == F.col("user_with_few_events"),
                  how="left_outer")
    df1 = ndf.filter("user_with_few_events is NULL").drop(
        "user_with_few_events")

    logging.info('Split data into train and test')
    train_df, test_df = split_data(df)
    train_df.write.json(cfg.splitting.train_file, mode="overwrite")
    test_df.write.json(cfg.splitting.test_file, mode="overwrite")

    train_df = train_df.select("entityId", "event", "targetEntityId").cache()
    test_df = test_df.select("entityId", "event", "targetEntityId").cache()

    logging.info('Calculation of different stat metrics of datasets')
    events_by_type = (df.groupBy("event").count().select(
        F.col("event"),
        F.col("count").alias("count_total")).toPandas())

    events_by_type_test = (test_df.groupBy("event").count().select(
        F.col("event"),
        F.col("count").alias("count_test")).toPandas().set_index("event"))

    events_by_type_train = (train_df.groupBy("event").count().select(
        F.col("event"),
        F.col("count").alias("count_train")).toPandas().set_index("event"))

    unique_users_by_event = (df.select(
        F.col("entityId"),
        F.col("event")).distinct().groupBy("event").count().select(
            F.col("event"),
            F.col("count").alias("unique_users_total")).toPandas().set_index(
                "event"))

    unique_users_by_event_train = (train_df.select(
        F.col("entityId"),
        F.col("event")).distinct().groupBy("event").count().select(
            F.col("event"),
            F.col("count").alias("unique_users_train")).toPandas().set_index(
                "event"))

    unique_users_by_event_test = (test_df.select(
        F.col("entityId"),
        F.col("event")).distinct().groupBy("event").count().select(
            F.col("event"),
            F.col("count").alias("unique_users_test")).toPandas().set_index(
                "event"))

    unique_items_by_event = (df.select(
        F.col("targetEntityId"),
        F.col("event")).distinct().groupBy("event").count().select(
            F.col("event"),
            F.col("count").alias("unique_items_total")).toPandas().set_index(
                "event"))

    unique_items_by_event_train = (train_df.select(
        F.col("targetEntityId"),
        F.col("event")).distinct().groupBy("event").count().select(
            F.col("event"),
            F.col("count").alias("unique_items_train")).toPandas().set_index(
                "event"))

    unique_items_by_event_test = (test_df.select(
        F.col("targetEntityId"),
        F.col("event")).distinct().groupBy("event").count().select(
            F.col("event"),
            F.col("count").alias("unique_items_test")).toPandas().set_index(
                "event"))

    logging.info('Calculate total counts')
    events = df.count()
    events_train = train_df.count()
    events_test = test_df.count()

    unique_users = df.select("entityId").distinct().count()
    unique_users_train = train_df.select("entityId").distinct().count()
    unique_users_test = test_df.select("entityId").distinct().count()

    unique_items = df.select(F.col("targetEntityId")).distinct().count()
    unique_items_train = train_df.select(
        F.col("targetEntityId")).distinct().count()
    unique_items_test = test_df.select(
        F.col("targetEntityId")).distinct().count()

    info_df = events_by_type
    dfs = [
        unique_users_by_event, unique_items_by_event, events_by_type_train,
        events_by_type_test, unique_users_by_event_train,
        unique_users_by_event_test, unique_items_by_event_train,
        unique_items_by_event_test
    ]

    for data_frame in dfs:
        info_df = info_df.join(data_frame, on="event")

    n_rows, n_cols = info_df.shape

    # totals
    info_df.loc[n_rows] = [
        'ANY EVENT', events, unique_users, unique_items, events_train,
        events_test, unique_users_train, unique_users_test, unique_items_train,
        unique_items_test
    ]

    info_df.insert(4, 'events per user', info_df.ix[:, 1] / info_df.ix[:, 2])
    info_df.insert(5, 'events per item', info_df.ix[:, 1] / info_df.ix[:, 3])

    logging.info('Create event stat worksheet')
    reporter.start_new_sheet('Events stat')
    reporter.report([
        'event', 'event count', 'unique users', 'unique items',
        'events per user', 'events per item', 'event count train',
        'event count test', 'unique users train', 'unique users test',
        'unique items train', 'unique items test'
    ], [column.tolist() for _, column in info_df.iteritems()],
                    selected_rows=[
                        next(info_df.iteritems())[1].tolist().index(
                            cfg.testing.primary_event)
                    ],
                    cfg=cfg)
    reporter.finish_sheet()

    if intersections:
        logging.info('Start intersections calculation')

        reporter.start_new_sheet('Intersections')

        columns_for_matrix = cfg.testing.events
        logging.info('Process train / train user intersection')
        train_train_users = (train_df.select(
            F.col("entityId").alias("user"),
            F.col("event").alias("event_left")).distinct().join(
                train_df.select(
                    F.col("entityId").alias("user"),
                    F.col("event").alias("event_right")).distinct(),
                on="user",
                how="inner").groupBy(["event_left",
                                      "event_right"]).count().collect())

        trtru = mk_intersection_matrix(train_train_users, columns_for_matrix)
        reporter.report([''] + list(trtru.columns.values),
                        [trtru.index.tolist()] +
                        [column for _, column in trtru.iteritems()],
                        title='Train / train user intersection')

        logging.info('Process train / test user intersection')
        train_test_users = (train_df.select(
            F.col("entityId").alias("user"),
            F.col("event").alias("event_left")).distinct().join(
                test_df.select(
                    F.col("entityId").alias("user"),
                    F.col("event").alias("event_right")).distinct(),
                on="user",
                how="inner").groupBy(["event_left",
                                      "event_right"]).count().collect())

        trtsu = mk_intersection_matrix(train_test_users,
                                       columns_for_matrix,
                                       horizontal_suffix=" train",
                                       vertical_suffix=" test")
        reporter.report([''] + list(trtsu.columns.values),
                        [trtsu.index.tolist()] +
                        [column for _, column in trtsu.iteritems()],
                        title='Train / test user intersection')

        logging.info('Process train / train item intersection')
        train_train_items = (train_df.select(
            F.col("targetEntityId").alias("item"),
            F.col("event").alias("event_left")).distinct().join(
                train_df.select(
                    F.col("targetEntityId").alias("item"),
                    F.col("event").alias("event_right")).distinct(),
                on="item",
                how="inner").groupBy(["event_left",
                                      "event_right"]).count().collect())

        trtri = mk_intersection_matrix(train_train_items, columns_for_matrix)
        reporter.report([''] + list(trtri.columns.values),
                        [trtri.index.tolist()] +
                        [column for _, column in trtri.iteritems()],
                        title='Train / train item intersection')

        logging.info('Process train / test item intersection')
        train_test_items = (train_df.select(
            F.col("targetEntityId").alias("item"),
            F.col("event").alias("event_left")).distinct().join(
                test_df.select(
                    F.col("targetEntityId").alias("item"),
                    F.col("event").alias("event_right")).distinct(),
                on="item",
                how="inner").groupBy(["event_left",
                                      "event_right"]).count().collect())

        trtsi = mk_intersection_matrix(train_test_items,
                                       columns_for_matrix,
                                       horizontal_suffix=" train",
                                       vertical_suffix=" test")
        reporter.report([''] + list(trtsi.columns.values),
                        [trtsi.index.tolist()] +
                        [column for _, column in trtsi.iteritems()],
                        title='Train / test item intersection')

        reporter.report_config(cfg)

    reporter.finish_document()
    logging.info('Splitting finished successfully')
Example #3
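# Assumed module-level context (not shown in this snippet): logging, uuid4, numpy as np,
# SparkContext, SQLContext, the project's cfg object, the CSVReport / ExcelReport reporters,
# and the run_map_test / run_map_test_dummy / get_nonzero helpers.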
def test(csv_report,
         all,
         dummy_test,
         separate_test,
         all_but_test,
         primary_pairs_test,
         custom_combos_test,
         non_zero_users_from_file):
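    """Run MAP@K evaluation scenarios on the held-out test data and report the results.

    Each boolean flag enables one scenario (dummy baselines, separate events,
    "all but" combinations, pairs with the primary event, custom combos);
    `all` enables all of them. csv_report selects CSV output instead of Excel;
    non_zero_users_from_file reuses a previously saved list of non-zero users.
    """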

    logging.info('Testing started')

    if csv_report:
        if cfg.reporting.use_uuid:
            uuid = uuid4()
            reporter = CSVReport(cfg.reporting.csv_dir, uuid)
        else:
            reporter = CSVReport(cfg.reporting.csv_dir, None)
    else:
        reporter = ExcelReport(cfg.reporting.file)

    logging.info('Spark context initialization')
    sc = SparkContext(cfg.spark.master, 'map_test: test')
    sqlContext = SQLContext(sc)

    logging.info('Test data reading')
    test_df = sqlContext.read.json(cfg.splitting.test_file).select("entityId", "event", "targetEntityId").cache()

    test_data = test_df.filter("event = '%s'" % (cfg.testing.primary_event)).collect()

    #non_zero_users = set([r[0] for r in test_data][500:650]) # Because all our users actually have 0.0 scores -- too little data

    if all or dummy_test:
        logging.info('Train data reading')

        train_df = sqlContext.read.json(cfg.splitting.train_file).select("entityId", "event", "targetEntityId").cache()
        counts = train_df.filter("event = '%s'" % (cfg.testing.primary_event)).groupBy("targetEntityId").count().collect()

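        # Rank items by how often they occur in the training data; probs is their empirical frequency distribution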
        sorted_rating = sorted([(row.asDict()['count'], row.asDict()['targetEntityId']) for row in counts], reverse=True)
        elements = np.array([item for cnt, item in sorted_rating])
        probs = np.array([cnt for cnt, item in sorted_rating])
        probs = 1.0 * probs / probs.sum()

        logging.info('Process dummy test')
        # case 1. Random sampling from items (uniform)
        dummy_uniform_res = run_map_test_dummy(test_data, items=elements, probs=probs,
                                               uniform=True, top=False, K=cfg.testing.map_k)

        # case 2. Random sampling from items (according to their distribution in training data)
        dummy_res = run_map_test_dummy(test_data, items=elements, probs=probs,
                                       uniform=False, top=False, K=cfg.testing.map_k)

        # case 3. Top-N items from training data
        dummy_top_res = run_map_test_dummy(test_data, items=elements, probs=probs,
                                           uniform=True, top=True, K=cfg.testing.map_k)

        reporter.start_new_sheet('Dummy MAP benchmark')
        reporter.report(
            ['', 'Random uniform', 'Random sampled from train', 'Top - N'],
            [[('MAP @ %d' % i) for i in range(1, len(dummy_res)+1)]] + [dummy_uniform_res, dummy_res, dummy_top_res],
            cfg=cfg
        )
        reporter.finish_sheet()

        logging.info('Process top 20 dummy test')
        scores = []
        for i in range(20):
            scores.append(run_map_test_dummy(test_data, items=elements[i:], uniform=True,
                                             top=True, K=1, no_progress=True)[0])

        reporter.start_new_sheet('Top-20 performance')
        reporter.report(
            ['Rank', 'MAP@1'],
            [list(range(1, 21)), scores],
            bold_first_column=False,
            cfg=cfg
        )
        reporter.finish_sheet()

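    # All remaining scenarios evaluate only users with a non-zero MAP score for the primary event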
    if all or separate_test or all_but_test or primary_pairs_test or custom_combos_test:
        logging.info('Non zero users')
        if non_zero_users_from_file:
            with open(cfg.testing.non_zero_users_file) as input:
                non_zero_users = set(input.read().split(','))
        else:
            _, r_data, _ = run_map_test(test_data, [cfg.testing.primary_event], test=False)
            non_zero_users = get_nonzero(r_data)
            with open(cfg.testing.non_zero_users_file, 'w') as output:
                output.write(','.join(non_zero_users))

    if all or separate_test:
        logging.info('Process "map separate events" test')
        columns = []
        for ev in cfg.testing.events:
            (r_scores, r_data, ipu) = run_map_test(test_data, [ev], users=non_zero_users, test=False)
            columns.append(r_scores + [len(non_zero_users)])

        first_column = [('MAP @ %d' % i) for i in range(1, len(columns[0]))] + ['non-zero users']

        reporter.start_new_sheet('MAP separate events')
        reporter.report(
            ['event'] + cfg.testing.events,
            [first_column] + columns,
            selected_columns=[cfg.testing.events.index(cfg.testing.primary_event) + 1],
            cfg=cfg
        )
        reporter.finish_sheet()

    if all or all_but_test:
        logging.info('Process "map all but..." test')
        events_scores = []
        for ev in cfg.testing.events:
            evs = list(cfg.testing.events)
            evs.remove(ev)
            (r_scores, r_data, ipu) = run_map_test(test_data, evs, users=non_zero_users, test=False)
            events_scores.append(r_scores + [len(non_zero_users)])

        evl = cfg.testing.events
        all_scores, r_data, ipu = run_map_test(test_data, evl, users=non_zero_users, test=False)
        all_scores.append(len(non_zero_users))

        first_column = [('MAP @ %d' % i) for i in range(1, len(all_scores))] + ['non-zero users']
        reporter.start_new_sheet('MAP all but...')
        reporter.report(
            ['event'] + cfg.testing.events + ['All'],
            [first_column] + events_scores + [all_scores],
            selected_columns=[cfg.testing.events.index(cfg.testing.primary_event) + 1],
            cfg=cfg
        )
        reporter.finish_sheet()

    if all or primary_pairs_test:
        logging.info('Process "map pairs with primary" test')
        columns = []
        events_without_primary = [event for event in cfg.testing.events if event != cfg.testing.primary_event]
        for event in events_without_primary:
            (r_scores, r_data, ipu) = run_map_test(test_data, [cfg.testing.primary_event, event],
                                                   users=non_zero_users, test=False)
            columns.append(r_scores + [len(non_zero_users)])

        first_column = [('MAP @ %d' % i) for i in range(1, len(columns[0]))] + ['non-zero users']

        reporter.start_new_sheet('MAP pairs with primary')
        reporter.report(
            ['event'] + events_without_primary,
            [first_column] + columns,
            cfg=cfg
        )
        reporter.finish_sheet()

    if all or custom_combos_test:
        logging.info('Process "custom combos" test')
        columns = []
        for event_group in cfg.testing.custom_combos.event_groups:
            if len(event_group) == 2 and cfg.testing.primary_event in event_group and primary_pairs_test:
                logging.warn("Report for group %s already generated in 'MAP pairs with primary'" % str(event_group))
                continue

            if len(event_group) == 1 and separate_test:
                logging.warn("Report for group %s already generated in 'MAP separate events'" % str(event_group))
                continue

            if len(event_group) >= len(cfg.testing.events) - 1 and all_but_test:
                logging.warn("Report for group %s already generated in 'All but...'" % str(event_group))
                continue

            (r_scores, r_data, ipu) = run_map_test(test_data, event_group,
                                                   users=non_zero_users, test=False)
            columns.append(r_scores + [len(non_zero_users)])

        first_column = [('MAP @ %d' % i) for i in range(1, len(columns[0]))] + ['non-zero users']

        reporter.start_new_sheet('Custom combos')
        reporter.report(
            ['event'] + [str([s.encode('utf-8') for s in group]) for group in cfg.testing.custom_combos.event_groups],
            [first_column] + columns,
            cfg=cfg
        )
        reporter.finish_sheet()

    reporter.finish_document()
    logging.info('Testing finished successfully')
Example #4
def test(csv_report, all, dummy_test, separate_test, all_but_test,
         primary_pairs_test, custom_combos_test, non_zero_users_from_file):

    logging.info('Testing started')

    if csv_report:
        if cfg.reporting.use_uuid:
            uuid = uuid4()
            reporter = CSVReport(cfg.reporting.csv_dir, uuid)
        else:
            reporter = CSVReport(cfg.reporting.csv_dir, None)
    else:
        reporter = ExcelReport(cfg.reporting.file)

    logging.info('Spark context initialization')
    sc = SparkContext(cfg.spark.master, 'map_test: test')
    sqlContext = SQLContext(sc)

    logging.info('Test data reading')
    test_df = sqlContext.read.json(cfg.splitting.test_file).select(
        "entityId", "event", "targetEntityId").cache()

    test_data = test_df.filter("event = '%s'" %
                               (cfg.testing.primary_event)).collect()

    #non_zero_users = set([r[0] for r in test_data][500:650]) # Because all our users actually have 0.0 scores -- too little data

    if all or dummy_test:
        logging.info('Train data reading')

        train_df = sqlContext.read.json(cfg.splitting.train_file).select(
            "entityId", "event", "targetEntityId").cache()
        counts = train_df.filter("event = '%s'" %
                                 (cfg.testing.primary_event)).groupBy(
                                     "targetEntityId").count().collect()

        sorted_rating = sorted(
            [(row.asDict()['count'], row.asDict()['targetEntityId'])
             for row in counts],
            reverse=True)
        elements = np.array([item for cnt, item in sorted_rating])
        probs = np.array([cnt for cnt, item in sorted_rating])
        probs = 1.0 * probs / probs.sum()

        logging.info('Process dummy test')
        # case 1. Random sampling from items (uniform)
        dummy_uniform_res = run_map_test_dummy(test_data,
                                               items=elements,
                                               probs=probs,
                                               uniform=True,
                                               top=False,
                                               K=cfg.testing.map_k)

        # case 2. Random sampling from items (according to their distribution in training data)
        dummy_res = run_map_test_dummy(test_data,
                                       items=elements,
                                       probs=probs,
                                       uniform=False,
                                       top=False,
                                       K=cfg.testing.map_k)

        # case 3. Top-N items from training data
        dummy_top_res = run_map_test_dummy(test_data,
                                           items=elements,
                                           probs=probs,
                                           uniform=True,
                                           top=True,
                                           K=cfg.testing.map_k)

        reporter.start_new_sheet('Dummy MAP benchmark')
        reporter.report(
            ['', 'Random uniform', 'Random sampled from train', 'Top - N'],
            [[('MAP @ %d' % i) for i in range(1, len(dummy_res) + 1)]] +
            [dummy_uniform_res, dummy_res, dummy_top_res],
            cfg=cfg)
        reporter.finish_sheet()

        logging.info('Process top 20 dummy test')
        scores = []
        for i in range(20):
            scores.append(
                run_map_test_dummy(test_data,
                                   items=elements[i:],
                                   uniform=True,
                                   top=True,
                                   K=1,
                                   no_progress=True)[0])

        reporter.start_new_sheet('Top-20 performance')
        reporter.report(['Rank', 'MAP@1'], [list(range(1, 21)), scores],
                        bold_first_column=False,
                        cfg=cfg)
        reporter.finish_sheet()

    if all or separate_test or all_but_test or primary_pairs_test or custom_combos_test:
        logging.info('Non zero users')
        if non_zero_users_from_file:
            with open(cfg.testing.non_zero_users_file) as input:
                non_zero_users = set(input.read().split(','))
        else:
            _, r_data, _ = run_map_test(test_data, [cfg.testing.primary_event],
                                        test=False)
            non_zero_users = get_nonzero(r_data)
            with open(cfg.testing.non_zero_users_file, 'w') as output:
                output.write(','.join(non_zero_users))

    if all or separate_test:
        logging.info('Process "map separate events" test')
        columns = []
        for ev in cfg.testing.events:
            (r_scores, r_data, ipu) = run_map_test(test_data, [ev],
                                                   users=non_zero_users,
                                                   test=False)
            columns.append(r_scores + [len(non_zero_users)])

        first_column = ([('MAP @ %d' % i) for i in range(1, len(columns[0]))]
                        + ['non-zero users'])

        reporter.start_new_sheet('MAP separate events')
        reporter.report(
            ['event'] + cfg.testing.events, [first_column] + columns,
            selected_columns=[
                cfg.testing.events.index(cfg.testing.primary_event) + 1
            ],
            cfg=cfg)
        reporter.finish_sheet()

    if all or all_but_test:
        logging.info('Process "map all but..." test')
        events_scores = []
        for ev in cfg.testing.events:
            evs = list(cfg.testing.events)
            evs.remove(ev)
            (r_scores, r_data, ipu) = run_map_test(test_data,
                                                   evs,
                                                   users=non_zero_users,
                                                   test=False)
            events_scores.append(r_scores + [len(non_zero_users)])

        evl = cfg.testing.events
        all_scores, r_data, ipu = run_map_test(test_data,
                                               evl,
                                               users=non_zero_users,
                                               test=False)
        all_scores.append(len(non_zero_users))

        first_column = ([('MAP @ %d' % i) for i in range(1, len(all_scores))]
                        + ['non-zero users'])
        reporter.start_new_sheet('MAP all but...')
        reporter.report(
            ['event'] + cfg.testing.events + ['All'],
            [first_column] + events_scores + [all_scores],
            selected_columns=[
                cfg.testing.events.index(cfg.testing.primary_event) + 1
            ],
            cfg=cfg)
        reporter.finish_sheet()

    if all or primary_pairs_test:
        logging.info('Process "map pairs with primary" test')
        columns = []
        events_without_primary = [
            event for event in cfg.testing.events
            if event != cfg.testing.primary_event
        ]
        for event in events_without_primary:
            (r_scores, r_data,
             ipu) = run_map_test(test_data, [cfg.testing.primary_event, event],
                                 users=non_zero_users,
                                 test=False)
            columns.append(r_scores + [len(non_zero_users)])

        first_column = ([('MAP @ %d' % i) for i in range(1, len(columns[0]))]
                        + ['non-zero users'])

        reporter.start_new_sheet('MAP pairs with primary')
        reporter.report(['event'] + events_without_primary,
                        [first_column] + columns,
                        cfg=cfg)
        reporter.finish_sheet()

    if all or custom_combos_test:
        logging.info('Process "custom combos" test')
        columns = []
        for event_group in cfg.testing.custom_combos.event_groups:
            if (len(event_group) == 2 and cfg.testing.primary_event in event_group
                    and primary_pairs_test):
                logging.warn(
                    "Report for group %s already generated in 'MAP pairs with primary'"
                    % str(event_group))
                continue

            if len(event_group) == 1 and separate_test:
                logging.warn(
                    "Report for group %s already generated in 'MAP separate events'"
                    % str(event_group))
                continue

            if len(event_group) >= len(cfg.testing.events) - 1 and all_but_test:
                logging.warn(
                    "Report for group %s already generated in 'All but...'" %
                    str(event_group))
                continue

            (r_scores, r_data, ipu) = run_map_test(test_data,
                                                   event_group,
                                                   users=non_zero_users,
                                                   test=False)
            columns.append(r_scores + [len(non_zero_users)])

        first_column = ([('MAP @ %d' % i) for i in range(1, len(columns[0]))]
                        + ['non-zero users'])

        reporter.start_new_sheet('Custom combos')
        reporter.report(['event'] + [
            str([s.encode('utf-8') for s in group])
            for group in cfg.testing.custom_combos.event_groups
        ], [first_column] + columns,
                        cfg=cfg)
        reporter.finish_sheet()

    reporter.finish_document()
    logging.info('Testing finished successfully')