Пример #1
0
def main(files):
    """Evaluate all teams' vandalism scores on the test set.

    Parameters
    ----------
    files : dict
        Configuration mapping; ``files['teams']`` maps each team name to
        the path of that team's score file.

    Raises
    ------
    ValueError
        If the number of loaded scores does not match the test set size.
    """
    utils.print_system_info()
    utils.init_pandas()

    _logger.info("FILES=" + str(files))

    # Load feature file for some statistics
    features = featurelist.get_meta_list() + featurelist.get_label_list()
    df = loading.load_df(files, featurelist.get_columns(features))
    # Restrict the data frame to the test set [TEST_SET_START, TAIL_SET_START).
    test_set_start = DataSet.get_index_for_revision_id_from_df(
        df, constants.TEST_SET_START)
    tail_set_start = DataSet.get_index_for_revision_id_from_df(
        df, constants.TAIL_SET_START)
    df = df[test_set_start:tail_set_start]
    data = DataSet()
    data.set_meta(df.iloc[:, :-1])
    data.set_Y(df.iloc[:, -1].astype(np.float32))
    # X is not used for score evaluation; supply a dummy feature matrix.
    data.set_X(np.zeros((len(data), 1)))
    _logger.debug("Length of data: " + str(len(data)))

    # Load every team's scores, aligned on revision id.
    scores = pd.DataFrame()
    scores[REVISION_ID] = data.get_revision_ids()
    scores.set_index(REVISION_ID, inplace=True)

    for team, score_file in files['teams'].items():
        team_scores = load_vandalism_scores(score_file)
        team_scores.set_index(REVISION_ID, inplace=True)
        scores[team] = team_scores[VANDALISM_SCORE]

    # Drop revisions for which at least one team provided no score.
    scores.dropna(inplace=True)
    if len(data) != len(scores):
        # ValueError is a subclass of Exception, so existing handlers still work.
        raise ValueError(
            "number of scores does not fit test set size: " +
            "len(data)={0} but len(scores)={1}".format(len(data), len(scores)))

    # BUG FIX: previously logged len(data) here instead of len(scores).
    _logger.debug("Length of scores: " + str(len(scores)))

    # Evaluate teams
    meta_scores = compute_meta_scores(scores)
    scores = pd.concat([scores, meta_scores], axis=1)

    evaluate_teams(scores, data, save_scores=['META'])
    evaluate_teams_over_time(scores, data, EVALUATION_OVER_TIME_SUFFIX)

    scores, data = clean_data(scores, data)
    evaluate_teams(scores, data, suffix=EVALUATION_RESULTS_CLEANED_SUFFIX)
def get_splitting_indices(data, use_test_set):
    """Return (training, validation, test) start indices for *data*.

    Parameters
    ----------
    data : pandas.DataFrame
        Data frame indexed/ordered by revision, as expected by
        ``DataSet.get_index_for_revision_id_from_df``.
    use_test_set : bool
        If True, validate on the test set (the tail set then marks the end);
        otherwise use the regular validation/test boundaries.

    Returns
    -------
    tuple of int
        ``(training_set_start, validation_set_start, test_set_start)`` as
        positional indices into *data*.
    """
    training_set_start = constants.TRAINING_SET_START

    if use_test_set:
        validation_set_start = constants.TEST_SET_START
        test_set_start = constants.TAIL_SET_START
    else:
        # BUG FIX: a stray trailing comma previously made this a one-element
        # tuple, breaking the revision-id-to-index lookup below.
        validation_set_start = constants.VALIDATION_SET_START
        test_set_start = constants.TEST_SET_START

    # transform revision id to index in data set
    training_set_start = DataSet.get_index_for_revision_id_from_df(
        data, training_set_start)
    validation_set_start = DataSet.get_index_for_revision_id_from_df(
        data, validation_set_start)
    test_set_start = DataSet.get_index_for_revision_id_from_df(
        data, test_set_start)

    return training_set_start, validation_set_start, test_set_start
Пример #3
0
    def compute_data_frame(data):
        """Build per-split statistics (Training/Validation/Test) as one frame."""
        _logger.debug("Splitting statistics...")

        # Map the split boundaries (revision ids) to positional indices.
        start_of_training = 0  # compute statistics from start of dataset
        start_of_validation = DataSet.get_index_for_revision_id_from_df(
            data, constants.VALIDATION_SET_START)
        start_of_test = DataSet.get_index_for_revision_id_from_df(
            data, constants.TEST_SET_START)
        start_of_tail = DataSet.get_index_for_revision_id_from_df(
            data, constants.TAIL_SET_START)

        # One (subset, label) pair per split, in report order.
        splits = [
            (data[start_of_training:start_of_validation], 'Training'),
            (data[start_of_validation:start_of_test], 'Validation'),
            (data[start_of_test:start_of_tail], 'Test'),
        ]
        rows = [compute_splitting_statistics_row(subset, label)
                for subset, label in splits]
        return pd.concat(rows, axis=0)
def omit_holdout_df(df):
    """Omit the holdout dataframe.

    Returns *df* truncated just before the start of the tail (holdout) set.
    """
    cutoff = DataSet.get_index_for_revision_id_from_df(
        df, constants.TAIL_SET_START)
    return df[:cutoff]
Пример #5
0
def _compute_backpressure_statistics(data):
    """Compute and log how many test-set revisions are "revealed" by a
    subsequent rollback (or lack of one) within a bounded look-ahead window.

    For each revision, scan up to 15 following revisions: a rollback on the
    same item marks the revision as revealed-by-rollback (True); the start
    of a different user's session on the same item without a rollback marks
    it as revealed-as-regular (False); otherwise it stays undecided (NaN).

    Parameters
    ----------
    data : pandas.DataFrame
        Revision data containing at least the REVISION_ID, ITEM_ID,
        USER_NAME, REVISION_ACTION, and ROLLBACK_REVERTED columns.
    """
    # Restrict computation to test dataset
    test_set_start_index = \
        DataSet.get_index_for_revision_id_from_df(data, constants.TEST_SET_START)
    tail_set_start_index = \
        DataSet.get_index_for_revision_id_from_df(data, constants.TAIL_SET_START)
    data = data[test_set_start_index:tail_set_start_index]

    data = data[[
        REVISION_ID, ITEM_ID, USER_NAME, REVISION_ACTION, ROLLBACK_REVERTED
    ]]

    # Column positions after the selection above.
    REVISION_ID_INDEX = 0  # noqa
    ITEM_ID_INDEX = 1
    USER_NAME_INDEX = 2
    REVISION_ACTION_INDEX = 3
    ROLLBACK_REVERTED_INDEX = 4  # noqa

    data = data.values

    # NaN = undecided, 1.0 = revealed by rollback, 0.0 = revealed as regular.
    result = np.full(len(data), np.nan)
    # FIX: DataFrame.append was removed in pandas 2.0 and was O(n^2);
    # collect rows in a list and build the frame once at the end.
    revealed_rows = []

    for i in range(len(data)):
        cur_rev = data[i]
        user_name = cur_rev[USER_NAME_INDEX]
        item_id = cur_rev[ITEM_ID_INDEX]

        # Bounded look-ahead of at most 15 subsequent revisions.
        for j in range(i + 1, min(len(data), i + 16)):
            rev = data[j]
            if rev[ITEM_ID_INDEX] != item_id:
                continue
            if rev[REVISION_ACTION_INDEX] == 'rollback':
                # Rollback within the same session (same user) or at the
                # beginning of the next session — both reveal the revision.
                result[i] = True
                revealed_rows.append(pd.Series(cur_rev))
                break
            if rev[USER_NAME_INDEX] != user_name:
                # The next session started without a rollback.
                result[i] = False
                revealed_rows.append(pd.Series(cur_rev))
                break
            # Same user continues the session without a rollback: keep scanning.

    # Kept for parity with the original implementation (currently unused).
    revealed = pd.DataFrame(revealed_rows)  # noqa: F841

    n_revisions = result.size
    n_revealed_total = (~(np.isnan(result))).sum()
    n_revealed_regular = (result == True).sum()  # noqa
    n_revealed_vandalism = (result == False).sum()  # noqa

    _logger.info('n_revisions: ' + str(n_revisions))
    _logger.info('n_revealed_total: ' + str(n_revealed_total))
    _logger.info('n_revealed_vandalism: ' + str(n_revealed_vandalism))
    _logger.info('n_revealed_regular: ' + str(n_revealed_regular))