def main(files):
    """Evaluate team vandalism scores on the test set.

    Loads the feature data, restricts it to the test window
    [TEST_SET_START, TAIL_SET_START), joins every team's score file on
    revision id, checks that the score count matches the test-set size,
    and runs the team evaluations (including a META ensemble and a
    cleaned-data pass).

    :param files: dict of input files; files['teams'] maps team name to
        that team's score file.  # presumably produced by the CLI layer — TODO confirm
    :raises Exception: if the number of scores does not match the
        test-set size after dropping NA rows.
    """
    utils.print_system_info()
    utils.init_pandas()
    _logger.info("FILES=" + str(files))

    # Load feature file for some statistics
    features = featurelist.get_meta_list() + featurelist.get_label_list()
    df = loading.load_df(files, featurelist.get_columns(features))
    test_set_start = DataSet.get_index_for_revision_id_from_df(
        df, constants.TEST_SET_START)
    tail_set_start = DataSet.get_index_for_revision_id_from_df(
        df, constants.TAIL_SET_START)
    df = df[test_set_start:tail_set_start]
    data = DataSet()
    data.set_meta(df.iloc[:, :-1])
    # Last column is treated as the label; everything before it as meta.
    data.set_Y(df.iloc[:, -1].astype(np.float32))
    # Features are irrelevant for score evaluation; use a dummy X.
    data.set_X(np.zeros((len(data), 1)))
    _logger.debug("Length of data: " + str(len(data)))

    # Load scores, one column per team, indexed by revision id.
    scores = pd.DataFrame()
    scores[REVISION_ID] = data.get_revision_ids()
    scores.set_index(REVISION_ID, inplace=True)
    for team, score_file in files['teams'].items():
        team_scores = load_vandalism_scores(score_file)
        team_scores.set_index(REVISION_ID, inplace=True)
        scores[team] = team_scores[VANDALISM_SCORE]
    scores.dropna(inplace=True)
    if len(data) != len(scores):
        raise Exception(
            "number of scores does not fit test set size: " +
            "len(data)={0} but len(scores)={1}".format(len(data), len(scores)))
    # Bug fix: previously logged len(data) under the label "Length of scores".
    _logger.debug("Length of scores: " + str(len(scores)))

    # Evaluate teams
    meta_scores = compute_meta_scores(scores)
    scores = pd.concat([scores, meta_scores], axis=1)
    evaluate_teams(scores, data, save_scores=['META'])
    evaluate_teams_over_time(scores, data, EVALUATION_OVER_TIME_SUFFIX)
    scores, data = clean_data(scores, data)
    evaluate_teams(scores, data, suffix=EVALUATION_RESULTS_CLEANED_SUFFIX)
def get_splitting_indices(data, use_test_set):
    """Return (training, validation, test) start indices within *data*.

    When ``use_test_set`` is true, the validation window is shifted so
    that the test set itself plays the role of the validation set
    (validation starts at TEST_SET_START and "test" at TAIL_SET_START);
    otherwise the regular VALIDATION/TEST boundaries are used.

    :param data: data frame the revision-id boundaries are resolved against.
    :param use_test_set: evaluate on the test set instead of the
        validation set.
    :return: tuple of three integer positions in *data*.
    """
    training_set_start = constants.TRAINING_SET_START
    if use_test_set:
        validation_set_start = constants.TEST_SET_START
        test_set_start = constants.TAIL_SET_START
    else:
        # Bug fix: a stray trailing comma previously turned this into a
        # 1-tuple (constants.VALIDATION_SET_START,), which would break
        # the revision-id lookup below.
        validation_set_start = constants.VALIDATION_SET_START
        test_set_start = constants.TEST_SET_START

    # transform revision id to index in data set
    training_set_start = DataSet.get_index_for_revision_id_from_df(
        data, training_set_start)
    validation_set_start = DataSet.get_index_for_revision_id_from_df(
        data, validation_set_start)
    test_set_start = DataSet.get_index_for_revision_id_from_df(
        data, test_set_start)

    return training_set_start, validation_set_start, test_set_start
def compute_data_frame(data):
    """Build a statistics table with one row group per data split.

    Slices *data* into training/validation/test ranges (the training
    range begins at index 0) and concatenates the per-split statistics
    rows into a single frame.

    :param data: data frame covering all splits, ordered by revision.
    :return: concatenated statistics rows for Training, Validation, Test.
    """
    _logger.debug("Splitting statistics...")

    def _index_of(revision_id):
        # Resolve a boundary revision id to its positional index in *data*.
        return DataSet.get_index_for_revision_id_from_df(data, revision_id)

    # Split boundaries; statistics are computed from the very start of
    # the data set, so the first boundary is 0.
    boundaries = [
        0,
        _index_of(constants.VALIDATION_SET_START),
        _index_of(constants.TEST_SET_START),
        _index_of(constants.TAIL_SET_START),
    ]
    labels = ['Training', 'Validation', 'Test']

    rows = [
        compute_splitting_statistics_row(data[start:stop], label)
        for start, stop, label in zip(boundaries, boundaries[1:], labels)
    ]
    return pd.concat(rows, axis=0)
def omit_holdout_df(df):
    """Return *df* with the holdout (tail) rows removed.

    Everything at or after the TAIL_SET_START revision is dropped.
    """
    cutoff = DataSet.get_index_for_revision_id_from_df(
        df, constants.TAIL_SET_START)
    return df[:cutoff]
def _compute_backpressure_statistics(data):
    """Log how many test-set revisions are "revealed" by later activity.

    For every revision in the test window, scans up to the next 15
    revisions: if one of them touches the same item, the revision is
    marked revealed — ``True`` when that later revision is a rollback
    (either by the same user within the session, or at the start of the
    next session by a different user), ``False`` when a different user's
    next touch is not a rollback.  Revisions whose item is not touched
    again within the window stay NaN (not revealed).

    :param data: data frame containing at least the revision id, item id,
        user name, revision action and rollback-reverted columns.
    """
    # Restrict computation to test dataset
    test_set_start_index = \
        DataSet.get_index_for_revision_id_from_df(data, constants.TEST_SET_START)
    tail_set_start_index = \
        DataSet.get_index_for_revision_id_from_df(data, constants.TAIL_SET_START)
    data = data[test_set_start_index:tail_set_start_index]
    data = data[[
        REVISION_ID, ITEM_ID, USER_NAME, REVISION_ACTION, ROLLBACK_REVERTED
    ]]
    # Positional indices into the raw numpy rows below.
    REVISION_ID_INDEX = 0  # noqa
    ITEM_ID_INDEX = 1
    USER_NAME_INDEX = 2
    REVISION_ACTION_INDEX = 3
    ROLLBACK_REVERTED_INDEX = 4  # noqa
    data = data.values
    result = np.full(len(data), np.nan)
    # Fix: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0 — collect rows in a list and build the frame once at the end.
    revealed_rows = []
    for i in range(len(data)):
        user_name = data[i][USER_NAME_INDEX]
        item_id = data[i][ITEM_ID_INDEX]
        prev_rev = data[i]
        # Look ahead at most 15 subsequent revisions.
        for j in range(i + 1, min(len(data), i + 16)):
            rev = data[j]
            if rev[ITEM_ID_INDEX] == item_id:
                # Rollback within same session (same item id and same user name)
                if rev[USER_NAME_INDEX] == user_name:
                    if rev[REVISION_ACTION_INDEX] == 'rollback':
                        result[i] = True
                        revealed_rows.append(pd.Series(prev_rev))
                        break
                # Rollback at beginning of next session
                else:
                    if rev[REVISION_ACTION_INDEX] == 'rollback':
                        result[i] = True
                        revealed_rows.append(pd.Series(prev_rev))
                        break
                    else:
                        result[i] = False
                        revealed_rows.append(pd.Series(prev_rev))
                        break
    revealed = (pd.DataFrame(revealed_rows).reset_index(drop=True)
                if revealed_rows else pd.DataFrame())  # noqa: F841
    n_revisions = result.size
    n_revealed_total = (~(np.isnan(result))).sum()
    # NOTE(review): result[i] is True when a later rollback was found,
    # yet True is counted as "regular" and False as "vandalism" below —
    # this looks inverted; confirm the intended semantics before relying
    # on these two log lines.
    n_revealed_regular = (result == True).sum()  # noqa
    n_revealed_vandalism = (result == False).sum()  # noqa
    _logger.info('n_revisions: ' + str(n_revisions))
    _logger.info('n_revealed_total: ' + str(n_revealed_total))
    _logger.info('n_revealed_vandalism: ' + str(n_revealed_vandalism))
    _logger.info('n_revealed_regular: ' + str(n_revealed_regular))