示例#1
0
def prepare_test(in_path, score_path, out_path):
    """Join a sentence-pair file with its score file and save the result.

    Both inputs are read as header-less, tab-separated files. The content of
    the score file is attached as a ``score`` column, the row index becomes
    the ``pair_id`` column, and the resulting table is handed to ``to_file``
    together with ``out_path``.

    The final four-name column assignment only succeeds when the sentence
    file has exactly two columns.

    Args:
        in_path: Path of the tab-separated sentence-pair file.
        score_path: Path of the tab-separated score file.
        out_path: Destination passed through to ``to_file``.

    Returns:
        The combined DataFrame with columns ``pair_id``, ``sentence_1``,
        ``sentence_2`` and ``score``.
    """
    tsv_options = {'sep': '\t', 'header': None}

    pairs = pd.read_csv(in_path, **tsv_options)
    pairs['score'] = pd.read_csv(score_path, **tsv_options)

    # reset_index() moves the row index into a leading column, which is
    # then named pair_id by the rename below.
    pairs = pairs.reset_index()
    pairs.columns = ['pair_id', 'sentence_1', 'sentence_2', 'score']

    to_file(pairs, out_path)
    return pairs
def report(config, name):
    """Cross-validate one baseline on the raw dataset and log its correlations.

    The raw CSV at ``config.data.raw_path`` is loaded, split into 10
    stratified folds (stratified on the integer part of ``score``), and for
    every fold a model is trained with ``train_model``. Sentence
    similarities from ``sentence_similarity`` are compared against the
    ``score`` column with ``pearson_correlation`` on both the train and the
    test part. Per-fold values and their means over all folds are logged.

    Each fold passes its train/test frames to ``to_file`` with the same
    ``config.data.train_path`` / ``config.data.test_path``, so (presumably)
    the files on disk reflect the last fold only -- confirm against
    ``to_file``.

    Args:
        config: Configuration object; ``config.data.raw_path``,
            ``config.data.train_path``, ``config.data.test_path`` and
            ``config.data.random_state`` are read here, and the whole object
            is forwarded to ``train_model``.
        name: Baseline name, forwarded to ``train_model`` and used in the
            log messages.

    Returns:
        None. Results are reported through ``logger`` only.
    """
    df = pd.read_csv(hydra.utils.to_absolute_path(config.data.raw_path))
    df.columns = ['pair_id', 'sentence_1', 'sentence_2', 'a_1', 'a2', 'a3', 'a4', 'a5', 'score']

    # Truncated score serves as the class label for stratification.
    df['score_bin'] = df.score.astype(int)

    # random_state only has an effect when shuffling is enabled; recent
    # scikit-learn versions raise ValueError if it is set with shuffle=False.
    cross_validation = StratifiedKFold(
        n_splits=10,
        shuffle=True,
        random_state=config.data.random_state,
    )

    # The output locations do not change between folds, so resolve them once.
    train_path = hydra.utils.to_absolute_path(config.data.train_path)
    test_path = hydra.utils.to_absolute_path(config.data.test_path)

    results = []
    for i, (train_index, test_index) in enumerate(cross_validation.split(df, df.score_bin.values)):
        # Copy the slices so adding the 'similarity' column below neither
        # touches ``df`` nor triggers pandas' SettingWithCopyWarning.
        train_df = df.iloc[train_index].copy()
        test_df = df.iloc[test_index].copy()

        to_file(train_df, train_path)
        to_file(test_df, test_path)

        model = train_model(config, train_df, name)

        train_df['similarity'] = sentence_similarity(model, train_df)
        test_df['similarity'] = sentence_similarity(model, test_df)

        train_correlation = pearson_correlation(train_df.similarity, train_df.score)
        test_correlation = pearson_correlation(test_df.similarity, test_df.score)
        logger.info('Baseline %s', name)
        logger.info('Cross Validation Split %s', i)
        logger.info('Train correlation %s', train_correlation)
        logger.info('Test correlation %s', test_correlation)
        results.append({'train': train_correlation, 'test': test_correlation})

    result_df = pd.DataFrame(results)

    # Summary: mean correlation over all folds.
    logger.info('Baseline %s', name)
    logger.info('BIOSSES Train correlation %s', result_df.train.mean())
    logger.info('BIOSSES Test correlation %s', result_df.test.mean())