def prepare_test(in_path, score_path, out_path):
    """Combine a sentence-pair file with its score file and save the result.

    Both inputs are read as header-less, tab-separated files. The content of
    ``score_path`` is attached as a ``score`` column, the row index is turned
    into a leading column, and the four resulting columns are renamed to
    ``pair_id``, ``sentence_1``, ``sentence_2`` and ``score``.

    Args:
        in_path: Path of the tab-separated sentence file (expected to hold
            two columns, so that four columns exist after the merge).
        score_path: Path of the tab-separated score file.
        out_path: Destination handed to ``to_file`` together with the frame.

    Returns:
        The combined ``pandas.DataFrame`` that was passed to ``to_file``.
    """
    sentences = pd.read_csv(in_path, sep='\t', header=None)
    scores = pd.read_csv(score_path, sep='\t', header=None)
    sentences['score'] = scores

    # reset_index() moves the positional index into the first column, which
    # becomes ``pair_id`` once the columns are renamed below.
    combined = sentences.reset_index()
    combined.columns = ['pair_id', 'sentence_1', 'sentence_2', 'score']

    to_file(combined, out_path)
    return combined
def report(config, name):
    """Cross-validate a baseline and log its Pearson correlations.

    The raw CSV at ``config.data.raw_path`` is loaded, its columns are
    renamed, and the integer part of ``score`` is stored as ``score_bin`` to
    serve as the stratification label for a 10-fold ``StratifiedKFold``.
    For every fold the train/test partitions are written with ``to_file``,
    a model is trained on the train partition, sentence similarities are
    computed for both partitions and their correlation with ``score`` is
    logged. After all folds, the mean train and test correlations are logged.

    Args:
        config: Configuration object providing ``data.raw_path``,
            ``data.train_path``, ``data.test_path`` and ``data.random_state``;
            it is also forwarded unchanged to ``train_model``.
        name: Baseline name, forwarded to ``train_model`` and used in the
            log messages.

    Returns:
        None. Results are only reported through ``logger``.
    """
    df = pd.read_csv(hydra.utils.to_absolute_path(config.data.raw_path))
    df.columns = ['pair_id', 'sentence_1', 'sentence_2',
                  'a_1', 'a2', 'a3', 'a4', 'a5', 'score']
    # Bucket the scores by their integer part so folds can be stratified.
    df['score_bin'] = df.score.map(int)

    # A seed only has an effect when shuffling is enabled, and
    # scikit-learn >= 0.24 raises ValueError for a non-None random_state
    # combined with shuffle=False. Shuffle whenever a seed is configured;
    # keep the unshuffled behaviour when no seed is given.
    random_state = config.data.random_state
    cross_validation = StratifiedKFold(
        n_splits=10,
        shuffle=random_state is not None,
        random_state=random_state,
    )

    # The same two output paths are reused on every fold.
    train_path = hydra.utils.to_absolute_path(config.data.train_path)
    test_path = hydra.utils.to_absolute_path(config.data.test_path)

    results = []
    splits = cross_validation.split(df, df.score_bin.values)
    for i, (train_index, test_index) in enumerate(splits):
        # Explicit copies: a column is added to each partition below, which
        # would otherwise trigger pandas' SettingWithCopyWarning.
        train_df = df.iloc[train_index].copy()
        test_df = df.iloc[test_index].copy()

        # The partitions are written before the similarity column is added.
        to_file(train_df, train_path)
        to_file(test_df, test_path)

        model = train_model(config, train_df, name)
        train_df['similarity'] = sentence_similarity(model, train_df)
        test_df['similarity'] = sentence_similarity(model, test_df)

        train_correlation = pearson_correlation(train_df.similarity,
                                                train_df.score)
        test_correlation = pearson_correlation(test_df.similarity,
                                               test_df.score)

        logger.info('Baseline %s', name)
        logger.info('Cross Validation Split %s', i)
        logger.info('Train correlation %s', train_correlation)
        logger.info('Test correlation %s', test_correlation)
        results.append({'train': train_correlation, 'test': test_correlation})

    # Average the per-fold correlations for the final summary.
    result_df = pd.DataFrame(results)
    logger.info('Baseline %s', name)
    logger.info('BIOSSES Train correlation %s', result_df.train.mean())
    logger.info('BIOSSES Test correlation %s', result_df.test.mean())