def predict(self, dataset, summaries, groups, test_groups):
    """Score *summaries* with the fitted linear regressor.

    When *test_groups* is None the topic list is derived from every
    group name that contains *dataset* (cross-topic prediction).
    """
    if test_groups is None:
        test_groups = sorted({name.split('-')[1] for name in groups if dataset in name})
    feature_matrix = readFeatures(self.feature_types, dataset, summaries, groups, test_groups)
    return self.lin_reg.predict(feature_matrix)
def train(self, dataset, summaries, groups, train_groups, targets):
    """Fit the linear regressor on features of the training groups.

    With *train_groups* given, features come from a single dataset;
    otherwise *dataset* is iterated and per-dataset feature blocks are
    stacked row-wise.
    """
    if train_groups is not None:
        feature_matrix = readFeatures(self.feature_types, dataset, summaries, groups, train_groups)
    else:
        feature_matrix = None
        for ds_name in dataset:
            topics = [grp.split('-')[1] for grp in groups if ds_name in grp]
            block = readFeatures(self.feature_types, ds_name, summaries, groups, topics)
            # First block starts the matrix; later blocks are appended row-wise.
            feature_matrix = np.copy(block) if feature_matrix is None else np.append(feature_matrix, block, axis=0)
    self.lin_reg.fit(feature_matrix, targets)
def predict(self, dataset, summaries, groups, test_groups):
    """Score summaries with the best model weights found during training."""
    if test_groups is None:
        test_groups = sorted({name.split('-')[1] for name in groups if dataset in name})
    features = readFeatures(self.feature_types, dataset, summaries, groups, test_groups)
    weight = self.best_weights['combination.weight'].data.numpy()
    bias = self.best_weights['combination.bias'].data.numpy()
    if 'entropy' in self.loss_type:
        # Two-class preference head: score with the difference of the two rows.
        delta_w = weight[1] - weight[0]
        scores = np.dot(features, delta_w) + bias[1] - bias[0]
    else:
        scores = np.dot(features, weight.reshape(-1, 1)) + bias
    # NOTE(review): assumes each element of `scores` is subscriptable; in the
    # entropy branch np.dot with a 1-D weight yields scalars — confirm shapes.
    return normaliseList([row[0] for row in scores])
def predict(self, dataset, summaries, groups, test_groups):
    """Linear scores of summaries under the learnt weight vector, normalised."""
    if test_groups is None:
        test_groups = sorted({name.split('-')[1] for name in groups if dataset in name})
    feature_matrix = readFeatures(self.feature_types, dataset, summaries, groups, test_groups)
    scores = np.dot(feature_matrix, np.array(self.best_weights))
    return normaliseList(scores)
def train(self, dataset, summaries, groups, train_groups, targets, lrate,
          edge='no', sorted_idxs=None):
    """Train the neural rewarder on summary features.

    Parameters
    ----------
    dataset : dataset name (in-topic) or iterable of names (cross-topic)
    summaries, groups : parallel summary/group structures for readFeatures
    train_groups : groups to train on, or None for cross-topic training
    targets : reference rewards aligned with the feature rows
    lrate : learning rate forwarded to the *_train step functions
    edge : pair-sampling strategy; 'no' selects random pair sampling.
        BUGFIX: the default used to be ``edge=False``, which crashed on
        ``edge.lower()`` for every non-mse loss; 'no' matches the sibling
        PrefRewarder.train default and is backward-compatible for callers
        that pass edge explicitly.
    sorted_idxs : precomputed sort order, only used by the edge sampler
    """
    if train_groups is not None:
        ### in-topic: one dataset, features for the given groups
        features = readFeatures(self.feature_types, dataset, summaries,
                                groups, train_groups)
        assert features.shape[0] % len(train_groups) == 0
    else:
        ### cross-topic: stack per-dataset feature blocks row-wise
        features = None
        all_topics = []
        for dd in dataset:
            topics = sorted(
                set([gg.split('-')[1] for gg in groups if dd in gg]))
            all_topics.extend(topics)
            ff = readFeatures(self.feature_types, dd, summaries, groups,
                              topics)
            if features is None:
                features = np.copy(ff)
            else:
                features = np.append(features, ff, axis=0)
        assert features.shape[0] == targets.shape[0]
    ### initialise model: preference head for entropy loss, else regression
    if 'entropy' in self.loss_type:
        self.model = PrefModel(features.shape[1])
    else:
        self.model = RegModel(features.shape[1])
    ### select 15% of the groups as a dev set (by group index)
    dev_groups = []
    if train_groups is not None:
        avai_groups = train_groups
    else:
        avai_groups = all_topics
    while len(dev_groups) < 0.15 * len(avai_groups):
        gg = random.randint(0, len(avai_groups) - 1)
        if gg not in dev_groups:
            dev_groups.append(gg)
    train_features, train_targets, dev_features, dev_targets = \
        self.getTrainDevData(features, len(avai_groups), dev_groups, targets)
    ### start training; keep a weight snapshot per epoch and pick the one
    ### with the lowest dev loss at the end
    dev_results = []
    weights = []
    for epoch in range(int(self.epoch_num)):
        feature_list = []
        pref_list = []
        for batch in range(self.batch_size):
            if 'mse' in self.loss_type:
                break  # mse trains on raw features below, no pair sampling
            if 'no' in edge.lower():
                delta_feature, pref = self.randomPairSampler(
                    features, len(avai_groups), dev_groups, targets)
            else:
                delta_feature, pref = self.edgePairSampler(
                    features, len(avai_groups), dev_groups, targets,
                    sorted_idxs, edge)
            feature_list.append(delta_feature)
            pref_list.append(pref)
        if 'entropy' in self.loss_type:
            self.entropy_train(np.array(feature_list), np.array(pref_list),
                               lrate)
        elif 'hinge' in self.loss_type:
            self.hinge_train(np.array(feature_list), np.array(pref_list),
                             lrate)
        elif 'relative' in self.loss_type:
            self.rel_train(np.array(feature_list), np.array(pref_list),
                           lrate)
        else:
            assert 'mse' in self.loss_type
            # Plain mini-batch regression over the training features.
            pointer = 0
            while pointer < len(train_features):
                ff = train_features[
                    pointer:min(pointer + self.batch_size, len(train_features))]
                tt = train_targets[
                    pointer:min(pointer + self.batch_size, len(train_features))]
                self.mse_train(np.array(ff), np.array(tt), lrate)
                pointer += self.batch_size
        devr = self.evaluateOnDev(dev_groups, dev_features, dev_targets)
        dev_results.append(devr)
        print('epoch {}, loss {}'.format(epoch, devr))
        weights.append(copy.deepcopy(self.model.state_dict()))
    self.best_weights = weights[dev_results.index(min(dev_results))]
def cv(datasets, features, summaries, targets, topic_lists, gg,
       sentences_of_topics, sorted_idxs_list, learner_type='linear-pref',
       cv_fold_num=10, round=1e6, epoch=10, edge_sampling='no',
       validation_size=0.1, cnn_args={}):
    """Leave-one-dataset-out cross-validation of a reward learner.

    Each fold holds out one dataset in *datasets* for testing and trains on
    the rest; per-topic results are printed and the learnt rewards are
    returned as {test_dataset: rewards}.

    NOTE(review): `cnn_args={}` is a mutable default argument and `round`
    shadows the builtin — left unchanged here.
    NOTE(review): this function references `groups`, which is not a
    parameter or visible local — presumably a module-level global or a bug
    (`gg` looks like the intended name); confirm against the module.
    """
    print('\n---Data reading finished. Now cross-validation starts---\n')
    print('Features used: {}'.format(features))
    cv_cnt = 0
    # learnt rewards per held-out dataset, returned to the caller
    rewards_dic = {}
    if 'cnn' in learner_type:
        # Tokenise summaries and right-pad them to a common length.
        token2idx = read_duc_token2idx()
        summaries_tokens = actions_to_idx(summaries, gg, sentences_of_topics,
                                          token2idx)
        longest_summary = max([len(summary) for summary in summaries_tokens])
        summaries_tokens = np.array([
            summary + [0] * (longest_summary - len(summary))
            for summary in summaries_tokens
        ])
        if cnn_args['feature_count'] > 0:
            # Stack hand-crafted features for all datasets row-wise.
            feature_matrix = []
            for dd in datasets:
                # NOTE(review): `gg` is reused as the comprehension variable
                # here, shadowing the parameter, and `groups` is undefined in
                # this scope — verify against the module globals.
                topics = sorted(
                    set([gg.split('-')[1] for gg in groups if dd in gg]))
                ff = readFeatures(features, dd, summaries, groups, topics)
                # NOTE(review): `features` is the feature-type list and is
                # never None, so the np.copy branch never runs and the first
                # np.append gets an empty list — looks like the guard should
                # test `feature_matrix` instead.
                if features is None:
                    feature_matrix = np.copy(ff)
                else:
                    feature_matrix = np.append(feature_matrix, ff, axis=0)
    ### cross validation: fold ii holds out datasets[ii]
    for ii in range(len(datasets)):
        all_result_dic = OrderedDict()
        test_ds = datasets[ii]
        train_ds = np.array(datasets)[[
            i for i in range(len(datasets)) if i != ii
        ]]
        # Boolean masks over the group list: test rows belong to test_ds.
        test = np.array([test_ds in g for g in gg])
        train = np.array([not tt for tt in test])
        print('\n=====CV Fold {}, TRAIN {}, TEST {}====='.format(
            cv_cnt, train_ds, test_ds))
        if 'cnn' in learner_type:
            # Carve a validation split out of the training datasets' topics.
            # NOTE(review): `topic_list` is undefined — the parameter is
            # `topic_lists`; confirm which name the module actually defines.
            validation_groups = [
                '{}-{}'.format(datasets[i], topic)
                for i in range(len(datasets)) if i != ii
                for topic in topic_list[i][0:int(0.5 * validation_size *
                                                 len(topic_list[i]))]
            ]
            validation = np.array([g in validation_groups for g in gg])
            train = np.array(
                [not tt and not vv for tt, vv in zip(test, validation)])
            # CNN hyper-parameters, mostly taken from cnn_args.
            embedding_dim = 300
            filter_sizes = cnn_args['filter_sizes']
            filter_map_size = cnn_args['filter_count']
            max_out_of_filter_maps = 1
            max_out_of_all = cnn_args['final_max_pool']
            feature_count = cnn_args['feature_count']
            criteria = cnn_args['criteria']
            p = 0.5  # dropout probability
            pretrained_embedding = cnn_args[
                'pretrained_embedding'] if 'pretrained_embedding' in cnn_args else ""
            if feature_count > 0:
                feature_train = feature_matrix[train]
                feature_val = feature_matrix[validation]
            else:
                feature_train = None
                feature_val = None
            rewarder = cnn.CNNRewarder(
                len(token2idx) + 1, embedding_dim, filter_sizes,
                filter_map_size, max_out_of_filter_maps, max_out_of_all,
                feature_count, p, pretrained_embedding)
            # Dataset flavour depends on the training criterion.
            if criteria == 'mse':
                trainset = Dataset(summaries_tokens[train], targets[train],
                                   feature_train)
                validationset = Dataset(summaries_tokens[validation],
                                        targets[validation], feature_val)
            elif criteria == 'margin' or criteria == 'margin_rel' or criteria == 'cross_entropy':
                trainset = PairDataset(int(round), summaries_tokens[train],
                                       targets[train], groups[train],
                                       feature_train,
                                       sorted_idxs_list[train],
                                       cnn_args['sampling'])
                validationset = PairDataset(int(round * validation_size),
                                            summaries_tokens[validation],
                                            targets[validation],
                                            groups[validation], feature_val,
                                            sorted_idxs_list[validation],
                                            cnn_args['sampling'])
            elif criteria == 'warp':
                n = cnn_args['warp_samples']
                trainset = TupleDataset(n, round, summaries_tokens[train],
                                        targets[train], groups[train],
                                        feature_train)
                validationset = TupleDataset(n, int(round * validation_size),
                                             summaries_tokens[validation],
                                             targets[validation],
                                             groups[validation], feature_val)
            cnn.train(rewarder, trainset, validationset, epoches=epoch,
                      batch_size=200, criteria=criteria)
        elif 'reg' in learner_type:
            rewarder = LinearRegRewarder(features)
            rewarder.train(train_ds, summaries, groups, None, targets[train])
        elif 'pref' in learner_type:
            rewarder = PrefRewarder(features, round,
                                    learner_type.split('-')[0])
            rewarder.train(train_ds, summaries, groups, None, targets[train],
                           epoch)
        ### test on the held-out dataset
        if learner_type == 'cnn':
            test_features = torch.from_numpy(
                feature_matrix[test]) if feature_count > 0 else None
            learnt_rewards = cnn.predict(rewarder,
                                         torch.from_numpy(
                                             summaries_tokens[test]),
                                         features=test_features,
                                         use_best_model=True,
                                         batch_size=200,
                                         to_numpy=True)
        else:
            learnt_rewards = rewarder.predict(test_ds, summaries, groups,
                                              None)
        rewards_dic[test_ds] = learnt_rewards
        # Evaluate reward quality per topic of the held-out dataset.
        topics = topic_lists[ii]
        for it, tt in enumerate(topics):
            rr = getTopicReward(learnt_rewards, topics, tt)
            test_result = evaluateReward(
                list(rr), list(targets[[tt in gg for gg in groups]]), True)
            addResult(all_result_dic, test_result)
            print('---Test Results, {} TOPIC {}---'.format(test_ds, tt))
            for metric in test_result:
                print('{} : {}'.format(metric, test_result[metric]))
        print('\n===AVERAGE REWARD QUALITY FOR {}==='.format(test_ds))
        print('features {}, learner {}, sample pair num {}, edge {}, epoch {}'.
              format(features, learner_type, round, edge_sampling, epoch))
        print('cnn args {}'.format(cnn_args))
        for metric in all_result_dic:
            print('{}-mean : {}'.format(metric,
                                        np.mean(all_result_dic[metric])))
            print('{}-std: {}'.format(metric, np.std(all_result_dic[metric])))
    return rewards_dic
def cv(dataset, features, summaries, targets, groups, gg, sentences_of_topics,
       sorted_idxs_list, learner_type='linear-pref', cv_fold_num=2, round=1e6,
       epoch=10, edge_sampling='no', validation_size=0.1, cnn_args={}):
    """k-fold cross-validation of a reward learner within one dataset.

    The group list *gg* is split into *cv_fold_num* consecutive slices; each
    slice is held out for testing in turn.  Per-topic results are printed and
    the learnt rewards are returned as {test_group: rewards}.

    NOTE(review): `cnn_args={}` is a mutable default argument and `round`
    shadows the builtin — left unchanged here.
    NOTE(review): `feature_matrix` and `feature_count` are only defined on
    the 'cnn' path; the test loop guards on `learner_type == 'cnn'`, so
    other learners never touch them.
    """
    ### store all results across folds (metric -> list of values)
    all_test_reward_dic = OrderedDict()
    pointer = 0  # start index of the current test slice within gg
    print('\n---Data reading finished. Now cross-validation starts---\n')
    print('features {}, learner {}, sample pair num {}, edge {}, epoch {}'.
          format(features, learner_type, round, edge_sampling, epoch))
    print('cnn args {}'.format(cnn_args))
    cv_cnt = 0
    rewards_dic = {}
    if 'cnn' in learner_type:
        # Tokenise summaries and right-pad them to a common length.
        token2idx = read_duc_token2idx()
        summaries_tokens = actions_to_idx(summaries, groups,
                                          sentences_of_topics, token2idx)
        longest_summary = max([len(summary) for summary in summaries_tokens])
        summaries_tokens = np.array([
            summary + [0] * (longest_summary - len(summary))
            for summary in summaries_tokens
        ])
        if cnn_args['feature_count'] > 0:
            feature_matrix = readFeatures(features, dataset, summaries,
                                          groups, gg)
    ### cross validation over consecutive slices of gg
    for ii in range(cv_fold_num):
        test_groups = gg[int(pointer
                             ):min(int(pointer + len(gg) /
                                       float(cv_fold_num)), len(gg))]
        if 'cnn' in learner_type:
            # Reserve the first validation_size share of the training groups
            # as a validation split.
            train_groups = sorted(list(set(groups) - set(test_groups)))
            validation_groups = train_groups[0:int(validation_size *
                                                   len(train_groups))]
            train_groups = sorted(
                list(set(train_groups) - set(validation_groups)))
            validation = np.array([ele in validation_groups for ele in groups])
        else:
            train_groups = sorted(list(set(groups) - set(test_groups)))
        # Boolean row mask for training summaries.
        train = np.array([ele in train_groups for ele in groups])
        cv_cnt += 1
        pointer = (ii + 1) * len(gg) / float(cv_fold_num)
        print('\n=====CV Fold {}====='.format(cv_cnt))
        if 'cnn' in learner_type:
            # CNN hyper-parameters, mostly taken from cnn_args.
            embedding_dim = 300
            filter_sizes = cnn_args['filter_sizes']
            filter_map_size = cnn_args['filter_count']
            max_out_of_filter_maps = 1
            max_out_of_all = cnn_args['final_max_pool']
            feature_count = cnn_args['feature_count']
            criteria = cnn_args['criteria']
            pretrained_embedding = cnn_args[
                'pretrained_embedding'] if 'pretrained_embedding' in cnn_args else ""
            p = 0.5  # dropout probability
            ae_split = cnn_args['ae_split'] if 'ae_split' in cnn_args else 5
            if feature_count > 0:
                feature_train = feature_matrix[train]
                feature_val = feature_matrix[validation]
            else:
                feature_train = None
                feature_val = None
            rewarder = cnn.CNNRewarder(
                len(token2idx) + 1, embedding_dim, filter_sizes,
                filter_map_size, max_out_of_filter_maps, max_out_of_all,
                feature_count, p, pretrained_embedding)
            # Dataset flavour depends on the training criterion.
            if 'mse' in criteria:
                trainset = Dataset(summaries_tokens[train], targets[train],
                                   feature_train)
                validationset = Dataset(summaries_tokens[validation],
                                        targets[validation], feature_val)
            elif 'margin' in criteria or criteria == 'cross_entropy':
                trainset = PairDataset(int(round), summaries_tokens[train],
                                       targets[train], groups[train],
                                       feature_train,
                                       sorted_idxs_list[train],
                                       cnn_args['sampling'])
                validationset = PairDataset(int(round * validation_size),
                                            summaries_tokens[validation],
                                            targets[validation],
                                            groups[validation], feature_val,
                                            sorted_idxs_list[validation],
                                            cnn_args['sampling'])
            elif criteria == 'warp':
                n = cnn_args['warp_samples']
                trainset = TupleDataset(n, round, summaries_tokens[train],
                                        targets[train], groups[train],
                                        feature_train)
                validationset = TupleDataset(n, int(round * validation_size),
                                             summaries_tokens[validation],
                                             targets[validation],
                                             groups[validation], feature_val)
            cnn.train(rewarder, trainset, validationset, epoches=epoch,
                      batch_size=200, criteria=criteria, split=ae_split)
        elif 'reg' in learner_type:
            rewarder = LinearRegRewarder(features)
            rewarder.train(dataset, summaries, groups, train_groups,
                           targets[train])
        elif 'pref' in learner_type:
            rewarder = PrefRewarder(features, round,
                                    learner_type.split('-')[0])
            rewarder.train(dataset, summaries, groups, train_groups,
                           targets[train], epoch, edge_sampling,
                           sorted_idxs_list[train])
        ### test on each held-out group; record the learnt weights once per fold
        weights_added = False
        for tg in test_groups:
            test = np.array([ele == tg for ele in groups])
            if learner_type == 'cnn':
                test_features = torch.from_numpy(
                    feature_matrix[test]) if feature_count > 0 else None
                learnt_rewards = cnn.predict(rewarder,
                                             torch.from_numpy(
                                                 summaries_tokens[test]),
                                             features=test_features,
                                             use_best_model=True,
                                             batch_size=200,
                                             to_numpy=True)
            else:
                learnt_rewards = rewarder.predict(dataset, summaries, groups,
                                                  [tg])
            rewards_dic[tg] = learnt_rewards
            test_result = evaluateReward(list(learnt_rewards),
                                         list(targets[test]), True)
            if not weights_added:
                weights_added = True
                if learner_type == 'cnn':
                    test_result[
                        'weights'] = rewarder.combination.weight.data.numpy(
                        ).copy()
                elif 'reg' in learner_type:
                    test_result['weights'] = rewarder.lin_reg.coef_.copy()
                elif 'pref' in learner_type:
                    test_result['weights'] = rewarder.rank_learner.coef_.copy()
            #rmse,temp = plotAgreement(targets[test],learnt_rewards,plot=False,bin_num=bin_num)
            #test_result['rmse-bin{}'.format(bin_num)] = rmse
            #test_result['temperature-bin{}'.format(bin_num)] = temp
            addResult(all_test_reward_dic, test_result)
            print('---Test Results, TOPIC {}---'.format(tg))
            for metric in test_result:
                print('{} : {}'.format(metric, test_result[metric]))
    # Summary statistics over all folds.
    print('\n====={}, AVERAGE PERFORMANCE OVER {}-FOLD CV====='.format(
        dataset, cv_cnt))
    print('features {}, learner {}, sample pair num {}, edge {}, epoch {}'.
          format(features, learner_type, round, edge_sampling, epoch))
    print('cnn args {}'.format(cnn_args))
    print('---Test Results---')
    for metric in all_test_reward_dic:
        if metric == 'weights':
            # Weight vectors are averaged element-wise across folds.
            print('{} mean : {}'.format(
                metric, np.mean(all_test_reward_dic[metric], 0)))
            print('{} std : {}'.format(metric,
                                       np.std(all_test_reward_dic[metric],
                                              0)))
        else:
            print('{} mean : {}'.format(metric,
                                        np.mean(all_test_reward_dic[metric])))
            print('{} std : {}'.format(metric,
                                       np.std(all_test_reward_dic[metric])))
    return rewards_dic
# Evaluate a single feature type directly against the reference rewards,
# topic by topic, then print the metric averages over all topics.
reader = CorpusReader(PROCESSED_PATH)
data = reader.get_data(dataset)
### store all results (metric -> list of per-topic values)
all_test_reward_dic = OrderedDict()
topic_cnt = 0
feature_type = ['infersent_max']
### read data
for topic, docs, models in data:
    topic_cnt += 1
    summs, ref_values_dic = readSummaries(dataset, topic, 'rouge', sample_num)
    ref_rewards = aggregateScores(ref_values_dic)
    groups = [topic] * len(summs)
    features = readFeatures(feature_type, dataset, np.array(summs), groups,
                            [topic])
    # Flatten the single-feature column into a 1-D score list.
    features = features.reshape(1, -1)[0]
    rr = evaluateReward(list(features), ref_rewards, True)
    print('\n\n===TOPIC {}: {}==='.format(topic_cnt, topic))
    addResult(all_test_reward_dic, rr)
    for metric in rr:
        print('{}:\t{}'.format(metric, rr[metric]))
print('\n\n===TYPE {} AVERAGE OVER {} TOPICS==='.format(
    feature_type[0], topic_cnt))
for metric in all_test_reward_dic:
    # BUGFIX: previously printed rr[metric] (the LAST topic's result) under
    # the "AVERAGE" header; report the mean over all accumulated topics.
    print('{}:\t{}'.format(metric, np.mean(all_test_reward_dic[metric])))
def correlation(features, sizes):
    """Print pairwise Pearson correlations between feature columns.

    Reads all summaries of a fixed dataset, appends the reference ROUGE
    reward as an extra column, computes the PCC for every column pair, and
    prints the pairs whose correlation falls below -0.8.

    Parameters
    ----------
    features : list of feature-type names passed to readFeatures
    sizes : per-feature column counts; a size > 1 expands into numbered names
    """
    # Build one column name per feature dimension, plus the reward column.
    names = []
    for f, s in zip(features, sizes):
        if s > 1:
            for i in range(s):
                names.append(f + str(i))
        else:
            names.append(f)
    names.append('rouge_reward')
    dataset = 'DUC2001'  ## DUC2001, DUC2002, DUC2004
    sample_num = 9999
    ### read documents and ref. summaries
    reader = CorpusReader(PROCESSED_PATH)
    data = reader.get_data(dataset)
    topic_cnt = 0
    summaries = []
    groups = []
    models_list = []
    docs_list = []
    targets = []
    ### read data
    for topic, docs, models in data:
        print('read DATA {}, TOPIC {}'.format(dataset, topic))
        summs, ref_values_dic = readSummaries(dataset, topic, 'rouge',
                                              sample_num)
        # BUGFIX: report the number of summaries for THIS topic, not the
        # cumulative total before extending.
        print('num of summaries read: {}'.format(len(summs)))
        ref_rewards = aggregateScores(ref_values_dic)
        models_list.append(models)
        docs_list.append(docs)
        summaries.extend(summs)
        groups.extend([topic] * len(summs))
        targets.extend(ref_rewards)
        topic_cnt += 1
    allFeatures = readFeatures(features, dataset, np.array(summaries), groups,
                               set(groups))
    # Append the reference reward as the final column so it participates in
    # the pairwise correlation analysis.
    allFeatures = np.c_[allFeatures, np.array(targets)]
    correlations = {}
    threshold_correlation = {}
    for col1, col2 in itertools.combinations(range(len(names)), 2):
        pcc = pearsonr(allFeatures[:, col1], allFeatures[:, col2])[0]
        correlations[names[col1] + ' ' + names[col2] + ': pcc = '] = pcc
        # other way for ease of reading
        correlations[names[col2] + ' ' + names[col1] + ': pcc = '] = pcc
        if pcc < -0.8:
            threshold_correlation[names[col1] + ' ' + names[col2] +
                                  ': pcc = '] = pcc
            threshold_correlation[names[col2] + ' ' + names[col1] +
                                  ': pcc = '] = pcc
    #for key in sorted(correlations.keys()):
    #    print(key+str(correlations[key]))
    # BUGFIX: the header claimed "pcc >.9" but the collection condition is
    # pcc < -0.8; make the message match what is actually collected.
    print("Pairs with pcc < -0.8")
    for key in sorted(threshold_correlation.keys()):
        print(key + str(threshold_correlation[key]))
def train(self, dataset, summaries, groups, train_groups, targets, epoch=20,
          edge='no', sorted_idxs=None):
    """Train the pairwise rank learner on sampled preference pairs.

    Samples ``self.round`` preference pairs, refits the rank learner at
    ``epoch`` checkpoints, scores each checkpoint on a 15% dev split, and
    keeps the weights of the best checkpoint in ``self.best_weights``.

    Parameters
    ----------
    dataset : dataset name (in-topic) or iterable of names (cross-topic)
    summaries, groups : parallel summary/group structures for readFeatures
    train_groups : groups to train on, or None for cross-topic training
    targets : reference rewards aligned with the feature rows
    epoch : number of fit/evaluate checkpoints across self.round pairs
    edge : pair-sampling strategy; 'no' selects random pair sampling
    sorted_idxs : precomputed sort order, only used by the edge sampler
    """
    if train_groups is not None:
        features = readFeatures(self.feature_types, dataset, summaries,
                                groups, train_groups)
        assert features.shape[0] % len(train_groups) == 0
    else:
        # Cross-topic: stack per-dataset feature blocks row-wise.
        features = None
        all_topics = []
        for dd in dataset:
            topics = sorted(
                set([gg.split('-')[1] for gg in groups if dd in gg]))
            all_topics.extend(topics)
            ff = readFeatures(self.feature_types, dd, summaries, groups,
                              topics)
            if features is None:
                features = np.copy(ff)
            else:
                features = np.append(features, ff, axis=0)
        assert features.shape[0] == targets.shape[0]
    ### select 15% of the groups as a dev set (by group index)
    dev_groups = []
    if train_groups is not None:
        avai_groups = train_groups
    else:
        avai_groups = all_topics
    while len(dev_groups) < 0.15 * len(avai_groups):
        gg = random.randint(0, len(avai_groups) - 1)
        if gg not in dev_groups:
            dev_groups.append(gg)
    self.getDevResult(features, len(avai_groups), dev_groups, targets, True)
    cnt = 0
    feature_list = []
    pref_list = []
    dev_results = []
    weights = []
    # Checkpoint interval; a float, so for non-integral round/epoch ratios
    # the modulo test below may never fire — the fallback after the loop
    # covers that case.
    step = self.round / epoch
    while cnt < self.round:
        if 'no' in edge.lower():
            delta_feature, pref = self.randomPairSampler(
                features, len(avai_groups), dev_groups, targets)
        else:
            delta_feature, pref = self.edgePairSampler(
                features, len(avai_groups), dev_groups, targets, sorted_idxs,
                edge)
        feature_list.append(delta_feature)
        pref_list.append(pref)
        cnt += 1
        if (cnt) % step == 0:
            # Checkpoint: refit on all pairs so far and score on dev.
            self.rank_learner.fit(np.array(feature_list),
                                  np.array(pref_list))
            rr = self.getDevResult(features, len(avai_groups), dev_groups,
                                   targets, False)
            dev_results.append(rr)
            print('pair {}, ndcg at 10% {}'.format(cnt, rr))
            weights.append(self.rank_learner.coef_[0])
    if len(weights) == 0:
        # No checkpoint fired: fit once on everything.
        self.rank_learner.fit(np.array(feature_list), np.array(pref_list))
        weights.append(self.rank_learner.coef_[0])
        # BUGFIX: also record a dev score; previously dev_results stayed
        # empty and max() below raised ValueError.
        dev_results.append(
            self.getDevResult(features, len(avai_groups), dev_groups,
                              targets, False))
    self.best_weights = weights[dev_results.index(max(dev_results))]