def load_sentences_for_topic_model(opt):
    '''
    Loads the dataset described by opt and returns a flat list of sentences
    to train a topic model on.
    :param opt: option dictionary passed through to load_data
    :return: list of sentences
    '''
    # load dataset in non-numerical (text) form
    from src.loaders.load_data import load_data
    data_dict = load_data(opt, numerical=False)
    left_texts = data_dict['T1']
    right_texts = data_dict['T2']

    def every_tenth(seq):
        # keep every 10th element (Semeval repeats each left sentence 10x)
        return [item for idx, item in enumerate(seq) if idx % 10 == 0]

    # select sentences from dataset (avoid duplication in Semeval)
    if opt['dataset'] == 'Semeval':
        assert opt['tasks'] == ['A', 'B', 'C']
        # combine data from all subtasks (A, B, C)
        a_left = every_tenth(left_texts[0])   # only once
        a_right = right_texts[0]
        b_left = every_tenth(left_texts[1])   # only once
        b_right = right_texts[1]
        # left side of subtask C equals left side of subtask B, so skip it
        c_right = right_texts[2]
        sentences = a_left + a_right + b_left + b_right + c_right
        print(len(sentences))
    else:
        sentences = [s for s in left_texts[0]] + [s for s in right_texts[0]]
    return sentences
def calculate_word_overlap_ratio(opt):
    '''
    Loads the dataset defined by opt, calculates the word overlap ratio for
    each sentence pair and returns a nested list with overlap ratios in each
    subset.  Also plots a histogram of the ratios per subset.
    :param opt: option dictionary to load dataset
    :return: nested list with overlap ratios (outer: subsets, inner: pairs)
    '''
    data_dict = load_data(opt, numerical=True)
    E1 = data_dict['E1']
    E2 = data_dict['E2']
    t = opt['tasks'][0]
    d = opt['dataset']
    colors = ['g', 'b', 'r']
    subset_overlap = []
    for n, s in enumerate(opt['subsets']):
        identical_per_pair = []
        for i in range(len(E1[n])):
            # sentence lengths excluding padding tokens.
            # BUGFIX: the right-hand length previously used `!= 2`, which is
            # inconsistent with the left side's `!= 1` and counted padding
            # tokens into total_len, skewing the ratio.  Both sides now use
            # the same id.  (assumes the padding id is 1 — TODO confirm
            # against the vocabulary produced by load_data)
            len_1 = (E1[n][i] != 1).sum()
            len_2 = (E2[n][i] != 1).sum()
            total_len = len_1 + len_2
            # token ids occurring in both sentences of the pair
            identical_ids = np.intersect1d(E1[n][i], E2[n][i])
            # NOTE(review): when both sentences are padded, the padding id
            # itself lands in identical_ids and inflates the overlap counts;
            # consider masking it out — behavior left unchanged here.
            left_overlap = np.isin(E1[n][i], identical_ids).sum()
            right_overlap = np.isin(E2[n][i], identical_ids).sum()
            overlap_ratio = (left_overlap + right_overlap) / total_len
            identical_per_pair.append(overlap_ratio)
        plt.hist(identical_per_pair, color=colors[n], alpha=0.5, bins=25,
                 label=s, range=[0, 1])
        subset_overlap.append(identical_per_pair)
    plt.legend(loc='upper right')
    plt.ylabel('Number of document pairs')
    plt.xlabel('Lexical overlap ratio')
    plt.title('Lexical overlap in {} task {}'.format(d, t))
    plt.show()
    plt.close()
    return subset_overlap
def read_original_data(opt, subset='dev'):
    '''
    Reads the original labelled file for one subset from the data directory
    and extracts pair_id, gold_label and both sentences.
    :param opt: option log
    :param subset: ['train','dev','test']
    :return: pandas dataframe with columns pair_id, gold_label, s1, s2
    '''
    # adjust filenames in case of increased training data
    if 'train_large' in opt['subsets']:
        print('adjusting names')
        if subset == 'dev':
            subset = 'test2016'
        elif subset == 'test':
            subset = 'test2017'
    # adjust loading options (note: mutates the caller's opt dict)
    opt['subsets'] = [subset]   # only this specific subset
    opt['load_ids'] = True      # with labels
    data_dict = load_data(opt, numerical=False)
    # unlist: only a single subset was requested
    left_ids = data_dict['ID1'][0]
    right_ids = data_dict['ID2'][0]
    left_sents = data_dict['R1'][0]
    right_sents = data_dict['R2'][0]
    gold_labels = data_dict['L'][0]
    # one record per pair: pair_id, gold_label, s1, s2
    records = [
        [i1 + '-' + i2, label, s1, s2]
        for i1, i2, label, s1, s2 in zip(left_ids, right_ids, gold_labels,
                                         left_sents, right_sents)
    ]
    cols = ['pair_id', 'gold_label', 's1', 's2']
    return pd.DataFrame.from_records(records, columns=cols)
def _load_pair_sentences(self, opt, dataset, subsets, key1, key2):
    '''
    Return the (left, right) pair-sentence lists for dataset, loading them
    via load_data and filling the instance caches (sentence1, sentence2,
    pair_ids, labels) on first use.
    :param opt: option dictionary passed to load_data
    :param dataset: cache key (already task-suffixed for Semeval)
    :param subsets: subset names, used to build per-subset pair ids
    :param key1: data_dict key for the left side ('R1' or 'T1')
    :param key2: data_dict key for the right side ('R2' or 'T2')
    :return: tuple (left, right) of nested lists, one inner list per subset
    '''
    if dataset not in self.sentence1.keys():
        data_dict = load_data(opt, numerical=False)
        left = data_dict[key1]
        right = data_dict[key2]
        # build 'id1-id2' pair ids for every subset
        pair_ids = []
        for s, _ in enumerate(subsets):
            pair_ids.append([
                i1 + '-' + i2 for i1, i2 in zip(
                    data_dict['ID1'][s], data_dict['ID2'][s])
            ])
        # NOTE(review): js-div caches 'T1'/'T2' under the same keys that
        # jaccard/dice use for 'R1'/'R2', so whichever metric runs first
        # determines what later metrics reuse (same as the original
        # duplicated code) — confirm this collision is intended.
        self.sentence1[dataset] = left
        self.sentence2[dataset] = right
        self.pair_ids[dataset] = pair_ids
        self.labels[dataset] = data_dict['L']
    return self.sentence1[dataset], self.sentence2[dataset]

def get_metric(self, distance_metric, dataset, task='', subset=None):
    '''
    Load calculated metric scores if existing, otherwise calculate.
    :param distance_metric: one of self.get_accepted_metrics()
    :param dataset: dataset name
    :param task: task name (suffixes the cache key for Semeval)
    :param subset: optional subset selector; None returns all subsets
    :return: nested list with distance/similarity scores depending on metric
             with outer length of subsets and inner length of example numbers
    '''
    assert distance_metric in self.get_accepted_metrics()
    subsets = self.get_subsets(dataset)  # always load all 3 subsets
    opt = {
        'dataset': dataset,
        'datapath': 'data/',
        'subsets': subsets,
        'tasks': [task],
        'n_gram_embd': False,
        'cache': True
    }
    if dataset == 'Semeval':
        dataset = '{}_{}'.format(dataset, task)
    # each branch: compute-and-cache on first request, then serve from cache
    if distance_metric == 'jaccard':
        if dataset not in self.jaccard.keys():
            R1, R2 = self._load_pair_sentences(opt, dataset, subsets,
                                               'R1', 'R2')
            self.jaccard[dataset] = calculate_jaccard_index(R1, R2)
        overlapping = self.jaccard[dataset]
    elif distance_metric == 'dice':
        if dataset not in self.dice.keys():
            R1, R2 = self._load_pair_sentences(opt, dataset, subsets,
                                               'R1', 'R2')
            self.dice[dataset] = calculate_dice_sim(R1, R2)
        overlapping = self.dice[dataset]
    elif distance_metric == 'js-div':
        if dataset not in self.js_div.keys():
            # js-div uses 'T1'/'T2' (presumably topic representations —
            # TODO confirm) rather than the raw 'R1'/'R2' texts
            T1, T2 = self._load_pair_sentences(opt, dataset, subsets,
                                               'T1', 'T2')
            self.js_div[dataset] = calculate_js_div(T1, T2)
        overlapping = self.js_div[dataset]
    else:
        # previously an accepted-but-unhandled metric fell through to a
        # NameError on 'overlapping'; fail explicitly instead
        raise ValueError('{} not accepted value for distance_metric'.format(
            distance_metric))
    if subset is None:
        return overlapping
    elif subset in ['train', 'p_train', 'train_large']:
        return overlapping[0]
    elif subset in ['dev', 'test2016']:
        return overlapping[1]
    elif subset in ['test', 'p_test', 'test2017']:
        return overlapping[-1]
    else:
        raise ValueError('{} not accepted value for subset'.format(subset))
examples = [ 441, # Po 89, # Pn 5874, # No 396 ] # Nn elif opt['dataset'] == 'Semeval': examples = [441, 89, 396] elif opt['dataset'] == 'MSRP': examples = [256] # No else: ValueError('Example ids for {} not defined'.format(opt['dataset'])) opt['max_m'] = max(examples) + 1 # prepare input to be passed through network data_dict = load_data(opt) print(data_dict['embd'].shape) subset = 1 # 0 for train, 1 for dev, 2 for test ID1 = data_dict['ID1'][subset] ID2 = data_dict['ID2'][subset] R1 = data_dict['R1'][subset] R2 = data_dict['R2'][subset] E1 = data_dict['E1'][subset] E2 = data_dict['E2'][subset] W_T1 = data_dict['W_T1'] W_T2 = data_dict['W_T2'] D_T1 = data_dict['D_T1'] D_T2 = data_dict['D_T2'] T1 = data_dict['T1'][subset] T2 = data_dict['T2'][subset] L = data_dict['L'][subset]
# 'subsets': ['train_large', 'test2016', 'test2017'], 'topic': 'word', 'topic_type': 'ldamallet', 'padding': False, 'simple_padding': True, 'max_length': 'minimum', 'unk_sub': False, 'lemmatize': False, 'datapath': 'data/', 'dataset': 'Quora', 'tasks': ['B'], 'topic_alpha': 1, 'num_topics': 90, 'threshold': 0.090, 'unk_topic': unk_topic, 'stem': stem } # print(opt) data_dict = load_data(opt, cache=True, write_vocab=False) opt = model(data_dict, opt, logfile='specific_topic_settings.json', print_dim=True) # todo: zeros - no stem # todo: zeros - stem # todo: uniform - stem # data_dict['embd']