def process_corpus(corpus_name, to_download=TO_DOWNLOAD, min_wc_source=MIN_WC_SOURCE, max_wc_source=MAX_WC_SOURCE, min_wc_target=MIN_WC_TARGET, max_wc_target=MAX_WC_TARGET, source_filter=SOURCE_FILTER, target_filter=TARGET_FILTER, text_cols=TEXT_COLS, data_dir=DATA_DIR): if to_download: corpus = Corpus(download(corpus_name, data_dir=data_dir)) else: corpus = Corpus(os.path.join(data_dir, corpus_name)) corpus_name = corpus.get_meta()['name'] print(corpus_name) corpus.print_summary_stats() print('processing', corpus.get_meta()['name']) corpus.load_info('utterance', ['parsed']) corpus = text_prep_pipe().transform(corpus) source_df, target_df = get_train_subset(corpus, min_wc_source, max_wc_source, min_wc_target, max_wc_target, source_filter, target_filter, text_cols) source_df.to_csv(os.path.join(data_dir, corpus_name + '.source.tsv'), sep='\t') target_df.to_csv(os.path.join(data_dir, corpus_name + '.target.tsv'), sep='\t')
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 27 23:20:11 2020

@author: kach

Export the 'conversations-gone-awry-corpus' utterances into two parallel,
line-aligned files:

  data/reviews.txt -- one utterance per line (embedded newlines flattened)
  data/labels.txt  -- '1' if the utterance is flagged as a personal attack,
                      '0' otherwise, aligned line-for-line with reviews.txt
"""
from convokit import Corpus, download

corpus = Corpus(filename=download("conversations-gone-awry-corpus"))
corpus.print_summary_stats()

# Use context managers so both files are flushed and closed even if
# iteration raises partway through (the original leaked open handles
# on error and relied on explicit close() calls).
with open("data/reviews.txt", "w", encoding="utf-8") as reviews, \
     open("data/labels.txt", "w") as label:
    for utt in corpus.iter_utterances():
        # Flatten newlines so each utterance occupies exactly one line,
        # keeping reviews.txt and labels.txt line-aligned.
        txt = str(utt.text).replace('\n', ' ')
        reviews.write(txt + '\n')
        l = '1' if utt.meta['comment_has_personal_attack'] else '0'
        label.write(l + '\n')
def get_examples(self, filename, ns_name, anserini_folder, sent_bert_model, loss, output_dir,
                 input_pair=True, eval_data=False, denoise_negatives=False, num_ns_for_denoising=100,
                 generative_model = 'facebook/blenderbot-3B', remove_cand_subsets=True,
                 last_utterance_only=False, use_external_corpus=False):
    """Build sentence-transformers InputExample objects for one data split,
    pairing each (context, relevant_response) row with negative samples.

    filename specifies which data split to use (train.csv, dev.csv, test.csv);
    it is read as a tab-separated file from self.dataset_folder.

    Parameters
    ----------
    ns_name : str
        Negative-sampling strategy: "random", "bm25", "sentence_transformer",
        or "generative". Evaluation data always uses the random sampler.
    anserini_folder : str
        Anserini install location, used only by the BM25 sampler.
    sent_bert_model : str
        Model name for the SentenceBERT sampler.
    loss : str
        Training loss name; controls example format ("ContrastiveLoss" /
        "OnlineContrastiveLoss" force single-pair examples) and whether
        relevant-document scores are collected ("MarginMSELoss").
    output_dir : str
        Where the MarginMSELoss score CSV is written.
    input_pair : bool
        If True, emit triplet examples [context, relevant, negative] with a
        margin label; if False, emit separate pos/neg pairs with 1.0/0.0 labels.
    denoise_negatives : bool
        Sample num_ns_for_denoising candidates but keep only the last 10
        (the hardest/lowest-scored tail — TODO confirm sampler ordering).
    remove_cand_subsets : bool
        Drop negatives whose text already appears inside the context.
    last_utterance_only : bool
        Reduce the context to its final utterance before sampling.
    use_external_corpus : bool
        Extend the candidate pool with utterances from external ConvoKit
        corpora (and switch BM25 to the expanded index).

    Returns
    -------
    list of InputExample

    Side effects: sets self.data and self.negative_sampler; logs a summary;
    may write a margin-score CSV for MarginMSELoss.
    """
    filepath = os.path.join(self.dataset_folder, filename)
    self.data = pd.read_csv(filepath, sep="\t")

    # Number of negatives to draw per query; oversample when denoising so
    # the tail can be kept after filtering.
    if denoise_negatives:
        num_ns = num_ns_for_denoising
    else:
        num_ns = 10

    # Candidate pool for negative sampling: all responses in this split...
    candidates = list(self.data["response"].values)

    # ...optionally extended with utterances from external ConvoKit corpora.
    if use_external_corpus:
        external_datasets = [ 'movie-corpus', 'wiki-corpus', 'subreddit-Ubuntu',
            'subreddit-microsoft', 'subreddit-apple', 'subreddit-Database', 'subreddit-DIY',
            'subreddit-electronics', 'subreddit-ENGLISH', 'subreddit-gis', 'subreddit-Physics',
            'subreddit-scifi', 'subreddit-statistics', 'subreddit-travel', 'subreddit-worldbuilding' ]
        for ds_name in external_datasets:
            corpus = Corpus(download(ds_name))
            corpus.print_summary_stats()
            for utt in corpus.iter_utterances():
                if utt.text != "":
                    candidates.append(utt.text)

    # Instantiate the negative sampler. Evaluation data always falls back to
    # random sampling regardless of ns_name.
    if ns_name == "random" or eval_data:
        self.negative_sampler = negative_sampling.RandomNegativeSampler(candidates, num_ns)
    elif ns_name == "bm25":
        index_folder = "/anserini_train_-1/"
        if use_external_corpus:
            # Point at the index built over the expanded candidate pool.
            index_folder = index_folder.replace("train", "train_expanded_")
        self.negative_sampler = negative_sampling.BM25NegativeSamplerPyserini(candidates, num_ns,
            self.dataset_folder+index_folder, -1, anserini_folder)
    elif ns_name == "sentence_transformer":
        self.negative_sampler = negative_sampling.SentenceBERTNegativeSampler(candidates, num_ns,
            self.dataset_folder+"/train_sentenceBERTembeds", -1, sent_bert_model,
            large_index=use_external_corpus)
    elif ns_name == "generative":
        self.negative_sampler = negative_sampling.GenerativeNegativeSamplerForDialogue(num_ns, generative_model)

    # MarginMSELoss needs the relevant document's score as well as the
    # negatives' scores to compute margins.
    if loss == 'MarginMSELoss':
        self.negative_sampler.score_relevant_docs = True

    # Contrastive losses consume single (anchor, candidate) pairs, not triplets.
    if loss == "ContrastiveLoss" and not eval_data:
        input_pair = False
    if loss == "OnlineContrastiveLoss" and not eval_data:
        input_pair = False

    examples = []
    scores_df = []  # per-example (rel_score - neg_score) margins, for MarginMSELoss

    # Code used to annotate some samples
    # samples_to_annotate = []
    # self.data = self.data.sample(200, random_state=42)
    # self.negative_sampler.score_relevant_docs = True

    count_ns_part_of_context = 0  # negatives discarded for appearing in the context
    for idx, row in enumerate(tqdm(self.data.itertuples(index=False), total=len(self.data))):
        context = row[0]
        if last_utterance_only:
            # Context format: turns separated by [TURN_SEP], utterances within a
            # turn by [UTTERANCE_SEP]; msdialog takes the first utterance of the
            # last turn, other datasets the second-to-last piece — TODO confirm
            # this indexing matches each dataset's separator layout.
            if 'msdialog' in self.dataset_folder:
                context = context.split("[TURN_SEP]")[-1].split("[UTTERANCE_SEP]")[0].strip()
            else:
                context = context.split("[TURN_SEP]")[-1].split("[UTTERANCE_SEP]")[-2].strip()
        relevant_response = row[1]

        # Pair mode: the positive becomes its own labeled example.
        if not input_pair:
            examples.append(InputExample(guid=filename+str(idx)+"_pos",
                texts=[context, relevant_response], label=1.0))

        # BM25 on training data takes extra arguments (query truncation, raw
        # scores, the relevant doc's id); other samplers use the default call.
        if ns_name == "bm25" and not eval_data:
            ns_candidates, ns_scores , _ , _ , rel_scores = self.negative_sampler.sample(
                context, [relevant_response], max_query_len = 512,
                normalize_scores = False, rel_doc_id = str(idx))
        else:
            ns_candidates, ns_scores , _ , _ , rel_scores = self.negative_sampler.sample(
                context, [relevant_response])
        rel_score = rel_scores[0]

        # Denoising: of the oversampled candidates keep only the last 10.
        if denoise_negatives:
            zipped = zip(ns_candidates[-10:], ns_scores[-10:])
        else:
            zipped = zip(ns_candidates, ns_scores)

        for ns, score_ns in zipped:
            # Skip "negatives" that are actually substrings of the context
            # (after stripping the agent marker) — they are likely not
            # true negatives.
            if remove_cand_subsets and ns.replace("<<<AGENT>>>: ", "") in context:
                count_ns_part_of_context+=1
            else:
                if input_pair:
                    # Triplet example labeled with the score margin.
                    examples.append(InputExample(texts=[context, relevant_response, ns],
                        label=float(rel_score-score_ns)))
                    scores_df.append(rel_score-score_ns)
                    # samples_to_annotate.append([self.dataset_folder.split("/")[-1], ns_name, context, relevant_response, ns, rel_score, score_ns])
                else:
                    examples.append(InputExample(guid=filename+str(idx)+"_neg",
                        texts=[context, ns], label=0.0))

    logging.info("{} {} count of ns which are part of the context: {} out of {}.".format(
        self.dataset_folder.split("/")[-1], ns_name, count_ns_part_of_context,
        len(examples)))
    # print(pd.DataFrame(scores_df).describe())

    # pd.DataFrame(samples_to_annotate, columns=['task', 'ns', 'context', 'rel_response', 'negative_sample', 'rel_score', 'score_negative']).\
    #     to_csv(output_dir+"neg_samples_{}_{}.csv".format(ns_name, self.dataset_folder.split("/")[-1]), index=False)

    # Persist the margins so MarginMSELoss training can inspect them.
    if loss == 'MarginMSELoss':
        pd.DataFrame(scores_df).to_csv(output_dir+"MarginScores_{}_{}.csv".format(
            ns_name, self.dataset_folder.split("/")[-1]))

    return examples