def process_data(df):
    print('Formatting data...')
    data = {}
    n = len(df)  # total number of rows, used for progress reporting
    for idx, row in df.iterrows():
        # flush the accumulated documents every 10,000 rows
        if idx > 0 and idx % 10000 == 0:
            write_data(data)
            data = {}
            print('{} / {}'.format(idx, n))
        doc_id = '{}'.format(idx)
        a = {}
        text = row['ab']
        if not isinstance(text, str) or not text:
            continue
        a['title'] = row['ti']
        a['text'] = text
        a['sample_size'] = row['num_randomized']
        a['sents'] = utils.sent_tokenize(text)
        data[doc_id] = a
    return data
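# A minimal usage sketch for process_data above, assuming `df` is a pandas
# DataFrame with 'ab' (abstract), 'ti' (title) and 'num_randomized' columns.
# The CSV path and the JSON-lines write_data below are illustrative
# assumptions, not part of the original code.
import json

import pandas as pd


def write_data(batch, path='processed_batches.jsonl'):
    # placeholder writer: append one JSON object per flushed batch
    with open(path, 'a') as f:
        f.write(json.dumps(batch) + '\n')


if __name__ == '__main__':
    df = pd.read_csv('abstracts.csv')   # assumed input file
    remaining = process_data(df)        # returns the last, partially filled batch
    if remaining:
        write_data(remaining)           # flush documents not yet written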
def categorize_message(message):
    # The function name itself does not matter
    utterances = sent_tokenize(message.text)
    response = requests.post(url, json={'sentences': utterances}).json()
    reply = "\n".join([f"\"{u}\" - {r}" for u, r in zip(utterances, response)])
    logging.info(
        f"Chat id: {message.chat.id} | Utterances: {utterances} | Reply: {reply}"
    )
    bot.send_message(message.chat.id, reply)
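# A hedged sketch of how the handler above might be wired up, assuming the
# pyTelegramBotAPI (telebot) package, NLTK's sent_tokenize, and a classifier
# service listening at `url`. The token and endpoint values are placeholders,
# not taken from the original snippet.
import logging

import requests
import telebot
from nltk.tokenize import sent_tokenize

logging.basicConfig(level=logging.INFO)

url = "http://localhost:8000/classify"   # assumed classifier endpoint
bot = telebot.TeleBot("YOUR_BOT_TOKEN")   # placeholder bot token


@bot.message_handler(content_types=["text"])
def handle_text(message):
    # delegate every incoming text message to the categorizer above
    categorize_message(message)


if __name__ == "__main__":
    bot.polling()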
def process_t5():
    srcf = open(args.out_src, 'w')
    reff = open(args.out_ref, 'w')
    decf = open(args.out_dec, 'w')
    sent_tokenizer = utils.get_sent_tokenizer(tokenizer='spacy')
    for i, line in enumerate(open(args.infile, 'r')):
        doc = json.loads(line.strip())
        src = utils.sent_list_to_tagged_str(
            utils.sent_tokenize(doc['article'], sent_tokenizer))
        ref = utils.sent_list_to_tagged_str(
            utils.sent_tokenize(doc['reference'], sent_tokenizer))
        dec = utils.sent_list_to_tagged_str(
            utils.sent_tokenize(doc['decoded'], sent_tokenizer))
        srcf.write(src + '\n')
        reff.write(ref + '\n')
        decf.write(dec + '\n')
        print(f"processed line {i}", end='\r')
    srcf.close()
    reff.close()
    decf.close()
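# A hedged sketch of the `utils` helpers the script above relies on. Their real
# implementations are not shown in this file; the spaCy-based splitter and the
# "<t> ... </t>" sentence tagging below are assumptions about their behaviour,
# not the original definitions.
import spacy


def get_sent_tokenizer(tokenizer='spacy'):
    # assumed: returns a callable that splits raw text into sentence strings
    nlp = spacy.load('en_core_web_sm', disable=['ner', 'lemmatizer'])
    return lambda text: [s.text.strip() for s in nlp(text).sents]


def sent_tokenize(text, sent_tokenizer):
    # assumed: thin wrapper that applies the tokenizer returned above
    return sent_tokenizer(text)


def sent_list_to_tagged_str(sents):
    # assumed: wraps each sentence in tags so sentence boundaries survive
    # the flat one-line-per-document output format
    return ' '.join('<t> {} </t>'.format(s) for s in sents)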
def split_data(self, filename):
    self.load_data(filename)
    sub_dir = filename.split('-')[0]
    # create a subdirectory for Train and Dev data
    if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
        os.makedirs(os.path.join(self.data_dir, sub_dir))

    with open(os.path.join(self.data_dir, sub_dir, sub_dir + '.context'), 'w', encoding="utf-8") as context_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.sentence'), 'w', encoding="utf-8") as sentence_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.question'), 'w', encoding="utf-8") as question_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.answer'), 'w', encoding="utf-8") as answer_file:
        # loop over the data
        for article_id in tqdm.tqdm(range(len(self.data['data']))):
            list_paragraphs = self.data['data'][article_id]['paragraphs']
            # loop over the paragraphs
            for paragraph in list_paragraphs:
                context = paragraph['context']
                context = clean_text(context)
                context_tokens = word_tokenize(context)
                if config.paragraph and (
                        len(context_tokens) < config.min_len_context
                        or len(context_tokens) > config.max_len_context):
                    continue
                context_sentences = sent_tokenize(context)
                spans = convert_idx(context, context_tokens)
                num_tokens = 0
                first_token_sentence = []
                for sentence in context_sentences:
                    first_token_sentence.append(num_tokens)
                    num_tokens += len(sentence)
                qas = paragraph['qas']
                # loop over Q/A
                for qa in qas:
                    question = qa['question']
                    question = clean_text(question)
                    question_tokens = word_tokenize(question)
                    if (question_tokens[-1] != "?"
                            or len(question_tokens) < config.min_len_question
                            or len(question_tokens) > config.max_len_question):
                        continue
                    if sub_dir == "train":
                        # select only one ground truth, the top answer, if any answer
                        answer_ids = 1 if qa['answers'] else 0
                    else:
                        answer_ids = len(qa['answers'])
                    if answer_ids:
                        for answer_id in range(answer_ids):
                            answer = qa['answers'][answer_id]['text']
                            answer = clean_text(answer)
                            answer_tokens = word_tokenize(answer)
                            answer_start = qa['answers'][answer_id]['answer_start']
                            answer_stop = answer_start + len(answer)
                            # Getting spans of the answer in the context
                            answer_span = []
                            for idx, span in enumerate(spans):
                                if not (answer_stop <= span[0] or answer_start >= span[1]):
                                    answer_span.append(idx)
                            if not answer_span:
                                continue
                            # Getting the sentence where we have the answer
                            sentence_tokens = []
                            for idx, start in enumerate(first_token_sentence):
                                if answer_span[0] >= start:
                                    sentence_tokens = context_sentences[idx]
                                    answer_sentence_span = [
                                        span - start for span in answer_span
                                    ]
                                else:
                                    break
                            if not sentence_tokens:
                                print("Sentence cannot be found")
                                raise Exception()
                            # write to file
                            context_file.write(" ".join([
                                token + u"│" + "1" if idx in answer_span
                                else token + u"│" + "0"
                                for idx, token in enumerate(context_tokens)
                            ]) + "\n")
                            sentence_file.write(" ".join([
                                token + u"│" + "1" if idx in answer_sentence_span
                                else token + u"│" + "0"
                                for idx, token in enumerate(sentence_tokens)
                            ]) + "\n")
                            question_file.write(" ".join(question_tokens) + "\n")
                            answer_file.write(" ".join(answer_tokens) + "\n")
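# The preprocessing above depends on a `convert_idx` helper that is referenced
# but not defined in this snippet. In SQuAD-style pipelines this helper usually
# maps each word token back to its (start, end) character span in the context,
# which is exactly what the answer-span search expects. The version below is a
# sketch under that assumption, not the original code.
def convert_idx(text, tokens):
    current = 0
    spans = []
    for token in tokens:
        # locate the token in the raw text, continuing from the last match
        current = text.find(token, current)
        if current < 0:
            raise ValueError("Token {} not found in context".format(token))
        spans.append((current, current + len(token)))
        current += len(token)
    return spans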
def split_data(self, filename):
    self.load_data(filename)
    envs = ["train", "dev"]
    for sub_dir in envs:
        # create a subdirectory for Train and Dev data
        if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
            os.makedirs(os.path.join(self.data_dir, sub_dir))

        with open(os.path.join(self.data_dir, sub_dir, sub_dir + ".context"), "w", encoding="utf-8") as context_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + ".sentence"), "w", encoding="utf-8") as sentence_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + ".question"), "w", encoding="utf-8") as question_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + ".answer"), "w", encoding="utf-8") as answer_file:
            # loop over the data
            for article in tqdm.tqdm(self.data["data"]):
                context = article["text"]
                context_tokens = word_tokenize(context)
                context_sentences = sent_tokenize(context)
                if config.paragraph and (
                        len(context_tokens) < config.min_len_context
                        or len(context_tokens) > config.max_len_context):
                    continue
                spans = convert_idx(context, context_tokens)
                num_tokens = 0
                first_token_sentence = []
                for sentence in context_sentences:
                    first_token_sentence.append(num_tokens)
                    num_tokens += len(sentence)
                if not article["type"] == sub_dir:
                    continue
                for question in article["questions"]:
                    if question.get("isQuestionBad") == 0 and question["consensus"].get("s"):
                        q = question["q"].strip()
                        if (q[-1] != "?"
                                or len(q.split()) < config.min_len_question
                                or len(q.split()) > config.max_len_question):
                            continue
                        answer_start = question["consensus"]["s"]
                        answer = context[question["consensus"]["s"]:
                                         question["consensus"]["e"]].strip(".| ").strip("\n")
                        answer_stop = answer_start + len(answer)
                        # Getting spans of the answer in the context
                        answer_span = []
                        for idx, span in enumerate(spans):
                            if not (answer_stop <= span[0] or answer_start >= span[1]):
                                answer_span.append(idx)
                        if not answer_span:
                            continue
                        # Getting the sentence where we have the answer
                        sentence_tokens = []
                        for idx, start in enumerate(first_token_sentence):
                            if answer_span[0] >= start:
                                sentence_tokens = context_sentences[idx]
                                answer_sentence_span = [
                                    span - start for span in answer_span
                                ]
                            else:
                                break
                        # write to file
                        sent = []
                        for idx, token in enumerate(sentence_tokens):
                            if token.strip("\n").strip():
                                if idx in answer_sentence_span:
                                    sent.append(token + u"│" + "1")
                                else:
                                    sent.append(token + u"│" + "0")
                        sent = " ".join(sent).strip()
                        index = sent.find("(│0 CNN│0 )│0 --│0 ")
                        if index > -1:
                            sent = sent[index + len("(│0 CNN│0 )│0 --│0 "):]
                        ctxt = []
                        for idx, token in enumerate(context_tokens):
                            if token.strip("\n").strip():
                                if idx in answer_span:
                                    ctxt.append(token + u"│" + "1")
                                else:
                                    ctxt.append(token + u"│" + "0")
                        ctxt = " ".join(ctxt).strip()
                        index = ctxt.find("(│0 CNN│0 )│0 --│0 ")
                        if index > -1:
                            ctxt = ctxt[index + len("(│0 CNN│0 )│0 --│0 "):]
                        context_file.write(ctxt + "\n")
                        sentence_file.write(sent + "\n")
                        question_file.write(q + "\n")
                        answer_file.write(answer + "\n")
def split_sentence_question(self, filename, data_type):
    data = self.load_data(filename)
    with open(os.path.join(self.save_dir + data_type + '.sentence'), 'w', encoding="utf-8") as sentence_file,\
         open(os.path.join(self.save_dir + data_type + '.question'), 'w', encoding="utf-8") as question_file:
        articles = data
        for article in tqdm(articles):
            paragraphs = article['paragraphs']
            for paragraph in paragraphs:
                context = paragraph['context']
                context = clean_text(context)
                context_tokens = word_tokenize(context)
                context_sentences = sent_tokenize(context)
                spans = convert_idx(context, context_tokens)
                num_tokens = 0
                first_token_sentence = []
                for sentence in context_sentences:
                    first_token_sentence.append(num_tokens)
                    num_tokens += len(sentence)
                question_and_answer_list = paragraph['qas']
                for question_and_answer in question_and_answer_list:
                    question = question_and_answer['question']
                    question = clean_text(question)
                    question_tokens = word_tokenize(question)
                    if (len(question_tokens) > MAX_QUESTION_LENGTH
                            or len(question_tokens) < MIN_QUESTION_LENGHT):
                        continue
                    if not question_and_answer['answers']:
                        continue
                    answer = question_and_answer['answers'][0]
                    answer_text = answer['text']
                    answer_text = clean_text(answer_text)
                    answer_tokens = word_tokenize(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    answer_span = []
                    for idx, span in enumerate(spans):
                        if not (answer_stop <= span[0] or answer_start >= span[1]):
                            answer_span.append(idx)
                    if not answer_span:
                        continue
                    sentence_tokens = []
                    for idx, start in enumerate(first_token_sentence):
                        if answer_span[0] >= start:
                            sentence_tokens = context_sentences[idx]
                            answer_sentence_span = [
                                span - start for span in answer_span
                            ]
                        else:
                            break
                    if not sentence_tokens:
                        print("Sentence cannot be found")
                        raise Exception()
                    if (len(sentence_tokens) > MAX_SENTENCE_LENGTH
                            or len(sentence_tokens) < MIN_SENTENCE_LENGTH):
                        continue
                    sentence_file.write(" ".join([
                        token + u"│" + "1" if idx in answer_sentence_span
                        else token + u"│" + "0"
                        for idx, token in enumerate(sentence_tokens)
                    ]) + "\n")
                    question_file.write(" ".join(question_tokens) + "\n")
import sys

# Python 2 only: reload restores sys.setdefaultencoding, which site.py removes at startup
reload(sys)
sys.setdefaultencoding('utf-8')

# define the delimiter: U+0964 DEVANAGARI DANDA, the Hindi sentence terminator
danda_ = int("0964", 16)
delim = unichr(danda_)

# read in the data
f = open("data/eng-hin-modified.txt", "r+")
s = f.readlines()
f.close()

sentences = []
# tokenize the whole thing into sentences
for line in s[1:2000]:
    t_ = sent_tokenize(line, delim)
    t_ = [x for x in t_ if x != "\n"]
    sentences += t_

# tokenize the whole thing into words
words = []
for sent in sentences:
    tok_ = Tokenizer(sent)
    tok_.tokenize()
    words += tok_.tokens

unigrams = unigrammatize(words)
unigrams = freq_sorted_unigrams(unigrams)

#stopwords = []
for gram in unigrams:
    return utils.sent_list_to_tagged_str(sents)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-ref", required=True, help="path to ref file")
    parser.add_argument("-out", required=True, help="path to out file")
    parser.add_argument("-out_dir", required=True,
                        help="path to output dir: processed/bart")
    args = parser.parse_args()

    ref = utils.read_file(args.ref)
    out = utils.read_file(args.out)
    out = [line.split('\t')[1].strip() for line in out]
    assert len(ref) == len(out)

    ref = [tokenize(line) for line in ref]
    sent_tokenizer = utils.get_sent_tokenizer(tokenizer='spacy')
    out = [
        utils.sent_list_to_tagged_str(
            utils.sent_tokenize(line, sent_tokenizer)) for line in out
    ]
    assert len(out) == len(ref)

    with open(join(args.out_dir, "ref.txt"), 'w') as f:
        f.write('\n'.join(ref) + '\n')
    with open(join(args.out_dir, "out.txt"), 'w') as f:
        f.write('\n'.join(out) + '\n')