def get_softcopy_ids(input, output, sent_length, topN, related_words_dict=None):
    """
    Label every output token with how it relates to the input text.

    Given input and output text, we get a padded soft copy target numpy
    array, optionally using a pre-calculated related_words_dict.

    :param input: input text; a str is parsed with NLP, anything else is
        iterated as tokens exposing ``text`` and ``lemma_``.
    :param output: output text, same accepted forms as ``input``.
    :param sent_length: length of the returned array; output tokens at
        positions >= sent_length are ignored.
    :param topN: number of semantically related words kept per input token.
    :param related_words_dict: optional mapping from token text to a dict
        with the keys "synonyms", "antonyms" and "semantic_related". When
        None, get_synonyms / get_antonyms / get_semantic_related_words are
        called for every input token instead.
    :return: float32 numpy array of shape [sent_length]. Per output token:
        1 = hard copy, 2 = shares a lemma with an input token, 3 = synonym,
        4 = antonym, 5 = semantically related, 0 = none of these / padding.
    :raises KeyError: if related_words_dict is given but lacks an input
        token's text.
    """
    if isinstance(input, str):
        input = NLP(input)
    if isinstance(output, str):
        output = NLP(output)

    input_tokens = {token.text.lower() for token in input}
    input_lemmas = {token.lemma_ for token in input}
    input_synonyms = set()
    input_antonyms = set()
    input_semantic_related = set()
    for token in input:
        if related_words_dict is not None:
            related = related_words_dict[token.text]
            synonyms = related["synonyms"]
            antonyms = related["antonyms"]
            semantic_related = related["semantic_related"][:topN]
        else:
            synonyms = get_synonyms(token.text)
            antonyms = get_antonyms(token.text)
            semantic_related = get_semantic_related_words(token.text, topN)
        # Grow the sets in place instead of rebuilding them per token.
        input_synonyms.update(synonyms)
        input_antonyms.update(antonyms)
        input_semantic_related.update(semantic_related)

    is_copy_padded = np.zeros([sent_length], dtype=np.float32)
    # Pairing with range(sent_length) bounds the index *before* any write,
    # so sent_length == 0 yields an empty array instead of an IndexError.
    for i, token in zip(range(sent_length), output):
        lowered = token.text.lower()
        if lowered in input_tokens:  # hard copy
            is_copy_padded[i] = 1
        elif token.lemma_ in input_lemmas:  # variant form
            is_copy_padded[i] = 2
        elif lowered in input_synonyms:  # synonym
            is_copy_padded[i] = 3
        elif lowered in input_antonyms:  # antonym
            is_copy_padded[i] = 4
        elif lowered in input_semantic_related or token.text in input_semantic_related:
            # semantic related (matched either lower-cased or as written)
            is_copy_padded[i] = 5
    return is_copy_padded
def text2tokens(sentence):
    """
    Tokenize text with NLP and return the text of every token.
    :param sentence: input text
    :return: list of token texts
    """
    return [tok.text for tok in NLP(sentence)]
def get_answer_ner_tag(context, answer_text, processed_by_spacy=False):
    """
    Return the label of the first entity whose text contains the answer.
    :param context: raw text, or an already parsed document when
        processed_by_spacy is True
    :param answer_text: answer string to look for in the entity texts
    :param processed_by_spacy: whether context is already parsed
    :return: the matching entity's label_, or 'UNK' when nothing matches
    """
    doc = context if processed_by_spacy else NLP(context)
    matching_labels = (
        ent.label_
        for ent in doc.ents
        if ent.text == answer_text or answer_text in ent.text
    )
    # Entities are scanned in order; the first hit wins.
    return next(matching_labels, 'UNK')
def get_content_ids(text, function_words_list, sent_length):
    """
    Build a fixed-length array marking which input tokens are content
    tokens, as reported by separate_content_function_words.
    :param text: raw string (parsed with NLP) or an already parsed document
    :param function_words_list: passed through to
        separate_content_function_words
    :param sent_length: length of the returned array
    :return: float32 numpy array of shape [sent_length]; flags beyond
        sent_length are dropped, missing positions stay 0
    """
    doc = NLP(text) if isinstance(text, str) else text
    content_flags = separate_content_function_words(doc, function_words_list)
    kept = min(len(content_flags), sent_length)
    padded = np.zeros([sent_length], dtype=np.float32)
    padded[:kept] = content_flags[:sent_length]
    return padded
def select_clues(context, answer, answer_bio_ids, max_dependency_distance, processed_by_spacy=False):
    """
    Select clue chunks given context and answer.

    Tokens outside the answer span that get_all_related lists for an answer
    token with a path no longer than max_dependency_distance are flagged as
    clue tokens (except a few POS tags); every maximal run of flagged tokens
    becomes one clue.

    :param context: raw text, or an already parsed document when
        processed_by_spacy is True
    :param answer: unused by this function; kept for interface compatibility
    :param answer_bio_ids: list of 'B' / 'I' / 'O' tags, one per token
    :param max_dependency_distance: maximum allowed len(path) between an
        answer token and a clue token
    :param processed_by_spacy: whether context is already parsed
    :return: list of {"clue_text": str, "clue_binary_ids": list of 0/1}
        dicts, one per contiguous clue chunk
    :raises ValueError: if answer_bio_ids contains no 'B' tag
    """
    answer_start = answer_bio_ids.index('B')
    try:
        offset_from_end = list(reversed(answer_bio_ids)).index('I')
    except ValueError:
        # No 'I' tag at all: the answer is the single 'B' token.
        answer_end = answer_start
    else:
        answer_end = len(answer_bio_ids) - 1 - offset_from_end

    doc = context if processed_by_spacy else NLP(context)
    doc_token_list = list(doc)
    idx2token, idx2related, context_tokens = get_all_related(doc, doc_token_list)

    # POS tags that are never accepted as clue tokens.
    excluded_pos = ('ADP', 'DET', 'ADV', 'PUNCT', 'PART')
    clue_flags = [0] * len(doc)
    for aid in range(answer_start, answer_end + 1):
        for tk_id, path in idx2related[aid]:
            outside_answer = tk_id < answer_start or tk_id > answer_end
            if outside_answer and len(path) <= max_dependency_distance:
                if idx2token[tk_id].pos_ not in excluded_pos:
                    clue_flags[tk_id] = 1

    # Turn every maximal run of flagged tokens [i, j) into one clue.
    clues = []
    num_tokens = len(clue_flags)
    i = 0
    while i < num_tokens:
        if clue_flags[i] == 0:
            i += 1
            continue
        j = i
        while j < num_tokens and clue_flags[j] == 1:
            j += 1
        clue_binary_ids = [0] * num_tokens
        clue_binary_ids[i:j] = [1] * (j - i)
        clues.append({
            "clue_text": ' '.join(context_tokens[i:j]),
            "clue_binary_ids": clue_binary_ids,
        })
        i = j
    return clues
def get_chunks(sentence, doc=None):
    """
    Input a sentence, output a list of its chunks
    (ner_tag, pos_tag, leaves_without_position, st, ed).
    Such as ('PERSON', 'NP', ['Beyoncé', 'Giselle', 'Knowles-Carter'], 0, 2).

    :param sentence: sentence text, handed to PARSER.parse and, when doc is
        None, to NLP
    :param doc: optional already parsed document for the sentence
    :return: tuple (chunklist, tree, doc); tree is None when parsing failed
    """
    tree = None
    try:
        # NOTICE: when sentence is too long, it will have error.
        tree = PARSER.parse(sentence)
    except Exception as e:
        # Best effort: report the failure and continue with tree = None.
        # NOTE(review): _navigate is then called with None - confirm it
        # copes with that.
        print(e)
    if doc is None:
        doc = NLP(sentence)
    _, _, orig_chunklist = _navigate(tree)

    chunklist = []
    # parse the result of _navigate
    for chunk in orig_chunklist:
        try:
            if chunk[0] == 'word':
                continue
            chunk_pos_tag, leaves = chunk
            leaves_without_position = []
            position_list = []
            for leaf in leaves:
                # Each leaf is encoded as "<word>___<token index>".
                parts = leaf.split('___')
                leaves_without_position.append(parts[0])
                position_list.append(int(parts[1]))
            st = position_list[0]
            ed = position_list[-1]
            chunk_ner_tag = "UNK"
            chunk_text = " ".join(leaves_without_position)
            # No early exit: when several entities contain the chunk text,
            # the last one determines the tag.
            for ent in doc.ents:
                if ent.text == chunk_text or chunk_text in ent.text:
                    chunk_ner_tag = ent.label_
            chunklist.append((chunk_ner_tag, chunk_pos_tag, leaves_without_position, st, ed))
        except Exception:
            # Skip malformed chunks, but no longer swallow KeyboardInterrupt
            # or SystemExit as the previous bare ``except:`` did.
            continue
    return chunklist, tree, doc
def select_answers(context, processed_by_spacy=False):
    """
    Input a context, we select which part of the input words belonging to
    the answer.

    :param context: raw text, or an already parsed document when
        processed_by_spacy is True
    :param processed_by_spacy: whether context is already parsed
    :return: list of (answer_text, char_st, char_ed, answer_bio_ids, label)
        tuples; answer_bio_ids holds one 'B' / 'I' / 'O' tag per token
    """
    tree = None
    try:
        # TODO: if the context is too long, it will cause error.
        tree = PARSER.parse(context)
    except Exception:
        # Best effort: continue with tree = None when parsing fails.
        pass
    doc = context if processed_by_spacy else NLP(context)
    token2idx, idx2token = get_token2char(doc)
    _, _, chunklist = _navigate(tree)
    answer_chunks = _post(chunklist)

    answers = []
    for chunk in answer_chunks:
        label, leaves, st, ed = chunk
        try:
            char_st, char_ed = str_find(context, leaves)
            if char_st < 0:
                continue
            # NOTE(review): answer_text is sliced with the offsets returned
            # by str_find, while the offsets returned to the caller are
            # re-derived from token2idx below - confirm they always agree.
            answer_text = context[char_st:char_ed + 1]
            st = idx2token[char_st]
            ed = idx2token[char_ed]
            answer_bio_ids = ['O'] * len(doc)
            answer_bio_ids[st:ed + 1] = ['I'] * (ed - st + 1)
            answer_bio_ids[st] = 'B'
            char_st = token2idx[st][0]
            char_ed = token2idx[ed][1]
        except Exception:
            # Skip chunks that cannot be aligned, but no longer swallow
            # KeyboardInterrupt or SystemExit as the bare ``except:`` did.
            continue
        answers.append((answer_text, char_st, char_ed, answer_bio_ids, label))
    return answers
def get_spacy_processed_examples(config, augmented_sentences, debug=False, debug_length=20, shuffle=False):
    """
    Attach spaCy results to augmented sentences and return them as examples.

    Each sentence dict is updated in place: "ans_sent_doc" is filled from
    NLP(s["context"]) when absent, and "ans_sent_tokens" is (re)computed
    from that document.

    :param config: unused by this function; kept for interface compatibility
    :param augmented_sentences: iterable of dicts with a "context" key
        (needed only when "ans_sent_doc" is missing)
    :param debug: when True, stop after debug_length sentences
    :param debug_length: number of sentences processed in debug mode
    :param shuffle: when True, shuffle the returned list
    :return: list of the processed sentence dicts
    """
    print("Start transform augmented sentences to spaCy processed examples...")
    start = datetime.now()
    examples = []
    for count, s in enumerate(tqdm(augmented_sentences), start=1):
        if "ans_sent_doc" not in s:
            s["ans_sent_doc"] = NLP(s["context"])
        s["ans_sent_tokens"] = [token.text for token in s["ans_sent_doc"]]
        examples.append(s)
        if debug and count >= debug_length:
            break
    if shuffle:
        # Shuffle what is returned; previously the *input* list was shuffled
        # and the returned examples kept their original order.
        random.shuffle(examples)
    print(("Time of get spaCy processed examples: {}").format(datetime.now() - start))
    print("Number of spaCy processed examples: ", len(examples))
    return examples
def get_spacy_processed_examples(config, raw_examples, debug=False, debug_length=20, shuffle=False):
    """
    Turn raw examples into spaCy processed examples.

    :param config: not referenced in this function
    :param raw_examples: iterable of dicts with the keys "ans_sent",
        "question", "answer_text" and "answer_start"
    :param debug: when True, stop after debug_length examples
    :param debug_length: number of examples processed in debug mode
    :param shuffle: when True, shuffle the resulting examples
    :return: tuple (examples, meta, eval_examples); meta counts the
        questions overall ("num_q") and per question type, eval_examples is
        returned as an empty list
    """
    print("Start transform raw examples to spaCy processed examples...")
    start = datetime.now()
    examples = []
    eval_examples = []
    meta = {"num_q": 0}
    meta.update((question_type, 0) for question_type in QUESTION_TYPES)

    for raw in tqdm(raw_examples):
        meta["num_q"] += 1

        # answer sentence
        ans_sent = normalize_text(raw["ans_sent"])
        ans_sent_doc = NLP(ans_sent)
        ans_sent_tokens = [tok.text for tok in ans_sent_doc]
        ans_sent_chars = [list(tok_text) for tok_text in ans_sent_tokens]
        spans = get_token_char_level_spans(ans_sent, ans_sent_tokens)
        ans_sent_syntactic_edges = get_dependency_tree_edges(ans_sent_doc)

        # question; notice: the <sos>/<eos> markers are important for QG
        ques = "<sos> " + normalize_text(raw["question"]) + " <eos>"
        ques_doc = NLP(ques)
        ques_tokens = [tok.text for tok in ques_doc]
        ques_chars = [list(tok_text) for tok_text in ques_tokens]
        ques_type, ques_type_id = get_question_type(raw["question"])
        meta[ques_type] += 1

        # answer: indices of all tokens whose char span overlaps
        # [answer_start, answer_end)
        answer_text = normalize_text(raw["answer_text"])
        answer_start = raw["answer_start"]
        answer_end = answer_start + len(answer_text)
        answer_span = [
            idx for idx, span in enumerate(spans)
            if answer_end > span[0] and answer_start < span[1]
        ]
        y1_in_sent = answer_span[0]
        y2_in_sent = answer_span[-1]
        answer_in_sent = " ".join(ans_sent_tokens[y1_in_sent:y2_in_sent + 1])

        examples.append({
            "question": ques,
            "ques_doc": ques_doc,
            "ques_tokens": ques_tokens,
            "ques_chars": ques_chars,
            "ques_type": ques_type,  # string type
            "ques_type_id": ques_type_id,  # type id number
            "ans_sent": ans_sent,
            "ans_sent_doc": ans_sent_doc,
            "ans_sent_tokens": ans_sent_tokens,
            "ans_sent_chars": ans_sent_chars,
            "ans_sent_syntactic_edges": ans_sent_syntactic_edges,
            "answer": answer_in_sent,
            "y1_in_sent": y1_in_sent,
            "y2_in_sent": y2_in_sent,
            "id": meta["num_q"],
        })

        if debug and meta["num_q"] >= debug_length:
            break

    if shuffle:
        random.shuffle(examples)

    elapsed = datetime.now() - start
    print(("Time of get spaCy processed examples: {}").format(elapsed))
    print("Number of spaCy processed examples: ", len(examples))
    return examples, meta, eval_examples
e = [token.head.i, token.i, token.dep_] edges.append(e) return edges if __name__ == "__main__": text = "<sos> How are you? <eos>" text2 = "Apple is how are you three" counters = {} counters["word"] = Counter() counters["char"] = Counter() counters["pos"] = Counter() counters["ner"] = Counter() counters["iob"] = Counter() counters["dep"] = Counter() spacy_doc = NLP(text) spacy_doc2 = NLP(text2) print(spacydoc2bpe(spacy_doc2)) for token in spacy_doc: counters["word"][token.text] += 1 for char in token.text: counters["char"][char] += 1 counters["pos"][token.tag_] += 1 counters["ner"][token.ent_type_] += 1 counters["iob"][token.ent_iob_] += 1 counters["dep"][token.dep_] += 1 emb_mats = {} emb_dicts = {} emb_mats["word"], emb_dicts["word"] = get_embedding(counters["word"], "word", vec_size=3)
def get_clue_info(question, sentence, answer, answer_start,
                  chunklist=None, y1_in_sent=None, doc=None, ques_doc=None, sent_limit=100):
    """
    Pick the sentence chunk that best overlaps with the question (the
    "clue") and describe it.

    Every chunk is scored by how many of its content lemmas / content
    tokens occur in the question; the first chunk with the highest positive
    score is selected.

    :param question: question text (parsed with NLP when ques_doc is None)
    :param sentence: answer sentence text (parsed when doc is None)
    :param answer: answer text; only its length is used, and only when
        y1_in_sent is None
    :param answer_start: character offset of the answer in sentence; only
        used when y1_in_sent is None
    :param chunklist: chunks as (ner_tag, pos_tag, words, st, ed) tuples;
        computed with get_chunks when None
    :param y1_in_sent: index of the first answer token; derived from
        answer_start when None
    :param doc: optional parsed sentence
    :param ques_doc: optional parsed question
    :param sent_limit: length of the returned clue indicator array
    :return: dict with the keys "clue_pos_tag", "clue_ner_tag",
        "clue_length", "clue_chunk", "clue_answer_dep_path_len" and
        "selected_clue_binary_ids_padded". Without a positively scored
        chunk these are "UNK", "UNK", 0, None, -1 and an all-zero array.
    """
    if doc is None:
        doc = NLP(sentence)
    if ques_doc is None:
        ques_doc = NLP(question)
    if chunklist is None:
        chunklist, _, _ = get_chunks(sentence, doc)

    sent_token_texts = [token.text for token in doc]
    ques_token_texts = [token.text for token in ques_doc]

    if y1_in_sent is None:
        spans = get_token_char_level_spans(sentence, sent_token_texts)
        answer_end = answer_start + len(answer)
        answer_span = [
            idx for idx, span in enumerate(spans)
            if not (answer_end <= span[0] or answer_start >= span[1])
        ]
        y1_in_sent = answer_span[0]
    # From here on answer_start is a token index, not a character offset.
    answer_start = y1_in_sent

    _, idx2related, _ = get_all_related(doc, list(doc))

    # Question-side lookups do not depend on the chunk: build them once and
    # use sets for the membership tests.
    ques_lower = " ".join(ques_token_texts).lower()
    ques_lemmas = {t.lemma_ for t in ques_doc}
    ques_tokens = {t.lower() for t in ques_token_texts}
    sent_lemmas = [t.lemma_ for t in doc]

    clue_rank_scores = []
    for chunk in chunklist:
        candidate_clue = chunk[2]  # list of chunk words
        candidate_clue_text = " ".join(candidate_clue).lower()
        is_content = [w.lower() not in FUNCTION_WORDS_LIST for w in candidate_clue]
        clue_lemmas = sent_lemmas[chunk[3]:chunk[4] + 1]
        # zip tolerates a length mismatch between the chunk words and the
        # lemma slice instead of raising IndexError.
        content_lemmas = [lemma for lemma, flag in zip(clue_lemmas, is_content) if flag]
        lemmas_in_ques = [lemma for lemma in clue_lemmas if lemma in ques_lemmas]
        content_lemmas_in_ques = [lemma for lemma in content_lemmas if lemma in ques_lemmas]
        tokens_in_ques = [w for w in candidate_clue if w.lower() in ques_tokens]
        content_tokens = [w for w, flag in zip(candidate_clue, is_content) if flag]
        content_tokens_in_ques = [w for w in content_tokens if w.lower() in ques_tokens]
        content_tokens_in_ques_soft = content_tokens_in_ques  # TODO: soft in.

        score = 0
        # A chunk only scores when it is fully covered by the question (by
        # lemma or by token), has a content word and starts acceptably.
        if ((len(lemmas_in_ques) == len(clue_lemmas)
             or len(tokens_in_ques) == len(candidate_clue))
                and any(is_content)
                and candidate_clue[0].lower() not in NOT_BEGIN_TOKENS_FOR_ANSWER_CLUE):
            score += len(content_lemmas_in_ques)
            score += len(content_tokens_in_ques)
            score += len(content_tokens_in_ques_soft)
            score += int(candidate_clue_text in ques_lower)
        clue_rank_scores.append(score)

    selected_clue_binary_ids_padded = np.zeros([sent_limit], dtype=np.float32)
    if len(clue_rank_scores) == 0 or max(clue_rank_scores) == 0:
        clue_chunk = None
        clue_pos_tag = "UNK"
        clue_ner_tag = "UNK"
        clue_length = 0
        clue_answer_dep_path_len = -1
    else:
        # list.index returns the first chunk holding the best score.
        clue_chunk = chunklist[clue_rank_scores.index(max(clue_rank_scores))]
        clue_ner_tag = clue_chunk[0]
        clue_pos_tag = clue_chunk[1]
        clue_start = clue_chunk[3]
        clue_end = clue_chunk[4]
        clue_length = clue_end - clue_start + 1
        # Fall back to the token distance unless a path from the answer
        # token to the clue start is listed (the last listed one wins).
        clue_answer_dep_path_len = abs(clue_start - answer_start)
        for tk_id, path in idx2related[answer_start]:
            if tk_id == clue_start:
                clue_answer_dep_path_len = len(path)
        # Clues reaching beyond sent_limit are not marked at all.
        if clue_start < sent_limit and clue_end < sent_limit:
            selected_clue_binary_ids_padded[clue_start:clue_end + 1] = 1

    clue_info = {
        "clue_pos_tag": clue_pos_tag,
        "clue_ner_tag": clue_ner_tag,
        "clue_length": clue_length,
        "clue_chunk": clue_chunk,
        "clue_answer_dep_path_len": clue_answer_dep_path_len,
        "selected_clue_binary_ids_padded": selected_clue_binary_ids_padded,
    }
    return clue_info
def get_processed_examples(raw_examples, debug=False, debug_length=20, shuffle=False):
    """
    Get a list of processed examples (paragraph, question, answer and clue)
    given raw examples.

    :param raw_examples: iterable of dicts with the keys "ans_sent",
        "question", "answer_text" and "answer_start"
    :param debug: when True, stop after debug_length kept examples
    :param debug_length: number of examples kept in debug mode
    :param shuffle: when True, shuffle the resulting examples
    :return: list of dicts with the keys "paragraph", "question",
        "ques_type", "answer", "answer_start", "clue", "clue_start" and
        "para_id". Examples whose clue cannot be located in the sentence
        text are skipped.
    """
    print("Start transform raw examples to processed examples...")
    start = datetime.now()
    examples = []
    meta = {"num_q": 0}
    num_spans_len_error = 0
    num_not_match_error = 0
    for e in tqdm(raw_examples):
        # paragraph info (here is sentence)
        ans_sent = normalize_text(e["ans_sent"])
        ans_sent_doc = NLP(ans_sent)
        ans_sent_tokens = [token.text for token in ans_sent_doc]
        spans = get_token_char_level_spans(ans_sent, ans_sent_tokens)

        # question info
        # NOTE: the question is not wrapped in "<sos>" / "<eos>" here.
        ques = normalize_text(e["question"])
        ques_doc = NLP(ques)
        ques_type, ques_type_id = get_question_type(e["question"])

        # answer info: tokens overlapping [answer_start, answer_end)
        answer_text = normalize_text(e["answer_text"])
        answer_start = e["answer_start"]
        answer_end = answer_start + len(answer_text)
        answer_span = [
            idx for idx, span in enumerate(spans)
            if not (answer_end <= span[0] or answer_start >= span[1])
        ]
        y1_in_sent, y2_in_sent = answer_span[0], answer_span[-1]
        answer_in_sent = " ".join(ans_sent_tokens[y1_in_sent:y2_in_sent + 1])

        # clue info
        clue_info = FQG_data_augmentor.get_clue_info(
            ques, ans_sent, answer_in_sent, None,
            chunklist=None, y1_in_sent=y1_in_sent,
            doc=ans_sent_doc, ques_doc=ques_doc, sent_limit=100)
        ans_sent_is_clue = clue_info["selected_clue_binary_ids_padded"]
        clue_token_position = np.where(ans_sent_is_clue == 1)[0]
        if clue_info["clue_chunk"] is not None:
            clue_tokenized_text = " ".join(clue_info["clue_chunk"][2])
        else:
            clue_tokenized_text = None

        if len(clue_token_position) > 0 and clue_tokenized_text is not None:
            clue_start_token_idx = clue_token_position[0]
            clue_end_token_idx = clue_token_position[-1]
            # !!! NOTICE: sometimes we have len(spans) <= clue_end_token_idx.
            # Need further debug...
            if len(spans) > clue_end_token_idx:
                clue_start = spans[clue_start_token_idx][0]
                clue_end = spans[clue_end_token_idx][1]  # [)
                clue_text = ans_sent[clue_start:clue_end]
            else:
                num_spans_len_error += 1
                clue_text = None
            if clue_text != clue_tokenized_text:
                if clue_text is not None:
                    num_not_match_error += 1
                # Fall back to locating the space-joined clue tokens in the
                # sentence text.
                clue_start = ans_sent.find(clue_tokenized_text)
                # str.find reports failure with -1; offset 0 is a valid hit
                # (clue at the very start of the sentence) and used to be
                # rejected by a ``> 0`` test.
                if clue_start >= 0:
                    clue_text = clue_tokenized_text
                else:
                    continue
        else:
            clue_text = None
            clue_start = None

        # final example
        example = {
            "paragraph": ans_sent,
            "question": ques,
            "ques_type": ques_type,  # string type
            "answer": answer_text,
            "answer_start": answer_start,
            "clue": clue_text,
            "clue_start": clue_start,
            "para_id": meta["num_q"],
        }
        examples.append(example)
        meta["num_q"] += 1

        if debug and meta["num_q"] >= debug_length:
            break

    if shuffle:
        random.shuffle(examples)

    print("num_not_match_error: ", num_not_match_error)
    print("num_spans_len_error: ", num_spans_len_error)
    print(("Time of get processed examples: {}").format(datetime.now() - start))
    print("Number of processed examples: ", len(examples))
    return examples
# Demo: print each sample sentence followed by the result of
# separate_content_function_words for it.
sentences = [
    "Mary has lived in England for ten years.",
    "He's going to fly to Chicago next week.",
    "I don't understand this chapter of the book.",
    "The children will be swimming in the ocean at five o'clock.",
    "John had eaten lunch before his colleague arrived.",
    "The best time to study is early in the morning or late in the evening.",
    "The trees along the river are beginning to blossom.",
    "Our friends called us yesterday and asked if we'd like to visit them next month.",
    "You'll be happy to know that she's decided to take the position.",
    "I won't give away your secret.",
]
for sentence in sentences:
    print(sentence)
    spacy_doc = NLP(sentence)
    content_flags = separate_content_function_words(spacy_doc, FUNCTION_WORDS_LIST)
    print(content_flags)
# Output recorded in the original source:
# [[2, 5]]
# Mary has lived in England for ten years.
# [1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
# He's going to fly to Chicago next week.
# [0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
# I don't understand this chapter of the book.
# [0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0.]
# The children will be swimming in the ocean at five o'clock.
# [1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0.]
# John had eaten lunch before his colleague arrived.
# [1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
# The best time to study is early in the morning or late in the evening.
# [1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1.]