def create_markable_for_coref_id_and_str(doc_obj, sent_obj, coref_id, ante_str):
    """Find the tokenized form of *ante_str* inside *sent_obj*'s word list and,
    if found, record the span as a gold antecedent markable.

    Parameters:
        doc_obj: document object, passed through to the tokenizer helper.
        sent_obj: sentence object exposing ``word_list`` and ``gold_markables``.
        coref_id: coreference-chain id stored on the new markable.
        ante_str: surface string of the antecedent to locate.

    Side effects:
        Appends one ``class_defs.markable`` to ``sent_obj.gold_markables``
        when a full match is found; does nothing otherwise.
    """
    tokenized_ante_str = spacy_get_tokenized_word(doc_obj, ante_str)
    max_len = len(sent_obj.word_list)
    len_of_ante_str = len(tokenized_ante_str)

    # Guard: an empty token sequence cannot match anything (the original
    # code would raise IndexError on tokenized_ante_str[0]).
    if len_of_ante_str == 0:
        return

    max_start_idx = -1
    max_end_idx = -1
    # Only consider start positions where the whole pattern still fits;
    # the original loop could index word_list past its end near the
    # sentence boundary and raise IndexError.
    for i in range(0, max_len - len_of_ante_str + 1):
        if sent_obj.word_list[i].word != tokenized_ante_str[0]:
            continue
        match = True
        for j in range(1, len_of_ante_str):
            if sent_obj.word_list[i + j].word != tokenized_ante_str[j]:
                match = False
                break
        if match:
            # First full occurrence wins; remember its inclusive span.
            max_start_idx = i
            max_end_idx = i + len_of_ante_str - 1
            break

    if (max_start_idx != -1) and (max_end_idx != -1):
        markable_obj = class_defs.markable(
            max_start_idx, max_end_idx, -1, -1,
            coref_id, class_defs.MARKABLE_FLAG_ANTECEDENT)
        sent_obj.gold_markables.append(markable_obj)
def take_care_of_missed_antecedents(doc_obj, sent_obj, sent_num):
    """Ensure every gold antecedent markable also appears in the sentence's
    own markable list.

    For each gold markable flagged as an antecedent: a markable with the
    same word span gets its flags promoted to antecedent; if no such span
    exists, a fresh copy is inserted in start-index order (or appended when
    the scan runs off the end of the list).
    """
    # Keep only the gold markables that carry the antecedent flag.
    antecedents = [m for m in sent_obj.gold_markables
                   if m.flags == class_defs.MARKABLE_FLAG_ANTECEDENT]
    if not antecedents:
        return  # nothing was missed

    for gold in antecedents:
        placed = False
        for pos, ours in enumerate(sent_obj.markables):
            if ours.w_s_idx == gold.w_s_idx and ours.w_e_idx == gold.w_e_idx:
                # Identical span already present — just make sure it is
                # flagged as an antecedent.
                if ours.flags != gold.flags:
                    ours.flags = class_defs.MARKABLE_FLAG_ANTECEDENT
                placed = True
                break
            if ours.w_s_idx > gold.w_s_idx:
                # We passed the sorted position without finding the span,
                # so insert a copy here to keep the list ordered.
                sent_obj.markables.insert(
                    pos,
                    class_defs.markable(gold.w_s_idx, gold.w_e_idx, -1, -1,
                                        gold.coref_id, gold.flags))
                placed = True
                break
        if not placed:
            # Span belongs after every existing markable.
            sent_obj.markables.append(
                class_defs.markable(gold.w_s_idx, gold.w_e_idx, -1, -1,
                                    gold.coref_id, gold.flags))
def spacy_compute_markable_table(sent_obj):
    """Build candidate markables from chunk/NER spans and noun-like POS tags.

    A span opened by a ``B-NP`` chunk tag or a ``B`` NER tag and extended by
    ``I-NP``/``I`` tags becomes one multi-word markable.  Outside a span,
    pronouns and nouns yield single-word markables.  The collected markables
    are dumped to stdout (debug output) and returned as a list.
    """
    words = sent_obj.word_list
    table = []
    span_start = -1
    span_end = -1

    for idx, token in enumerate(words):
        chunk = token.chunk_tag
        ner = token.NER_tag
        pos = token.pos_tag

        if (chunk == "B-NP" or ner == "B") and span_start == -1:
            # A new phrase/entity span begins on this token.
            span_start = idx
            span_end = idx
        elif chunk == "I-NP" or ner == "I":
            # Continuation tag: stretch the open span.
            span_end = idx
        else:
            # The span (if one was open) ended on the previous token.
            if span_start != -1:
                table.append(class_defs.markable(span_start, span_end,
                                                 -1, -1, 0, 0))
                span_start = -1
                span_end = -1
            # A noun-like or pronoun token outside any span is a
            # markable all by itself.
            if pos in ("PRP", "PRP$", "WP", "WP$",
                       "NN", "NNS", "NNP", "NNPS"):
                table.append(class_defs.markable(idx, idx, -1, -1, 0, 0))

    # Flush a span that ran to the end of the sentence.
    if span_start != -1:
        table.append(class_defs.markable(span_start, span_end, -1, -1, 0, 0))

    # Debug dump: one line per markable, listing its surface words.
    for marker in table:
        print("\n", end="")
        for i in range(marker.w_s_idx, marker.w_e_idx + 1):
            print(sent_obj.word_list[i].word, " ", end="")

    return table
def compute_markable_table(sent_obj):
    """Derive candidate markables for a sentence from chunk and POS tags.

    Pronouns and nouns outside any noun phrase (chunk tag ``"O"``) become
    single-word markables; each ``B-NP``(+``I-NP``...) chunk run becomes one
    multi-word markable.

    Returns:
        list of ``class_defs.markable`` objects in sentence order.
    """
    len_lst = len(sent_obj.word_list)
    markable_lst = []
    open_np = None  # currently-open noun-phrase markable, if any

    for i in range(len_lst):
        curr_word = sent_obj.word_list[i]
        pos_tag = curr_word.pos_tag
        np_tag = curr_word.chunk_tag

        # Pronoun or noun that is not part of any noun phrase: it is a
        # markable on its own.
        if np_tag == "O":
            if pos_tag in ("PRP", "PRP$", "WP", "WP$",
                           "NN", "NNS", "NNP", "NNPS"):
                markable_lst.append(class_defs.markable(i, i, -1, -1, 0, 0))
            continue

        # Inside a noun phrase.
        if np_tag == "B-NP":
            open_np = class_defs.markable(i, i, -1, -1, 0, 0)
        elif np_tag == "I-NP" and open_np is not None:
            # Guard: the original raised NameError on a stray I-NP with no
            # preceding B-NP in the sentence.
            open_np.w_e_idx = i

        # Close the span when the next token does not continue it.  The
        # original only closed on a following "O" tag, which silently
        # dropped an NP immediately followed by another "B-NP".
        if open_np is not None:
            at_end = (i == len_lst - 1)
            if at_end or sent_obj.word_list[i + 1].chunk_tag != "I-NP":
                markable_lst.append(open_np)
                open_np = None

    return markable_lst
def extract_markables_from_input_file(doc_obj, line_num, sent_tag_unrem, sent_tag_rem):
    """Scan a tag-bearing sentence for <COREF ID="X#"> ... </COREF> regions
    and record each as a gold antecedent markable on the sentence at
    *line_num*, with indices translated into the tag-removed token stream.

    Parameters:
        doc_obj: document object exposing ``sentences``.
        line_num: index of the sentence within ``doc_obj.sentences``.
        sent_tag_unrem: sentence text WITH SGML tags still present.
        sent_tag_rem: same sentence text with tags removed.
    """
    coref_id_string = ""
    antecedent = None
    # Both variants are tokenized so positions can be compared token-wise.
    sent_tag_unrem = nltk.word_tokenize(sent_tag_unrem)
    sent_tag_rem = nltk.word_tokenize(sent_tag_rem)
    index = 0
    extraction = False  # NOTE(review): never read after this — dead flag?
    begin_index = -1
    end_index = -1
    max_len = len(sent_tag_unrem)
    # Each completed coref contributes a fixed number of tag tokens that
    # must be subtracted when mapping into the tag-removed stream.
    number_of_completed_corefs = 0
    for index in range(0, max_len):
        tok = sent_tag_unrem[index]
        if (tok == "ID="):
            # Check if this is due to the <COREF ID=X#> opening tag.
            if (sent_tag_unrem[index - 2] == "<") and (sent_tag_unrem[index - 1] == "COREF") and (sent_tag_unrem[index + 1] == "''"):
                coref_id_string = sent_tag_unrem[index + 2]
                # NOTE(review): reassigning the loop variable of a
                # `for ... in range(...)` loop does NOT skip iterations in
                # Python — the next iteration resets `index`.  The intent
                # appears to be skipping past the rest of the tag; confirm
                # whether re-visiting those tokens is harmless here.
                index = index + 5
                begin_index = index
        elif (tok == "<") and (sent_tag_unrem[index + 1] == "/COREF"):
            # Closing tag found: tokens [begin_index, index) are the
            # antecedent text in the tag-bearing stream.
            antecedent = sent_tag_unrem[begin_index:index]
            # Translate into the tag-removed stream.  Per-tag token widths
            # (assumed from the tokenizer's output — TODO confirm):
            #   7 for <S ID= "X">
            #   7 for <COREF ID="X#">
            #   3 for </COREF>
            # so each completed coref removes 10 tokens (7 open + 3 close).
            begin_index = begin_index - (number_of_completed_corefs * 10) - 7 - 7
            end_index = index - (number_of_completed_corefs * 10) - 7 - 7 - 1
            create_markable_flag = True
            # Sanity check: the translated span must spell the same
            # antecedent in the tag-removed stream; otherwise skip it.
            if (antecedent != sent_tag_rem[begin_index:end_index + 1]):
                create_markable_flag = False
            # Create a markable_obj for a verified span.
            if (create_markable_flag == True):
                markable_obj = class_defs.markable(
                    begin_index, end_index, -1, -1,
                    coref_id_string, class_defs.MARKABLE_FLAG_ANTECEDENT)
                sent_obj = doc_obj.sentences[line_num]
                sent_obj.gold_markables.append(markable_obj)
            begin_index = -1
            number_of_completed_corefs += 1
def handle_key_file(doc_obj, kfp):
    """Read a coreference key file and attach gold anaphor markables to the
    matching sentences in *doc_obj*.

    The file alternates between "<COREF ID=..." header lines (which set the
    current coref id) and data lines of the form
    ``{sentence_num} {max string} {min string}``.  For each data line the
    max/min strings are located token-wise inside the sentence's word list
    and recorded as one anaphor markable.

    Parameters:
        doc_obj: document object exposing ``sentences``.
        kfp: open file-like object yielding key-file lines.
    """
    for line in kfp:
        line = line.strip('\n')
        # Pattern check: does the line carry a "<COREF ID=" header?
        if (len(line) < 2):
            continue
        if ("<COREF ID=" in line):
            tokens = nltk.word_tokenize(line)
            # NOTE(review): tokens[4] is assumed to be the id value based
            # on how nltk tokenizes this header — TODO confirm; also note
            # coref_id_string is unbound if a data line precedes the first
            # header line.
            coref_id_string = tokens[4]
        else:
            # Collect the three {...}-delimited fields from the data line.
            list_of_str = []
            extract = False  # NOTE(review): set but never read — chars
                             # outside braces still accumulate below.
            string_required = ""
            for i in range(0, len(line)):
                if (line[i] == "{"):
                    extract = True
                elif (line[i] == "}"):
                    extract = False
                    string_required = string_required.lstrip(' ')
                    list_of_str.append(string_required)
                    string_required = ""
                else:
                    string_required += line[i]
            # Now we have all the anaphor fields; tasks:
            #  1. Get the sentence from the doc.
            #  2. Tokenize the max and min strings.
            #  3. Scan the sentence word_list for the token span that
            #     matches each string.
            sentence_obj = doc_obj.sentences[int(list_of_str[0])]
            tokenized_max = nltk.word_tokenize(list_of_str[1])
            tokenized_min = nltk.word_tokenize(list_of_str[2])
            max_len = len(sentence_obj.word_list)
            match = False
            max_start_idx = -1
            max_end_idx = -1
            min_start_idx = -1
            min_end_idx = -1
            len_of_max_str = len(tokenized_max)
            # Locate the "max" span (first full occurrence wins).
            # NOTE(review): word_list[i + j] can index past the end when a
            # partial match starts near the sentence boundary — TODO confirm
            # inputs guarantee this cannot happen.
            for i in range(0, max_len):
                # Check if the first token matches.
                if sentence_obj.word_list[i].word == tokenized_max[0]:
                    match = True
                    for j in range(1, len_of_max_str):
                        if sentence_obj.word_list[i + j].word != tokenized_max[j]:
                            match = False
                            break
                    if (match == True):
                        # Max pattern found in the tokenized sentence.
                        max_start_idx = i
                        max_end_idx = i + len_of_max_str - 1
                        break
            len_of_min_str = len(tokenized_min)
            # Locate the "min" span the same way.
            for i in range(0, max_len):
                # Check if the first token matches.
                if sentence_obj.word_list[i].word == tokenized_min[0]:
                    match = True
                    for j in range(1, len_of_min_str):
                        if sentence_obj.word_list[i + j].word != tokenized_min[j]:
                            match = False
                            break
                    if (match == True):
                        # Min pattern found in the tokenized sentence.
                        min_start_idx = i
                        min_end_idx = i + len_of_min_str - 1
                        break
            # Record the anaphor markable (indices stay -1 when unmatched).
            markable_obj = class_defs.markable(
                max_start_idx, max_end_idx, min_start_idx, min_end_idx,
                coref_id_string, class_defs.MARKABLE_FLAG_ANAPHOR)
            sent_obj = doc_obj.sentences[int(list_of_str[0])]
            sent_obj.gold_markables.append(markable_obj)
# NOTE(review): dangling triple-quote below opens a string/comment block
# that continues past this excerpt — left untouched.
'''