def get_info_from_file(file_name, MAX=2):
    pattern = re.compile(r"(\d+?)\ +(.+?)$")
    pattern_zp = re.compile(r"(\d+?)\.(\d+?)\-(\d+?)\ +(.+?)$")
    total = 0
    inline = "new"
    f = open(file_name)
    print("-----------------------------------------------------------------------------------------------------------------------------")
    print(file_name)
    old_2_new_id = {}
    sentence_num = 0
    nodes_info = {}
    candi = {}
    zps = []
    azps = []
    while True:
        line = f.readline()
        if not line:
            break
        line = line.strip()
        if line == "Leaves:":
            # "Leaves:" block: remap word indices so that [MASK] tokens are skipped.
            word_old_id = 0
            del_num = 0
            while True:
                inline = f.readline()
                if inline.strip() == "":
                    break
                inline = inline.strip()
                match = pattern.match(inline)
                if match:
                    word = match.groups()[1]
                    if word == '[MASK]':  # mjj, 12.16
                        del_num += 1
                    old_2_new_id[str(sentence_num) + "_" + str(word_old_id)] = word_old_id - del_num
                    word_old_id += 1
            sentence_num += 1
        elif line == "Tree:":
            # "Tree:" block: read the bracketed parse (one tree per block) and collect
            # NP candidates and zero pronouns.
            candi[sentence_num] = []
            nodes_info[sentence_num] = None
            parse_info = ""
            inline = f.readline()
            while True:
                inline = f.readline()
                if inline.strip("\n") == "":
                    break
                if inline.strip() == "":
                    continue
                parse_info = parse_info + " " + inline.strip()
            parse_info = parse_info.strip()  # one tree per block
            print(parse_info)
            parse_info = Tree.parse(parse_info, lower=False)  # mjj, 12.16: returns a tree
            print(str(parse_info))
            nl, wl = parse_analysis.buildTree(str(parse_info))
            nodes_info[sentence_num] = (nl, wl)
            for node in nl:
                if is_np(node.tag):
                    if node.parent.tag.startswith("NP"):
                        # keep only the leftmost NP child of an NP parent
                        if not (node == node.parent.child[0]):
                            continue
                    leaf_nodes = node.get_leaf()
                    if is_pro(leaf_nodes):
                        continue
                    if is_zero_tag(leaf_nodes):
                        continue
                    candi[sentence_num].append((leaf_nodes[0].index, leaf_nodes[-1].index))
                    total += 1
            for node in wl:
                if node.word == "*pro*":
                    zps.append((sentence_num, node.index))
        elif line.startswith("Coreference chain"):
            # "Coreference chain" block: link each anaphoric *pro* to the overt
            # mentions seen earlier in the same chain.
            first = True
            res_info = None
            last_index = 0
            antecedents = []
            while True:
                inline = f.readline()
                if not inline:
                    break
                if inline.startswith("----------------------------------------------------------------------------------"):
                    break
                inline = inline.strip()
                if len(inline) <= 0:
                    continue
                if inline.startswith("Chain"):
                    first = True
                    res_info = None
                    last_index = 0
                    antecedents = []
                    coref_id = inline.strip().split(" ")[1]
                else:
                    match = pattern_zp.match(inline)
                    if match:
                        sentence_index = int(match.groups()[0])
                        begin_word_index = int(match.groups()[1])  # mjj, 12.16
                        begin_word_index = old_2_new_id[str(sentence_index) + "_" + str(begin_word_index)]
                        end_word_index = int(match.groups()[2])
                        end_word_index = old_2_new_id[str(sentence_index) + "_" + str(end_word_index)]
                        word = match.groups()[-1]
                        if word == "*pro*":
                            is_azp = False
                            if not first:
                                is_azp = True
                                azps.append((sentence_index, begin_word_index, antecedents, coref_id))
                        if not word == "*pro*":
                            first = False
                            res_info = inline
                            last_index = sentence_index
                            antecedents.append((sentence_index, begin_word_index, end_word_index, coref_id))
            if not inline:
                break
    return zps, azps, candi, nodes_info
def get_info_from_file(file_name, MAX=2):
    pattern = re.compile(r"(\d+?)\ +(.+?)$")
    pattern_zp = re.compile(r"(\d+?)\.(\d+?)\-(\d+?)\ +(.+?)$")
    inline = "new"
    f = open(file_name)
    doc = document.Doc()
    sentence_num = 0
    while True:
        line = f.readline()
        if not line:
            break
        line = line.strip()
        if line == "Leaves:":
            # "Leaves:" block: only used to advance the sentence counter.
            while True:
                inline = f.readline()
                if inline.strip() == "":
                    break
                inline = inline.strip()
                match = pattern.match(inline)
                if match:
                    word = match.groups()[1]
            sentence_num += 1
        elif line == "Tree:":
            doc.init_sentence(sentence_num)
            parse_info = ""
            inline = f.readline()
            while True:
                inline = f.readline()
                if inline.strip("\n") == "":
                    break
                parse_info = parse_info + " " + inline.strip()
            parse_info = parse_info.strip()
            nl, wl = parse_analysis.buildTree(parse_info)
            # Map original word positions (including null elements such as *pro*)
            # onto positions over overt words only.
            index_without_null = 0
            all_words_wl = []
            for node in wl:
                if node.word.find("*") < 0:  # an overt word, not a null element
                    new_node = parse_analysis.Node()
                    new_node.copy_from(node)
                    new_node.index = index_without_null
                    doc.index2real[sentence_num][node.index] = new_node.index
                    index_without_null += 1
                    all_words_wl.append(new_node)
                else:
                    doc.index2real[sentence_num][node.index] = index_without_null
                    if node.word == "*pro*":
                        doc.add_zp(sentence_num, doc.index2real[sentence_num][node.index])
            doc.nodes[sentence_num] = wl
            doc.filter_nodes[sentence_num] = all_words_wl
            for node in nl:
                if is_np(node.tag):
                    if node.parent.tag.startswith("NP"):
                        if not (node == node.parent.child[0]):
                            continue
                    leaf_nodes = node.get_leaf()
                    if is_zero_tag(leaf_nodes):
                        continue
                    doc.add_np(sentence_num,
                               doc.index2real[sentence_num][leaf_nodes[0].index],
                               doc.index2real[sentence_num][leaf_nodes[-1].index])
        elif line.startswith("Coreference chain"):
            first = True
            res_info = None
            last_index = 0
            antecedents = []
            while True:
                inline = f.readline()
                if not inline:
                    break
                if inline.startswith("----------------------------------------------------------------------------------"):
                    break
                inline = inline.strip()
                if len(inline) <= 0:
                    continue
                if inline.startswith("Chain"):
                    first = True
                    res_info = None
                    last_index = 0
                    antecedents = []
                    coref_id = inline.strip().split(" ")[1]
                else:
                    match = pattern_zp.match(inline)
                    if match:
                        sentence_index = int(match.groups()[0])
                        begin_word_index = int(match.groups()[1])
                        end_word_index = int(match.groups()[2])
                        word = match.groups()[-1]
                        if word == "*pro*":
                            is_azp = False
                            if not first:
                                is_azp = True
                                key = (sentence_index, doc.index2real[sentence_index][begin_word_index])
                                if key in doc.zp_dict:
                                    this_zp = doc.zp_dict[key]
                                    this_zp.set_azp(coref_id)
                                    this_zp.set_antecedent(antecedents)
                        if not word == "*pro*":
                            first = False
                            res_info = inline
                            last_index = sentence_index
                            key = (sentence_index,
                                   doc.index2real[sentence_index][begin_word_index],
                                   doc.index2real[sentence_index][end_word_index])
                            if key in doc.np_dict:
                                this_np = doc.np_dict[key]
                                this_np.coref_id = coref_id
                                antecedents.append(this_np)
            if not inline:
                break
    doc.update()
    return doc
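
# A minimal usage sketch (an assumption, not part of the original pipeline): inspect the
# Doc object returned by the function above.  Only attributes that appear in the code
# above (filter_nodes, zp_dict, np_dict) are used; the summary format is illustrative.
def summarize_doc(doc):
    num_sentences = len(doc.filter_nodes)  # sentences with an initialized word list
    num_zps = len(doc.zp_dict)             # entries presumably filled by add_zp
    num_nps = len(doc.np_dict)             # entries presumably filled by add_np
    print("sentences: %d  zero pronouns: %d  NP candidates: %d"
          % (num_sentences, num_zps, num_nps))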
def get_info_from_file(file_name):
    pattern = re.compile(r"(\d+?)\ +(.+?)$")
    pattern_zp = re.compile(r"(\d+?)\.(\d+?)\-(\d+?)\ +(.+?)$")
    total = 0
    inline = "new"
    f = open(file_name, encoding='utf-8')
    sentence_num = 0
    nodes_info = {}
    candi = {}
    zps = []
    azps = []
    while True:
        line = f.readline()
        if not line:
            break
        line = line.strip()
        if line == "Leaves:":
            # "Leaves:" block: only used to advance the sentence counter.
            while True:
                inline = f.readline()
                if inline.strip() == "":
                    break
                inline = inline.strip()
                match = pattern.match(inline)
                if match:
                    word = match.groups()[1]
            sentence_num += 1
        elif line == "Tree:":
            candi[sentence_num] = []
            nodes_info[sentence_num] = None
            parse_info = ""
            inline = f.readline()
            while True:
                inline = f.readline()
                if inline.strip("\n") == "":
                    break
                parse_info = parse_info + " " + inline.strip()
            parse_info = parse_info.strip()
            nl, wl = parse_analysis.buildTree(parse_info)
            nodes_info[sentence_num] = (nl, wl)
            for node in nl:
                if is_np(node.tag):
                    if node.parent.tag.startswith("NP"):
                        if not (node == node.parent.child[0]):
                            continue
                    leaf_nodes = node.get_leaf()
                    if is_pro(leaf_nodes):
                        continue
                    if is_zero_tag(leaf_nodes):
                        continue
                    candi[sentence_num].append((leaf_nodes[0].index, leaf_nodes[-1].index))
                    total += 1
            for node in wl:
                if node.word == "*pro*":
                    zps.append((sentence_num, node.index))
        elif line.startswith("Coreference chain"):
            first = True
            antecedents = []
            while True:
                inline = f.readline()
                if not inline:
                    break
                if inline.startswith("----------------------------------------------------------------------------------"):
                    break
                inline = inline.strip()
                if len(inline) <= 0:
                    continue
                if inline.startswith("Chain"):
                    first = True
                    antecedents = []
                    coref_id = inline.strip().split(" ")[1]
                else:
                    match = pattern_zp.match(inline)
                    if match:
                        sentence_index = int(match.groups()[0])
                        begin_word_index = int(match.groups()[1])
                        end_word_index = int(match.groups()[2])
                        word = match.groups()[-1]
                        if word == "*pro*":
                            if not first:
                                azps.append((sentence_index, begin_word_index, end_word_index,
                                             antecedents, coref_id, 1))
                        if not word == "*pro*":
                            first = False
                            antecedents.append((sentence_index, begin_word_index, end_word_index, coref_id))
                        # if not first:
                        #     azps.append((sentence_index, begin_word_index, end_word_index, antecedents, coref_id, 0))
            if not inline:
                break
    return zps, azps, candi, nodes_info
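
# A minimal usage sketch (assumption, not from the original code): walk a directory of
# OntoNotes-style annotation files and aggregate the zero pronouns and anaphoric zero
# pronouns returned by get_info_from_file above.  The directory layout and the ".onf"
# file filter are hypothetical; adapt them to however the data is actually listed.
def collect_zp_data(data_dir, suffix=".onf"):
    import os
    all_zps, all_azps = [], []
    for root, _, files in os.walk(data_dir):
        for name in files:
            if not name.endswith(suffix):
                continue
            zps, azps, candi, nodes_info = get_info_from_file(os.path.join(root, name))
            all_zps.extend(zps)    # items: (sentence_index, zp_index)
            all_azps.extend(azps)  # items: (sent, begin, end, antecedents, coref_id, 1)
    return all_zps, all_azps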
def get_info_from_file_system(file_name, parser_in, parser_out, MAX=2):
    pattern = re.compile(r"(\d+?)\ +(.+?)$")
    pattern_zp = re.compile(r"(\d+?)\.(\d+?)\-(\d+?)\ +(.+?)$")
    total = 0
    inline = "new"
    f = open(file_name)
    sentence_num = 0
    '''
    ################################################################################
    # nodes_info: (dict) nl and wl of each sentence, keyed by sentence_index       #
    # ------------- nodes_info[sentence_index] = (nl, wl)                          #
    # candi: (dict) candidate NPs of each sentence, keyed by sentence_index        #
    # ------------- candi[sentence_index] = list of (begin_index, end_index)       #
    # zps: (list) every zero pronoun in the file                                   #
    # ------------- item: (sentence_index, zp_index)                               #
    # azps: (list) every zero pronoun entry found in a coreference chain           #
    # ------------- item: (sentence_index, zp_index, antecedents=[], is_azp)       #
    # ------------- antecedent: (sentence_index, begin_word_index, end_word_index) #
    ################################################################################
    '''
    nodes_info = {}
    candi = {}
    zps = []
    azps = []
    while True:
        line = f.readline()
        if not line:
            break
        line = line.strip()
        if line == "Leaves:":
            while True:
                inline = f.readline()
                if inline.strip() == "":
                    break
                inline = inline.strip()
                match = pattern.match(inline)
                if match:
                    word = match.groups()[1]
                    #if word == "*pro*":
                    #    print word
                    #if word.find("*") < 0:
                    #    print word
            sentence_num += 1
        elif line == "Tree:":
            candi[sentence_num] = []
            nodes_info[sentence_num] = None
            parse_info = ""
            inline = f.readline()
            while True:
                inline = f.readline()
                if inline.strip("\n") == "":
                    break
                parse_info = parse_info + " " + inline.strip()
            parse_info = parse_info.strip()
            # Build the gold tree first, then feed its words to the external parser
            # and re-build the tree from the system (automatic) parse.
            nl, wl = parse_analysis.buildTree(parse_info)
            pw = []
            for word in wl:
                pw.append(word.word)
            parser_in.write(" ".join(pw) + "\n")
            parse_info = parser_out.readline().strip()
            parse_info = "(TOP" + parse_info[1:-1] + ")"
            nl, wl = parse_analysis.buildTree(parse_info)
            nodes_info[sentence_num] = (nl, wl)
            for node in nl:
                if (node.tag.find("NP") >= 0) and (node.tag.find("DNP") < 0):
                    if (node.parent.tag.find("NP") >= 0) and (node.parent.tag.find("DNP") < 0):
                        if not (node == node.parent.child[0]):
                            continue
                    leaf_nodes = node.get_leaf()
                    if is_pro(leaf_nodes):
                        continue
                    candi[sentence_num].append((leaf_nodes[0].index, leaf_nodes[-1].index))
                    total += 1
            for node in wl:
                if node.word == "*pro*":
                    zps.append((sentence_num, node.index))
        elif line.startswith("Coreference chain"):
            first = True
            res_info = None
            last_index = 0
            antecedents = []
            while True:
                inline = f.readline()
                if not inline:
                    break
                if inline.startswith("----------------------------------------------------------------------------------"):
                    break
                inline = inline.strip()
                if len(inline) <= 0:
                    continue
                if inline.startswith("Chain"):
                    first = True
                    res_info = None
                    last_index = 0
                    antecedents = []
                else:
                    match = pattern_zp.match(inline)
                    if match:
                        sentence_index = int(match.groups()[0])
                        begin_word_index = int(match.groups()[1])
                        end_word_index = int(match.groups()[2])
                        word = match.groups()[-1]
                        ##################################
                        ##  Extract Features Here !    ##
                        ##################################
                        if word == "*pro*":
                            is_azp = False
                            if not first:
                                is_azp = True
                            azps.append((sentence_index, begin_word_index, antecedents, is_azp))
                        '''
                        if word == "*pro*" and (not first):
                            #print file_name,inline,res_info
                            print >> sys.stderr, file_name,inline,res_info
                            #print sentence_index,last_index
                            if (sentence_index - last_index) <= MAX:
                                #print sentence_index,last_index
                                if len(antecedents) >= 1:
                                    si,bi,ei = antecedents[-1]
                                    if (bi,ei) in candi[si]:
                                        print bi,ei
                        '''
                        if not word == "*pro*":
                            first = False
                            res_info = inline
                            last_index = sentence_index
                            antecedents.append((sentence_index, begin_word_index, end_word_index))
            if not inline:
                break
    return zps, azps, candi, nodes_info
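
# A minimal sketch (assumption) of how parser_in / parser_out could be wired to an
# external constituency parser for get_info_from_file_system above.  It assumes a
# line-based parser command (placeholder name) that reads one segmented sentence per
# line on stdin and writes one bracketed parse per line on stdout, flushing per line.
def run_with_system_parser(file_name, parser_cmd=("some_parser", "--stdin")):
    import subprocess
    proc = subprocess.Popen(list(parser_cmd),
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            universal_newlines=True,  # text-mode pipes
                            bufsize=1)                # line-buffered writes
    try:
        return get_info_from_file_system(file_name, proc.stdin, proc.stdout)
    finally:
        proc.stdin.close()
        proc.wait()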