Пример #1
0
def get_info_from_file(file_name, MAX=2):
    """Collect zero pronouns, NP candidates and parse nodes from one file.

    The file is scanned line by line for three kinds of sections:

    * ``Leaves:`` -- numbered tokens, one per line, ended by a blank line.
      Tokens equal to ``[MASK]`` are counted so that every original token
      index can be mapped to its index once the masks are removed.  Each
      ``Leaves:`` section advances the sentence counter by one.
    * ``Tree:`` -- a bracketed parse spread over several lines and ended by
      an empty line.  It is normalised through ``Tree.parse`` and handed to
      ``parse_analysis.buildTree``.
    * ``Coreference chain...`` -- ``Chain`` headers followed by mentions of
      the form ``<sentence>.<begin>-<end> <text>``, ended by a dashed line
      or the end of the file.

    Args:
        file_name: Path of the file to read.
        MAX: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(zps, azps, candi, nodes_info)``:

        * ``zps`` -- ``(sentence_num, node.index)`` for every leaf whose
          word is ``*pro*``.
        * ``azps`` -- ``(sentence_index, begin_word_index, antecedents,
          coref_id)`` for every ``*pro*`` mention that follows at least one
          non-``*pro*`` mention in its chain.  ``antecedents`` is the
          chain's live list, so mentions appended later in the same chain
          are visible through earlier entries as well.
        * ``candi`` -- ``candi[sentence_num]`` is a list of
          ``(first_leaf_index, last_leaf_index)`` spans of NP candidates.
        * ``nodes_info`` -- ``nodes_info[sentence_num] = (nl, wl)`` as
          returned by ``parse_analysis.buildTree``.

    Raises:
        KeyError: If a coreference mention refers to a token index that was
            not listed in a preceding ``Leaves:`` section.
    """
    # Raw strings keep the regex escapes out of Python's own escape handling.
    pattern = re.compile(r"(\d+?)\ +(.+?)$")
    pattern_zp = re.compile(r"(\d+?)\.(\d+?)\-(\d+?)\ +(.+?)$")

    inline = "new"
    old_2_new_id = {}
    sentence_num = 0
    # Defined up front so a mention that precedes any "Chain" header cannot
    # raise UnboundLocalError.
    coref_id = None

    nodes_info = {}
    candi = {}
    zps = []
    azps = []

    # The context manager guarantees the handle is closed, even on errors.
    with open(file_name) as f:
        print("-----------------------------------------------------------------------------------------------------------------------------")
        print(file_name)

        while True:
            line = f.readline()
            if not line:
                break
            line = line.strip()

            if line == "Leaves:":
                word_old_id = 0
                del_num = 0
                while True:
                    inline = f.readline()
                    if inline.strip() == "":
                        break
                    match = pattern.match(inline.strip())
                    if match:
                        if match.group(2) == '[MASK]':
                            del_num += 1
                        # Map "<sentence>_<old index>" to the index the
                        # token has once preceding [MASK] tokens are gone.
                        key = str(sentence_num) + "_" + str(word_old_id)
                        old_2_new_id[key] = word_old_id - del_num
                        word_old_id += 1
                sentence_num += 1

            elif line == "Tree:":
                candi[sentence_num] = []
                nodes_info[sentence_num] = None
                # The line right after the header is discarded
                # (presumably an underline -- TODO confirm).
                inline = f.readline()
                pieces = []
                while True:
                    inline = f.readline()
                    if inline.strip("\n") == "":
                        break
                    if inline.strip() == "":
                        continue
                    pieces.append(inline.strip())
                parse_info = " ".join(pieces).strip()
                print(parse_info)
                parse_info = Tree.parse(parse_info, lower=False)
                print(str(parse_info))
                nl, wl = parse_analysis.buildTree(str(parse_info))

                nodes_info[sentence_num] = (nl, wl)

                for node in nl:
                    if not is_np(node.tag):
                        continue
                    # Inside an NP parent only the first child is kept.
                    if node.parent.tag.startswith("NP"):
                        if not (node == node.parent.child[0]):
                            continue
                    leaf_nodes = node.get_leaf()
                    if is_pro(leaf_nodes):
                        continue
                    if is_zero_tag(leaf_nodes):
                        continue
                    candi[sentence_num].append(
                        (leaf_nodes[0].index, leaf_nodes[-1].index))
                for node in wl:
                    if node.word == "*pro*":
                        zps.append((sentence_num, node.index))

            elif line.startswith("Coreference chain"):
                first = True
                antecedents = []

                while True:
                    inline = f.readline()
                    if not inline:
                        break
                    if inline.startswith("----------------------------------------------------------------------------------"):
                        break
                    mention = inline.strip()
                    if not mention:
                        continue
                    if mention.startswith("Chain"):
                        # A new chain starts with a fresh antecedent list.
                        first = True
                        antecedents = []
                        coref_id = mention.split(" ")[1]
                        continue
                    match = pattern_zp.match(mention)
                    if not match:
                        continue
                    sentence_index = int(match.group(1))
                    prefix = str(sentence_index) + "_"
                    begin_word_index = old_2_new_id[prefix + match.group(2)]
                    end_word_index = old_2_new_id[prefix + match.group(3)]
                    word = match.group(4)
                    if word == "*pro*":
                        # Anaphoric only if an overt mention came earlier.
                        if not first:
                            azps.append((sentence_index, begin_word_index,
                                         antecedents, coref_id))
                    else:
                        first = False
                        antecedents.append((sentence_index, begin_word_index,
                                            end_word_index, coref_id))

            # An inner loop that hit end-of-file leaves ``inline`` empty.
            if not inline:
                break
    return zps, azps, candi, nodes_info
Пример #2
0
def get_info_from_file(file_name, MAX=2):
    """Build a ``document.Doc`` from one parse/coreference annotation file.

    The file is scanned line by line for three kinds of sections:

    * ``Leaves:`` -- skipped up to the next blank line; each such section
      advances the sentence counter by one.
    * ``Tree:`` -- a bracketed parse spread over several lines and ended by
      an empty line.  It is handed to ``parse_analysis.buildTree``; leaves
      whose word contains ``*`` are dropped from the filtered word list and
      ``doc.index2real`` records, for every original leaf index, the index
      in that filtered list.  Zero pronouns (``*pro*``) and NP spans are
      registered on the document with those remapped indices.
    * ``Coreference chain...`` -- ``Chain`` headers followed by mentions of
      the form ``<sentence>.<begin>-<end> <text>``, ended by a dashed line
      or the end of the file.  Mentions are attached to the zero pronouns
      and NPs that were registered while reading the trees.

    Args:
        file_name: Path of the file to read.
        MAX: Unused; kept so existing callers keep working.

    Returns:
        The populated document, after ``doc.update()`` has been called.
    """
    # Raw string keeps the regex escapes out of Python's escape handling.
    pattern_zp = re.compile(r"(\d+?)\.(\d+?)\-(\d+?)\ +(.+?)$")

    inline = "new"
    doc = document.Doc()
    sentence_num = 0
    # Defined up front so a mention that precedes any "Chain" header cannot
    # raise UnboundLocalError.
    coref_id = None

    # The context manager guarantees the handle is closed, even on errors.
    with open(file_name) as f:
        while True:
            line = f.readline()
            if not line:
                break
            line = line.strip()

            if line == "Leaves:":
                # The token list itself is not needed here; it is consumed
                # only so the sentence counter stays in step with the file.
                while True:
                    inline = f.readline()
                    if inline.strip() == "":
                        break
                sentence_num += 1

            elif line == "Tree:":
                doc.init_sentence(sentence_num)
                # The line right after the header is discarded
                # (presumably an underline -- TODO confirm).
                inline = f.readline()
                pieces = []
                while True:
                    inline = f.readline()
                    if inline.strip("\n") == "":
                        break
                    pieces.append(inline.strip())
                parse_info = " ".join(pieces).strip()
                nl, wl = parse_analysis.buildTree(parse_info)

                real_index = doc.index2real[sentence_num]
                index_without_null = 0
                all_words_wl = []
                for node in wl:
                    if node.word.find("*") < 0:
                        # Word without "*": copy it and renumber it.
                        new_node = parse_analysis.Node()
                        new_node.copy_from(node)
                        new_node.index = index_without_null
                        real_index[node.index] = new_node.index
                        index_without_null += 1
                        all_words_wl.append(new_node)
                    else:
                        # Word containing "*": mapped to the index the
                        # next kept word will receive.
                        real_index[node.index] = index_without_null

                    if node.word == "*pro*":
                        doc.add_zp(sentence_num, real_index[node.index])
                doc.nodes[sentence_num] = wl
                doc.filter_nodes[sentence_num] = all_words_wl

                for node in nl:
                    if not is_np(node.tag):
                        continue
                    # Inside an NP parent only the first child is kept.
                    if node.parent.tag.startswith("NP"):
                        if not (node == node.parent.child[0]):
                            continue
                    leaf_nodes = node.get_leaf()
                    if is_zero_tag(leaf_nodes):
                        continue
                    doc.add_np(sentence_num,
                               real_index[leaf_nodes[0].index],
                               real_index[leaf_nodes[-1].index])

            elif line.startswith("Coreference chain"):
                first = True
                antecedents = []

                while True:
                    inline = f.readline()
                    if not inline:
                        break
                    if inline.startswith(
                            "----------------------------------------------------------------------------------"
                    ):
                        break
                    mention = inline.strip()
                    if not mention:
                        continue
                    if mention.startswith("Chain"):
                        # A new chain starts with a fresh antecedent list.
                        first = True
                        antecedents = []
                        coref_id = mention.split(" ")[1]
                        continue
                    match = pattern_zp.match(mention)
                    if not match:
                        continue
                    sentence_index = int(match.group(1))
                    begin_word_index = int(match.group(2))
                    end_word_index = int(match.group(3))
                    word = match.group(4)
                    if word == "*pro*":
                        # Anaphoric only if an overt mention came earlier.
                        if not first:
                            zp_key = (
                                sentence_index,
                                doc.index2real[sentence_index][begin_word_index])
                            # ``in`` replaces dict.has_key(), which no
                            # longer exists on Python 3.
                            if zp_key in doc.zp_dict:
                                this_zp = doc.zp_dict[zp_key]
                                this_zp.set_azp(coref_id)
                                this_zp.set_antecedent(antecedents)
                    else:
                        first = False
                        np_key = (
                            sentence_index,
                            doc.index2real[sentence_index][begin_word_index],
                            doc.index2real[sentence_index][end_word_index])
                        if np_key in doc.np_dict:
                            this_np = doc.np_dict[np_key]
                            this_np.coref_id = coref_id
                            antecedents.append(this_np)

            # An inner loop that hit end-of-file leaves ``inline`` empty.
            if not inline:
                break
    doc.update()
    return doc
def get_info_from_file(file_name):
    """Collect zero pronouns, NP candidates and parse nodes from one file.

    The UTF-8 encoded file is scanned line by line for three kinds of
    sections:

    * ``Leaves:`` -- skipped up to the next blank line; each such section
      advances the sentence counter by one.
    * ``Tree:`` -- a bracketed parse spread over several lines and ended by
      an empty line; it is handed to ``parse_analysis.buildTree``.
    * ``Coreference chain...`` -- ``Chain`` headers followed by mentions of
      the form ``<sentence>.<begin>-<end> <text>``, ended by a dashed line
      or the end of the file.

    Args:
        file_name: Path of the file to read.

    Returns:
        A tuple ``(zps, azps, candi, nodes_info)``:

        * ``zps`` -- ``(sentence_num, node.index)`` for every leaf whose
          word is ``*pro*``.
        * ``azps`` -- ``(sentence_index, begin_word_index, end_word_index,
          antecedents, coref_id, 1)`` for every ``*pro*`` mention that
          follows at least one non-``*pro*`` mention in its chain.
          ``antecedents`` is the chain's live list, so mentions appended
          later in the same chain are visible through earlier entries.
        * ``candi`` -- ``candi[sentence_num]`` is a list of
          ``(first_leaf_index, last_leaf_index)`` spans of NP candidates.
        * ``nodes_info`` -- ``nodes_info[sentence_num] = (nl, wl)`` as
          returned by ``parse_analysis.buildTree``.
    """
    # Raw string keeps the regex escapes out of Python's escape handling.
    pattern_zp = re.compile(r"(\d+?)\.(\d+?)\-(\d+?)\ +(.+?)$")

    inline = "new"
    sentence_num = 0
    # Defined up front so a mention that precedes any "Chain" header cannot
    # raise UnboundLocalError.
    coref_id = None

    nodes_info = {}
    candi = {}
    zps = []
    azps = []

    # The context manager guarantees the handle is closed, even on errors.
    with open(file_name, encoding='utf-8') as f:
        while True:
            line = f.readline()
            if not line:
                break
            line = line.strip()

            if line == "Leaves:":
                # The token list itself is not needed here; it is consumed
                # only so the sentence counter stays in step with the file.
                while True:
                    inline = f.readline()
                    if inline.strip() == "":
                        break
                sentence_num += 1

            elif line == "Tree:":
                candi[sentence_num] = []
                nodes_info[sentence_num] = None
                # The line right after the header is discarded
                # (presumably an underline -- TODO confirm).
                inline = f.readline()
                pieces = []
                while True:
                    inline = f.readline()
                    if inline.strip("\n") == "":
                        break
                    pieces.append(inline.strip())
                parse_info = " ".join(pieces).strip()
                nl, wl = parse_analysis.buildTree(parse_info)

                nodes_info[sentence_num] = (nl, wl)

                for node in nl:
                    if not is_np(node.tag):
                        continue
                    # Inside an NP parent only the first child is kept.
                    if node.parent.tag.startswith("NP"):
                        if not (node == node.parent.child[0]):
                            continue
                    leaf_nodes = node.get_leaf()
                    if is_pro(leaf_nodes):
                        continue
                    if is_zero_tag(leaf_nodes):
                        continue
                    candi[sentence_num].append(
                        (leaf_nodes[0].index, leaf_nodes[-1].index))
                for node in wl:
                    if node.word == "*pro*":
                        zps.append((sentence_num, node.index))

            elif line.startswith("Coreference chain"):
                first = True
                antecedents = []

                while True:
                    inline = f.readline()
                    if not inline:
                        break
                    if inline.startswith(
                            "----------------------------------------------------------------------------------"
                    ):
                        break
                    mention = inline.strip()
                    if not mention:
                        continue
                    if mention.startswith("Chain"):
                        # A new chain starts with a fresh antecedent list.
                        first = True
                        antecedents = []
                        coref_id = mention.split(" ")[1]
                        continue
                    match = pattern_zp.match(mention)
                    if not match:
                        continue
                    sentence_index = int(match.group(1))
                    begin_word_index = int(match.group(2))
                    end_word_index = int(match.group(3))
                    word = match.group(4)
                    if word == "*pro*":
                        # Anaphoric only if an overt mention came earlier.
                        if not first:
                            azps.append(
                                (sentence_index, begin_word_index,
                                 end_word_index, antecedents, coref_id, 1))
                    else:
                        first = False
                        antecedents.append(
                            (sentence_index, begin_word_index,
                             end_word_index, coref_id))

            # An inner loop that hit end-of-file leaves ``inline`` empty.
            if not inline:
                break
    return zps, azps, candi, nodes_info
Пример #4
0
def get_info_from_file_system(file_name, parser_in, parser_out, MAX=2):
    """Collect zero pronouns and NP candidates using an external parser.

    The file is scanned line by line for three kinds of sections:

    * ``Leaves:`` -- skipped up to the next blank line; each such section
      advances the sentence counter by one.
    * ``Tree:`` -- a bracketed parse spread over several lines and ended by
      an empty line.  It is parsed once with ``parse_analysis.buildTree``
      only to recover the words; those words are written as one
      space-separated line to ``parser_in``, one line is read back from
      ``parser_out``, its outer brackets are replaced by ``(TOP ... )`` and
      the result is parsed again.  Only the second parse is kept.
    * ``Coreference chain...`` -- ``Chain`` headers followed by mentions of
      the form ``<sentence>.<begin>-<end> <text>``, ended by a dashed line
      or the end of the file.

    Args:
        file_name: Path of the file to read.
        parser_in: Writable stream; receives one sentence per line.
        parser_out: Readable stream; one parse line is read per sentence.
        MAX: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(zps, azps, candi, nodes_info)``:

        * ``zps`` -- list of ``(sentence_index, zp_index)`` for every leaf
          of the re-parsed tree whose word is ``*pro*``.
        * ``azps`` -- list of ``(sentence_index, zp_index, antecedents,
          is_azp)`` for every ``*pro*`` mention that follows at least one
          non-``*pro*`` mention in its chain (``is_azp`` is always True).
          ``antecedents`` is the chain's live list of ``(sentence_index,
          begin_word_index, end_word_index)`` tuples, so mentions appended
          later in the same chain are visible through earlier entries.
        * ``candi`` -- ``candi[sentence_index]`` is a list of
          ``(begin_index, end_index)`` candidate spans.
        * ``nodes_info`` -- ``nodes_info[sentence_index] = (nl, wl)`` for
          the re-parsed tree.
    """
    # Raw string keeps the regex escapes out of Python's escape handling.
    pattern_zp = re.compile(r"(\d+?)\.(\d+?)\-(\d+?)\ +(.+?)$")

    inline = "new"
    sentence_num = 0

    nodes_info = {}
    candi = {}
    zps = []
    azps = []

    # The context manager guarantees the handle is closed, even on errors.
    with open(file_name) as f:
        while True:
            line = f.readline()
            if not line:
                break
            line = line.strip()

            if line == "Leaves:":
                # The token list itself is not needed here; it is consumed
                # only so the sentence counter stays in step with the file.
                while True:
                    inline = f.readline()
                    if inline.strip() == "":
                        break
                sentence_num += 1

            elif line == "Tree:":
                candi[sentence_num] = []
                nodes_info[sentence_num] = None
                # The line right after the header is discarded
                # (presumably an underline -- TODO confirm).
                inline = f.readline()
                pieces = []
                while True:
                    inline = f.readline()
                    if inline.strip("\n") == "":
                        break
                    pieces.append(inline.strip())
                parse_info = " ".join(pieces).strip()
                nl, wl = parse_analysis.buildTree(parse_info)

                # Round-trip the sentence through the external parser.
                # NOTE(review): parser_in is not flushed here; presumably
                # the caller passes an unbuffered stream -- confirm.
                parser_in.write(" ".join(node.word for node in wl) + "\n")
                parse_info = parser_out.readline().strip()
                parse_info = "(TOP" + parse_info[1:-1] + ")"
                nl, wl = parse_analysis.buildTree(parse_info)

                nodes_info[sentence_num] = (nl, wl)

                for node in nl:
                    tag = node.tag
                    if tag.find("NP") < 0 or tag.find("DNP") >= 0:
                        continue
                    # NOTE(review): the original repeated the NP test above
                    # in a nested ``if``, so every NP node is required to be
                    # its parent's first child.  This may have been meant as
                    # a test on the parent's tag -- confirm before changing.
                    if not (node == node.parent.child[0]):
                        continue
                    leaf_nodes = node.get_leaf()
                    if is_pro(leaf_nodes):
                        continue
                    candi[sentence_num].append(
                        (leaf_nodes[0].index, leaf_nodes[-1].index))
                for node in wl:
                    if node.word == "*pro*":
                        zps.append((sentence_num, node.index))

            elif line.startswith("Coreference chain"):
                first = True
                antecedents = []

                while True:
                    inline = f.readline()
                    if not inline:
                        break
                    if inline.startswith(
                            "----------------------------------------------------------------------------------"
                    ):
                        break
                    mention = inline.strip()
                    if not mention:
                        continue
                    if mention.startswith("Chain"):
                        # A new chain starts with a fresh antecedent list.
                        first = True
                        antecedents = []
                        continue
                    match = pattern_zp.match(mention)
                    if not match:
                        continue
                    sentence_index = int(match.group(1))
                    begin_word_index = int(match.group(2))
                    end_word_index = int(match.group(3))
                    word = match.group(4)
                    if word == "*pro*":
                        # Anaphoric only if an overt mention came earlier.
                        if not first:
                            azps.append((sentence_index, begin_word_index,
                                         antecedents, True))
                    else:
                        first = False
                        antecedents.append(
                            (sentence_index, begin_word_index,
                             end_word_index))

            # An inner loop that hit end-of-file leaves ``inline`` empty.
            if not inline:
                break
    return zps, azps, candi, nodes_info