示例#1
0
def create_english_facts(parse, rawFile, tmpSentPath):
    """Emit the English ("E_") facts for one parse and hand the result to ``tree``.

    Args:
        parse: Parser output passed to ``create_english_dataframe``.
        rawFile: Passed to ``writeFact.createH_wid_word_and_PunctFact`` and
            forwarded to ``tree``.
        tmpSentPath: Path prefix; each output name is appended to it with a
            leading "/".

    Returns:
        None. All results go through the ``writeFact`` calls and ``tree``.
    """
    relation_df = create_english_dataframe(parse)
    wid_word_list, _punctlist, wid_word_dict = (
        writeFact.createH_wid_word_and_PunctFact(rawFile))

    wid_pid, p_w, wid_pos_list, wid_rel_list = writeFact.createWID_PID(
        wid_word_list,
        relation_df['PID'].tolist(),
        relation_df['WORD'].tolist(),
        relation_df['POS'].tolist(),
        relation_df['RELATION'].tolist(),
    )

    # Facts built from the frame exactly as the parser produced it.
    wid_pid_path = tmpSentPath + "/E_word_id_parser_id_mapping.dat"
    writeFact.add(wid_pid, "E_wid-pid", wid_pid_path)

    pid_word_columns = [relation_df[col].tolist() for col in ('PID', 'WORD')]
    pid_word_path = tmpSentPath + "/E_parser_id_word_mapping.dat"
    writeFact.addLists(pid_word_columns, "E_pid-word", pid_word_path)

    conll_columns = [
        relation_df[col].tolist()
        for col in ('POS', 'RELATION', 'PID', 'WORD', 'PIDWITH')
    ]
    conll_path = tmpSentPath + "/E_conll_facts.dat"
    writeFact.addLists(conll_columns, "E_pos1-relation-pid1-word1-pid2",
                       conll_path)

    # Substitute ids through the p_w mapping returned by createWID_PID.
    relation_df['PID'] = relation_df['PID'].replace(p_w)
    relation_df['PIDWITH'] = relation_df['PIDWITH'].replace(p_w)

    # Drop rows whose PID still starts with 'P' after the substitution
    # (presumably ids that p_w did not map -- confirm), then rows whose
    # RELATION starts with 'punct'.
    pid_text = relation_df["PID"].astype(str)
    relation_df = relation_df[~pid_text.str.startswith('P', na=False)]
    relation_text = relation_df["RELATION"].astype(str)
    relation_df = relation_df[~relation_text.str.startswith('punct',
                                                            na=False)]

    head_ids = relation_df['PIDWITH'].tolist()
    child_ids = relation_df['PID'].tolist()
    head_words = [wid_word_dict[wid] for wid in head_ids]
    # Not used below; the lookup is kept because it raises KeyError when a
    # remaining PID is absent from wid_word_dict.
    _child_words = [wid_word_dict[wid] for wid in child_ids]

    parse_columns = [
        relation_df[col].tolist()
        for col in ('PID', 'WORD', 'POS', 'POS_STANFORD', 'RELATION',
                    'PIDWITH')
    ]
    parse_columns.append(head_words)
    parse_path = tmpSentPath + "/E_parse.dat"
    writeFact.addLists(parse_columns,
                       "E_pos1-pos_std1-relation-cwid-cword-hwid-hword",
                       parse_path)

    cid_hid = extractUnlabelledDependency(relation_df)
    tree(relation_df, wid_word_list, cid_hid, wid_pos_list, wid_rel_list,
         'E', tmpSentPath, rawFile)
def create_english_facts(parse, wid_word_list):
    """Emit the English ("E_") facts for one parse through ``writeFact``.

    NOTE(review): this function reads ``tmpSentPath`` and ``wid_word_dict``,
    neither of which is a parameter or assigned here. They must exist at
    module level for the function to run -- confirm against the rest of the
    file.

    Args:
        parse: Parser output passed to ``create_english_dataframe``.
        wid_word_list: Passed as the first argument to
            ``writeFact.createWID_PID``.

    Returns:
        None.
    """
    relation_df = create_english_dataframe(parse)

    wid_pid, p_w, wid_pos_list = writeFact.createWID_PID(
        wid_word_list,
        relation_df['PID'].tolist(),
        relation_df['WORD'].tolist(),
        relation_df['POS'].tolist())

    writeFact.add(wid_pid, "E_wid-pid",
                  tmpSentPath + "/E_word_id_parser_id_mapping.dat")
    writeFact.addLists(
        [relation_df['PID'].tolist(), relation_df['WORD'].tolist()],
        "E_pid-word", tmpSentPath + "/E_parser_id_word_mapping.dat")
    writeFact.addLists([
        relation_df['POS'].tolist(), relation_df['RELATION'].tolist(),
        relation_df['PID'].tolist(), relation_df['WORD'].tolist(),
        relation_df['PIDWITH'].tolist()
    ], "E_pos1-relation-pid1-word1-pid2", tmpSentPath + "/E_conll_facts.dat")

    # Substitute ids through the p_w mapping returned by createWID_PID.
    relation_df.PID = relation_df.PID.replace(p_w)
    relation_df.PIDWITH = relation_df.PIDWITH.replace(p_w)

    # Drop rows whose PID still starts with 'P'. The column is cast to str
    # first because the .str accessor raises AttributeError on a column that
    # is not string/object typed, which can happen once replace() has swapped
    # in non-string ids. Non-string values never start with 'P', so rows the
    # un-cast filter kept are still kept.
    relation_df = relation_df[~relation_df["PID"].astype(str).str.startswith(
        'P', na=False)]

    modified_pidwith = relation_df['PIDWITH'].tolist()
    modified_pid = relation_df['PID'].tolist()
    word_pidwith = [wid_word_dict[k] for k in modified_pidwith]
    # Not used below; the lookup is kept because it raises KeyError when a
    # remaining PID is absent from wid_word_dict.
    word_pid = [wid_word_dict[k] for k in modified_pid]

    writeFact.addLists([
        relation_df['PID'].tolist(), relation_df['WORD'].tolist(),
        relation_df['POS'].tolist(), relation_df['POS_STANFORD'].tolist(),
        relation_df['RELATION'].tolist(), relation_df['PIDWITH'].tolist(),
        word_pidwith
    ], "E_pos1-pos_std1-relation-cwid-cword-hwid-hword",
                       tmpSentPath + "/E_parse.dat")
    # The return value is not used here; the call is retained because any
    # side effects it has are not visible from this block.
    extractUnlabelledDependency(relation_df)
def create_hindi_facts(parse, wid_word_list, tmpSentPath):
    """Emit the Hindi ("H_") facts for one parse and return the filtered frame.

    Args:
        parse: Parser output passed to ``create_hindi_dataframe``.
        wid_word_list: Passed as the first argument to
            ``writeFact.createWID_PID``.
        tmpSentPath: Path prefix; each output name is appended to it with a
            leading "/".

    Returns:
        The relation frame after PID/PIDWITH have been substituted through
        ``p_w`` and rows whose PID starts with 'P' have been dropped.
    """
    relation_df = create_hindi_dataframe(parse)

    wid_pid, p_w, wid_pos_list = writeFact.createWID_PID(
        wid_word_list,
        relation_df['PID'].tolist(),
        relation_df['WORD'].tolist(),
        relation_df['POS'].tolist())

    writeFact.add(wid_pid, "H_wid-pid",
                  tmpSentPath + "/H_word_id_parser_id_mapping.dat")
    writeFact.addLists(
        [relation_df['PID'].tolist(), relation_df['WORD'].tolist()],
        "H_pid-word", tmpSentPath + "/H_parser_id_word_mapping.dat")
    writeFact.addLists([
        relation_df['POS'].tolist(), relation_df['RELATION'].tolist(),
        relation_df['PID'].tolist(), relation_df['WORD'].tolist(),
        relation_df['PIDWITH'].tolist()
    ], "H_pos1-relation-pid1-word1-pid2", tmpSentPath + "/H_conll_facts.dat")

    # Substitute ids through the p_w mapping returned by createWID_PID.
    relation_df.PID = relation_df.PID.replace(p_w)
    relation_df.PIDWITH = relation_df.PIDWITH.replace(p_w)

    # Drop rows whose PID still starts with 'P'. The column is cast to str
    # first because the .str accessor raises AttributeError on a column that
    # is not string/object typed, which can happen once replace() has swapped
    # in non-string ids. Non-string values never start with 'P', so rows the
    # un-cast filter kept are still kept.
    relation_df = relation_df[~relation_df["PID"].astype(str).str.startswith(
        'P', na=False)]

    writeFact.addLists([
        relation_df['POS'].tolist(), relation_df['RELATION'].tolist(),
        relation_df['PID'].tolist(), relation_df['WORD'].tolist(),
        relation_df['PIDWITH'].tolist()
    ], "H_pos1-relation-cid-word1-hid", tmpSentPath + "/H_parse.dat")

    # Neither return value is used here; both calls are retained because any
    # side effects they have are not visible from this block.
    writeFact.extract_tam_lwg_ids()
    extractUnlabelledDependency(relation_df)

    return relation_df
示例#4
0
     wid_word_dict] = writeFact.createH_wid_word_and_PunctFact(rawFile)
    item2WriteInFacts, def_lwg_item, all_vib_ids = writeFact.lwg_of_postprocessors(
        wid_word_list, vibhaktis)
    relation_df = AnuLibrary.create_hindi_dataframe(parse)

    #     print(relation_df)
    #     print(wid_word_list,relation_df['PID'].tolist(),relation_df['WORD'].tolist(),relation_df['POS'].tolist(), relation_df['RELATION'])
    [wid_pid, p_w, wid_pos_list, wid_rel_list
     ] = writeFact.createWID_PID(wid_word_list, relation_df['PID'].tolist(),
                                 relation_df['WORD'].tolist(),
                                 relation_df['POS'].tolist(),
                                 relation_df['RELATION'].tolist())

    writeFact.add(wid_pid, "H_wid-pid", tmpSentPath + "/H_wid-pid")
    writeFact.addLists(
        [relation_df['PID'].tolist(), relation_df['WORD'].tolist()],
        "H_pid-word", tmpSentPath + "/H_pid-wid.dat")
    writeFact.addLists([
        relation_df['POS'].tolist(), relation_df['RELATION'].tolist(),
        relation_df['PID'].tolist(), relation_df['WORD'].tolist(),
        relation_df['PIDWITH'].tolist()
    ], "H_pos1-relation-pid1-word1-pid2", tmpSentPath + "/H_conll_facts.dat")
    #     print(p_w)

    relation_df = writeFact.convertPIDsToWIDs(relation_df)
    #     print(relation_df)

    writeFact.addLists([
        relation_df['POS'].tolist(), relation_df['RELATION'].tolist(),
        relation_df['PID'].tolist(), relation_df['WORD'].tolist(),
        relation_df['PIDWITH'].tolist()
示例#5
0
def create_hindi_facts(parse, rawFile, tmpSentPath, alignment_path):
    """Emit the Hindi ("H_") facts for one parse and return the collected data.

    Args:
        parse: Parser output passed to ``create_hindi_dataframe``.
        rawFile: Passed to ``writeFact.createH_wid_word_and_PunctFact``.
        tmpSentPath: Path prefix; each output name is appended to it with a
            leading "/".
        alignment_path: Directory prefix of the "vibhakti" file, which is
            read line by line.

    Returns:
        A 13-element list, in this order: the filtered relation frame,
        ``wid_word_list``, ``punctlist``, ``wid_word_dict``,
        ``item2WriteInFacts``, ``def_lwg_item``, ``all_vib_ids``,
        ``wid_pid``, ``p_w``, ``wid_pos_list``, ``wid_rel_list``,
        ``cid_hid`` and ``sub_tree`` (a dict mapping each PIDWITH value to
        the list of PID values of the rows that carry it).
    """
    with open(alignment_path + "/vibhakti", "r") as vibhakti_file:
        vibhaktis = vibhakti_file.read().splitlines()

    wid_word_list, punctlist, wid_word_dict = (
        writeFact.createH_wid_word_and_PunctFact(rawFile))
    item2WriteInFacts, def_lwg_item, all_vib_ids = (
        writeFact.lwg_of_postprocessors(wid_word_list, vibhaktis))
    relation_df = create_hindi_dataframe(parse)

    wid_pid, p_w, wid_pos_list, wid_rel_list = writeFact.createWID_PID(
        wid_word_list,
        relation_df['PID'].tolist(),
        relation_df['WORD'].tolist(),
        relation_df['POS'].tolist(),
        relation_df['RELATION'].tolist(),
    )

    # Facts built from the frame exactly as the parser produced it.
    wid_pid_path = tmpSentPath + "/H_word_id_parser_id_mapping.dat"
    writeFact.add(wid_pid, "H_wid-pid", wid_pid_path)

    pid_word_columns = [relation_df[col].tolist() for col in ('PID', 'WORD')]
    pid_word_path = tmpSentPath + "/H_parser_id_word_mapping.dat"
    writeFact.addLists(pid_word_columns, "H_pid-word", pid_word_path)

    conll_columns = [
        relation_df[col].tolist()
        for col in ('POS', 'RELATION', 'PID', 'WORD', 'PIDWITH')
    ]
    conll_path = tmpSentPath + "/H_conll_facts.dat"
    writeFact.addLists(conll_columns, "H_pos1-relation-pid1-word1-pid2",
                       conll_path)

    # Substitute ids through the p_w mapping returned by createWID_PID.
    relation_df['PID'] = relation_df['PID'].replace(p_w)
    relation_df['PIDWITH'] = relation_df['PIDWITH'].replace(p_w)

    # Drop rows whose PID still starts with 'P' after the substitution, then
    # rows whose RELATION starts with 'punct'.
    pid_text = relation_df["PID"].astype(str)
    relation_df = relation_df[~pid_text.str.startswith('P', na=False)]
    relation_text = relation_df["RELATION"].astype(str)
    relation_df = relation_df[~relation_text.str.startswith('punct',
                                                            na=False)]

    cid_hid = extractUnlabelledDependency(relation_df)

    # Group the PID of every remaining row under that row's PIDWITH value.
    child_ids = relation_df['PID'].tolist()
    head_ids = relation_df['PIDWITH'].tolist()
    sub_tree = {}
    for head, child in zip(head_ids, child_ids):
        sub_tree.setdefault(head, []).append(child)

    return [
        relation_df, wid_word_list, punctlist, wid_word_dict,
        item2WriteInFacts, def_lwg_item, all_vib_ids, wid_pid, p_w,
        wid_pos_list, wid_rel_list, cid_hid, sub_tree
    ]