예제 #1
0
def triples_extraction(path):
    """Extract dependency triples from a CoNLL-style dependency file.

    Parameters
    ----------
    path : str
        Path to a UTF-8 file whose token lines are tab-separated and
        whose sentences are separated by blank lines (a line whose
        length is 1, i.e. just "\\n").

    Returns
    -------
    list
        One `extract(...)` result per sentence that both parsed into a
        DependencyGraph and yielded triples; failing sentences are
        skipped.
    """
    processed_sentences = []
    sentence = []

    # Group token lines into sentences; a bare newline ends a sentence.
    # `with` ensures the file handle is closed (the original leaked it).
    with codecs.open(path, 'r', 'utf-8') as handle:
        for line in handle:
            if len(line) == 1:
                processed_sentences.append(sentence)
                sentence = []
            else:
                sentence.append(line.split("\t"))

    # Re-join each sentence into the tab/newline string DependencyGraph
    # expects; join() avoids the quadratic string += of the original.
    deps = [u"".join(u"\t".join(tok) + u'\n' for tok in sent)
            for sent in processed_sentences]

    triples = []
    for sent_dep in deps:
        # Parse and extract inside one try so a failed parse can never
        # leak the previous iteration's stale `graph` into this one
        # (the original's two separate try/except blocks allowed that).
        try:
            graph = DependencyGraph(tree_str=sent_dep)
            res = extract(graph.triples())
        except Exception:
            continue  # best-effort: skip unparseable sentences
        triples.append(res)

    return triples
예제 #2
0
 def draw(self):
     """Render the parsed sentence's dependency tree in a viewer window.

     Builds a minimal CoNLL-style string from ``self.tree`` — one row
     per node as "<tab>word<tab>null<tab>head" — feeds it to
     DependencyGraph, and pops up NLTK's tree drawing.
     """
     rows = []
     for node in self.tree:
         rows.append("\t" + node[0] + "\t" + 'null' + "\t" + str(node[1]) + "\n")
     par_result = "".join(rows)
     conlltree = DependencyGraph(par_result)
     # Convert the dependency graph into a Tree and display it.
     tree = conlltree.tree()
     tree.draw()
예제 #3
0
 def build_dep_graph(self):
     """Build a dependency graph from the parse results and draw it.

     Returns None early when ``self._check()`` reports a problem.
     Otherwise joins ``self.words`` / ``self.postags`` / ``self.arcs``
     into a CoNLL-style string — one row per token as
     "<tab>word(relation)<tab>POS<tab>head<tab>relation" — parses it
     with DependencyGraph and shows the tree in NLTK's viewer.
     """
     if self._check():
         return None
     # zip over the three parallel sequences instead of indexing; join()
     # replaces the original's quadratic string +=. The original's
     # `if arcs[i].head == 0: pass` branch did nothing (its body was
     # commented out) and has been removed.
     rows = []
     for word, postag, arc in zip(self.words, self.postags, self.arcs):
         rows.append("\t" + word + "(" + arc.relation + ")"
                     + "\t" + postag
                     + "\t" + str(arc.head)
                     + "\t" + arc.relation + "\n")
     par_result = "".join(rows)
     conlltree = DependencyGraph(par_result)  # convert to a dependency graph
     tree = conlltree.tree()  # build the tree structure
     tree.draw()  # display the rendered tree
예제 #4
0
def read_syntaxnet_output(sentences):
    """Parse *sentences* with SyntaxNet's parsey_universal and print results.

    Python 2 code (print statements). Joins the sentences with newlines,
    pipes them through the parse.sh shell pipeline, splits the CoNLL-style
    stdout back into per-sentence token rows, then prints each sentence's
    dependency triples and a pretty-printed tree. Returns nothing.

    NOTE(review): the hard-coded /Users/dbatista paths and the raw
    interpolation of user text into a shell command (injection risk if a
    sentence contains a single quote) should be parameterized/escaped —
    confirm inputs are trusted before reuse.
    """

    # joint all sentences into a single string with
    # separating new lines
    all_sentences = "\n".join(sentences)

    # redirect std_error to /dev/null
    FNULL = open(os.devnull, 'w')

    # Run through a shell so the env-var assignment, cd, and pipe all work;
    # stdout is captured, stderr discarded.
    process = subprocess.Popen(
        'MODEL_DIRECTORY=/Users/dbatista/Downloads/Portuguese; '
        'cd /Users/dbatista/models/syntaxnet; '
        'echo \'%s\' | syntaxnet/models/parsey_universal/parse.sh '
        '$MODEL_DIRECTORY 2' % all_sentences,
        shell=True,
        universal_newlines=False,
        stdout=subprocess.PIPE,
        stderr=FNULL)

    # communicate() returns (stdout, stderr); stderr is None here.
    output = process.communicate()
    processed_sentences = []
    sentence = []

    # Group output lines into sentences; an empty line ends a sentence.
    for line in output[0].split("\n"):
        if len(line) == 0:
            processed_sentences.append(sentence)
            sentence = []
        else:
            word = line.split("\t")
            sentence.append(word)

    # subprocess captures an empty new line
    del processed_sentences[-1]

    # Re-join each token list into the tab/newline string that
    # DependencyGraph expects.
    deps = []
    for sentence in processed_sentences:
        s = ''
        for line in sentence:
            s += "\t".join(line) + '\n'
        deps.append(s)

    for sent_dep in deps:
        # .decode("utf8") — Python 2 bytes-to-unicode conversion.
        graph = DependencyGraph(tree_str=sent_dep.decode("utf8"))
        print "triples"
        for triple in graph.triples():
            print triple
        print
        tree = graph.tree()
        tree.pretty_print()
예제 #5
0
def combined_sentence_lemma(sent1_conllu,
                            sent2_conllu,
                            sent1_id,
                            sent2_id):
    """Return lemma info for one or two CoNLL-U sentences.

    When the two sentence ids are equal only the first sentence's lemma
    info is returned; otherwise both lists are concatenated with the
    lower-numbered sentence first.
    """
    nodes1 = DependencyGraph(sent1_conllu,
                             top_relation_label='root').nodes
    nodes2 = DependencyGraph(sent2_conllu,
                             top_relation_label='root').nodes

    if sent1_id == sent2_id:
        return extract_node_info(nodes1, info="lemma")

    # Order the pair by sentence id, then concatenate their lemma lists.
    if sent1_id < sent2_id:
        first, second = nodes1, nodes2
    else:
        first, second = nodes2, nodes1
    return (extract_node_info(first, info="lemma")
            + extract_node_info(second, info="lemma"))
예제 #6
0
 def gold_standard_tree(self, sent: DependencyGraph):
     """
     Return the arcs of the gold standard tree for the given sentence.

     Each graph edge is flipped to (head, dependent) order, and one
     extra arc (0, root) attaches the artificial root node. Edge
     weights carry no significance here, so all are implicitly 1.
     """
     root_address = sent.root["address"]
     arcs = [(edge[1], edge[0]) for edge in sent.nx_graph().edges]
     arcs.append((0, root_address))
     return arcs
예제 #7
0
def extract_stanza_info(row, eid_num, param="ctag"):
    '''
    Given a row in pd dataframe, and eid number (1 or 2)
    extract the stanza info from conllu parse

    param: 'ctag', 'lemma', 'word'
    '''
    conllu = row[f'eid{eid_num}_sent_conllu']
    nodes = DependencyGraph(conllu, top_relation_label='root').nodes
    # Node 0 of a DependencyGraph is the artificial root, hence the +1
    # offset from the stored 0-based token id.
    token_index = int(getattr(row, f'eid{eid_num}_token_id')) + 1
    return nodes[token_index][param]
def get_structs(ud_path):
    """Parse the UD English CoNLL-U splits into DependencyGraph objects.

    Parameters
    ----------
    ud_path : str
        Path prefix of the directory containing the
        en-ud-{train,dev,test}.conllu files (concatenated directly, so
        it should end with a path separator).

    Returns
    -------
    dict
        Maps "<filename> <sentence-number>" (1-based, per file) to the
        parsed DependencyGraph.

    NOTE(review): a final sentence not followed by a blank line is
    silently dropped — confirm the data files always end with one.
    """
    filenames = ['en-ud-train.conllu', 'en-ud-dev.conllu', 'en-ud-test.conllu']
    structures = {}
    for fname in filenames:  # renamed from `file` (shadowed the builtin)
        with open(ud_path + fname, 'r') as fh:
            sent_num = 0
            block = ""
            for line in fh:
                if line != "\n":
                    block += line
                else:
                    # A blank line terminates the current sentence block.
                    sent_num += 1
                    structure = DependencyGraph(block,
                                                top_relation_label='root')
                    structures[fname + " " + str(sent_num)] = structure
                    block = ""
                    # Removed from the original: the no-op statements
                    # `a = a` / `sent = sent` and the unused `words` /
                    # `sent` accumulation (whose split("\t")[1] could
                    # IndexError on tab-less lines for no benefit).
    return structures
예제 #9
0
    html_string = re.sub(r'\)', r'&rcrb;', s)
    html_string = re.sub(r'\(', r'&lcrb;', html_string)
    return html_string


files = ['en-ud-train.conllu', 'en-ud-dev.conllu', 'en-ud-test.conllu']
structures = []
for file in files:
    with open('structures.tsv', 'a') as fout:
        with open(file, 'r') as f:
            id = 0
            a = ""
            words = []
            for line in f:
                if line != "\n":
                    a += line
                    words.append(line.split("\t")[1])
                else:
                    id += 1
                    a = html_ify(a)
                    structure = DependencyGraph(a, top_relation_label='root')
                    sent = " ".join(words)
                    sent = html_ify(sent)
                    sent_id = file + " sent_" + str(id)
                    structures.append(structure)
                    a = ""
                    words = []
                    fout.write(
                        sent_id + "\t" +
                        " ".join(str(structures[-1].tree()).splitlines()) +
                        "\t" + sent + "\n")