Exemplo n.º 1
0
def verb_mining():
    """Extract sahen-noun + を + verb patterns with their case frames.

    For every chunk containing a sahen-connective noun (サ変接続) whose
    particle is を, takes the first verb of the destination chunk and
    writes '<chunk surface><verb>\t<sorted cases>\t<sorted surfaces>'
    lines to mining.txt.  Lines without any other case particle are
    skipped.
    """
    text = read_cabocha('neko.txt.cabocha')
    # 'with' guarantees the output file is flushed and closed even if
    # parsing raises (the original leaked fout on exception).
    with codecs.open('mining.txt', 'w', 'utf-8') as fout:
        for sentence in text:
            # wo_pos: index of the candidate を-chunk within the sentence
            for wo_pos, chunk in enumerate(sentence):
                # NOTE(review): a chunk holding several サ変接続 morphs emits
                # one line per morph — preserved as-is; confirm intended.
                for morph in chunk.morphs:
                    if morph.pos1 == u'サ変接続' and chunk.case() == u'を':
                        verb = sentence[chunk.dst].first_verb()
                        if verb:
                            # sentence[chunk.dst] is known to contain a verb
                            line = chunk.surface() + verb + u'\t'
                            cases = []
                            surfaces = []
                            # collect case particles from sources of both the
                            # を-chunk and the verb chunk, skipping the
                            # を-chunk itself
                            for src in (chunk.srcs + sentence[chunk.dst].srcs):
                                if src == wo_pos:
                                    continue
                                case = sentence[src].case()
                                if case:
                                    cases.append(case)
                                    surfaces.append(sentence[src].surface())
                            if cases:
                                line = line + u' '.join(
                                    sorted(cases)) + u'\t' + u' '.join(
                                        sorted(surfaces,
                                               key=lambda surface: surface[-1]))
                                fout.write(line + u'\n')
Exemplo n.º 2
0
def relations():
    """Write dependency relations to out42.txt.

    One line per chunk: the chunk's surface, followed by the surfaces of
    its source (dependent) chunks, tab-separated.  Empty source chunks
    are skipped.
    """
    text = read_cabocha('neko.txt.cabocha')
    # 'with' closes the file even on exception (original leaked fout).
    with codecs.open('out42.txt', 'w', 'utf-8') as fout:
        for sentence in text:
            for chunk in sentence:
                line = chunk.surface()
                for src in chunk.srcs:
                    # skip chunks without morphemes (nothing to print)
                    if sentence[src].morphs:
                        line = line + u'\t' + sentence[src].surface()
                fout.write(line + '\n')
Exemplo n.º 3
0
def noun_to_noun_path():
    """Write, for every ordered pair (i, j) of noun-containing chunks in a
    sentence, the dependency path between them to n2n.txt.

    The nouns in chunk i are masked as X and the nouns in chunk j as Y
    (non-noun morphs are kept).  Two output formats are produced:

    * if chunk i's dependency path reaches chunk j:
      ``X-chunk -> ... -> Y``
    * otherwise, with k the chunk where the i->root and j->root paths
      meet: ``i-path-before-k | j-path | k-surface``

    NOTE(review): fout is only closed on the success path; an exception
    while iterating would leak the handle — consider ``with``.
    """
    text = read_cabocha('neko.txt.cabocha')
    fout = codecs.open('n2n.txt', 'w', 'utf-8')
    for sentence in text:
        for i, chunk1 in enumerate(list(sentence)):
            # j enumerates chunks strictly after i (offset by i + 1)
            for j, chunk2 in enumerate(list(sentence[i + 1:]), i + 1):
                if chunk1.has_noun() and chunk2.has_noun():
                    # copies of chunk1/chunk2 with nouns masked; dst/srcs
                    # are shared with the originals
                    start_chunk = Chunk([], chunk1.dst, chunk1.srcs)
                    end_chunk = Chunk([], chunk2.dst, chunk2.srcs)
                    for morph in chunk1.morphs:
                        if morph.pos == u'名詞':
                            # insert the X placeholder at most once
                            if not u'X' in map(lambda m: m.surface,
                                               start_chunk.morphs):
                                start_chunk.morphs.append(
                                    Morph(u'X', u'X', u'名詞', u''))
                        else:
                            start_chunk.morphs.append(morph)
                    for morph in chunk2.morphs:
                        if morph.pos == u'名詞':
                            # insert the Y placeholder at most once
                            if not 'Y' in map(lambda m: m.surface,
                                              end_chunk.morphs):
                                end_chunk.morphs.append(
                                    Morph(u'Y', u'Y', u'名詞', u''))
                        else:
                            end_chunk.morphs.append(morph)
                    # follow the dependency chain from i until the root
                    # (dst == -1) or until chunk2 is reached
                    path = [start_chunk]
                    while path[-1].dst != -1 and path[-1] != chunk2:
                        path.append(sentence[path[-1].dst])
                    if path[-1] == chunk2:
                        # path i->j found
                        path = path[:-1]
                        line = u' -> '.join(
                            map(lambda chunk: chunk.surface(), path))
                        line = line + ' -> Y'
                        fout.write(line + u'\n')
                    else:
                        # path i->j not found, path represents the path i->root
                        # find k, where path i->root and j->root intersects
                        pathj = [end_chunk]  # path j->root
                        while not pathj[-1].dst in map(lambda c: c.dst, path):
                            pathj.append(sentence[pathj[-1].dst])
                        k = pathj[-1].dst
                        # position (exclusive) of k within the i->root path
                        k_in_path = 0
                        for pos, chunk in enumerate(path):
                            if chunk.dst == pathj[-1].dst:
                                k_in_path = pos + 1
                        line = ' -> '.join(
                            map(lambda c: c.surface(),
                                path[:k_in_path])) + ' | ' + ' -> '.join(
                                    map(lambda c: c.surface(), pathj)
                                ) + ' | ' + sentence[k].surface()
                        fout.write(line + u'\n')
    fout.close()
Exemplo n.º 4
0
def visualize(cabocha):
    """Render the dependency trees of a CaboCha file as <cabocha>.jpg.

    Adds one edge per (source chunk -> destination chunk) pair; chunks
    without morphemes are skipped on both ends of the edge.
    """
    sentences = read_cabocha(cabocha)
    g = pydot.Dot()
    for sentence in sentences:
        for chunk in sentence:
            # Bug fix: the original kept e_dst from the PREVIOUS chunk when
            # the current chunk had no morphs (or raised NameError on the
            # very first one), drawing edges to the wrong node.  Only draw
            # edges for chunks that actually have a surface.
            if not chunk.morphs:
                continue
            e_dst = chunk.surface()
            for src in chunk.srcs:
                if sentence[src].morphs:
                    g.add_edge(pydot.Edge(sentence[src].surface(), e_dst))
    g.write_jpeg(cabocha + '.jpg')
Exemplo n.º 5
0
def relations_nv():
    """Write verb chunks and their noun source chunks to out43.txt.

    For each chunk containing a verb, collects the surfaces of its
    noun-containing source chunks (tab-separated); chunks with no noun
    source produce no output line.
    """
    text = read_cabocha('neko.txt.cabocha')
    # 'with' closes the file even on exception (original leaked fout).
    with codecs.open('out43.txt', 'w', 'utf-8') as fout:
        for sentence in text:
            for chunk in sentence:
                if not chunk.has_verb():
                    continue
                noun_part = u''
                for src in chunk.srcs:
                    # only non-empty sources that contain a noun
                    if sentence[src].morphs and sentence[src].has_noun():
                        noun_part = noun_part + u'\t' + sentence[src].surface()
                if noun_part:
                    fout.write(chunk.surface() + noun_part + '\n')
Exemplo n.º 6
0
def extract_verb_patterns():
    """Write '<verb>\t<sorted case particles>' for every verb chunk to
    patterns.txt.

    Verbs whose source chunks carry no case particle are skipped.
    """
    text = read_cabocha('neko.txt.cabocha')
    # Bug fix: the original NEVER closed fout, so buffered output could be
    # lost on interpreter exit; 'with' flushes and closes deterministically.
    with codecs.open('patterns.txt', 'w', 'utf-8') as fout:
        for sentence in text:
            for chunk in sentence:
                verb = chunk.first_verb()
                if not verb:
                    continue
                # keep only sources that actually have a case particle
                cases = [c for c in (sentence[src].case()
                                     for src in chunk.srcs) if c]
                if cases:
                    fout.write(verb + '\t' + u' '.join(sorted(cases)) + '\n')
Exemplo n.º 7
0
def path_noun_to_root():
    """Return the dependency path from every noun-containing chunk to the
    sentence root.

    Each path is a list of Chunk objects ``[start, ..., root]``; paths of
    length 1 (the noun chunk is itself the root) are dropped.
    """
    text = read_cabocha('neko.txt.cabocha')
    paths = []
    for sentence in text:
        for chunk in sentence:
            # Bug fix: the original appended the chunk once PER noun morph,
            # so a chunk with several nouns started its path duplicated.
            # Test for noun presence once instead.
            has_noun = False
            for morph in chunk.morphs:
                if morph.pos == u'名詞':
                    has_noun = True
                    break
            if not has_noun:
                continue
            path = [chunk]
            # follow the dependency chain up to the root (dst == -1)
            while path[-1].dst != -1:
                path.append(sentence[path[-1].dst])
            if len(path) > 1:
                paths.append(path)
    return paths
Exemplo n.º 8
0
def extract_verb_frames():
    """Write '<verb>\t<sorted cases>\t<source surfaces>' case frames to
    frames.txt.

    Surfaces are sorted by their last character, mirroring the order of
    the sorted case particles they end with.
    """
    text = read_cabocha('neko.txt.cabocha')
    # Bug fix: the original never closed fout; 'with' guarantees flush+close.
    with codecs.open('frames.txt', 'w', 'utf-8') as fout:
        for sentence in text:
            for chunk in sentence:
                verb = chunk.first_verb()
                if not verb:
                    continue
                cases = []
                surfaces = []
                for src in chunk.srcs:
                    case = sentence[src].case()
                    if case:
                        cases.append(case)
                        surfaces.append(sentence[src].surface())
                if cases:
                    fout.write(verb + '\t' + u' '.join(sorted(cases))
                               + '\t' + u' '.join(
                                   sorted(surfaces, key=lambda s: s[-1]))
                               + '\n')
Exemplo n.º 9
0
def visualize_from_text(text, name):
    """Parse *text* with CaboCha and render its dependency graph.

    Writes *text* to <name>.txt, runs ``cabocha -f1`` over it, draws one
    edge per (source chunk -> destination chunk) pair and saves
    <name>.jpg.  Both temporary files are removed afterwards.
    """
    import os
    txt_path = name + '.txt'
    cab_path = txt_path + '.cabocha'
    tmp = codecs.open(txt_path, 'w', 'utf-8')
    try:
        tmp.write(text)
    finally:
        tmp.close()
    # NOTE(review): shell=True with an interpolated *name* allows shell
    # injection if name is untrusted — confirm callers pass safe names.
    p = Popen("cabocha -f1 < " + txt_path + " > " + cab_path, shell=True)
    p.wait()
    sentences = read_cabocha(cab_path)
    # Bug fix: the original spawned 'rm' via the shell without waiting for
    # it; os.remove is synchronous, portable and not shell-interpolated.
    os.remove(txt_path)
    os.remove(cab_path)
    g = pydot.Dot()
    for sentence in sentences:
        for chunk in sentence:
            # Bug fix: skip chunks without morphs instead of reusing a
            # stale e_dst from the previous chunk (or NameError first time).
            if not chunk.morphs:
                continue
            e_dst = chunk.surface()
            for src in chunk.srcs:
                if sentence[src].morphs:
                    e_src = sentence[src].surface()
                    # parenthesized single-arg print works on Py2 and Py3
                    print(e_src + u'->' + e_dst)
                    g.add_edge(pydot.Edge(e_src, e_dst))
    g.write_jpg(name + '.jpg')