def get_pos_idx(chunks, pos):
    '''Return the positions of the chunks that match a part of speech.

    Each element of ``chunks`` is passed to ``search_pos(chunk, pos)``;
    the element's index is kept whenever that call returns a truthy value.

    Args:
        chunks: iterable of chunk objects accepted by ``search_pos``.
        pos: part-of-speech value forwarded unchanged to ``search_pos``.

    Returns:
        A list of integer indexes in ascending order; empty when nothing
        matches or when ``chunks`` is empty.
    '''
    return [
        position
        for position, candidate in enumerate(chunks)
        if search_pos(candidate, pos)
    ]
'''Write the dependency path that starts at each noun-bearing chunk.

The first command-line argument is the input file handed to
``load_cabocha``; the second is the output file.  For every chunk in which
``search_pos`` finds a noun ("名詞"), the script follows the ``dst`` links
through the sentence and writes the surfaces of the visited chunks joined
by " -> ", one path per line.  Chunks with no onward link produce no line.

Example output:

吾輩は -> 見た
ここで -> 始めて -> 人間という -> ものを -> 見た
人間という -> ものを -> 見た
ものを -> 見た
'''
import sys

from no41 import Chunk, load_cabocha
from no43 import search_pos

if __name__ == '__main__':
    # Context managers guarantee both handles are closed even when parsing
    # or writing raises part-way through.  The output file is still opened
    # only after load_cabocha() has been called, as before.
    with open(sys.argv[1], 'rt') as infile:
        sents = load_cabocha(infile)
        with open(sys.argv[2], 'wt') as outfile:
            for sent in sents:
                for chunk in sent:
                    # Used here as "chunk-like object if a noun is present,
                    # otherwise a falsy value".
                    noun_chunk = search_pos(chunk, pos="名詞", fmt='chunk')
                    if not noun_chunk:
                        continue

                    path_chunks = [noun_chunk]
                    # Single pass over the sentence: whenever the current
                    # tail of the path points at this chunk, extend the path.
                    # NOTE(review): this only follows links to chunks that
                    # come later in iteration order -- confirm dst always
                    # points forward in the parsed data.
                    for current_chunk in sent:
                        if path_chunks[-1].dst == current_chunk.idx:
                            path_chunks.append(current_chunk)

                    # A path of length one means the chunk had no link to
                    # follow, so nothing is written for it.
                    if len(path_chunks) > 1:
                        outfile.write(
                            " -> ".join(
                                node.surface for node in path_chunks) + "\n")
'''Write verb case patterns together with the chunks that supply them.

The first command-line argument is the input file handed to
``load_cabocha``; the second is the output file.  For every chunk in which
``search_pos`` finds a verb ("動詞"), the script collects the chunks of the
same sentence whose ``dst`` equals that chunk's ``idx`` and in which
``search_pos`` finds a particle ("助詞").  When at least one such chunk
exists it writes one line with three tab-separated fields: the verb's base
form, the particles' base forms joined by spaces, and the surfaces of the
contributing chunks joined by spaces.

Example output (fields are separated by tabs in the real output):

見る    は を    吾輩は ものを
'''
import sys

from no41 import Chunk, load_cabocha
from no43 import search_pos

if __name__ == '__main__':
    # Context managers close both files on every exit path; previously the
    # output file was never closed explicitly.  The output file is still
    # opened only after load_cabocha() has been called, as before.
    with open(sys.argv[1], 'rt') as infile:
        sents = load_cabocha(infile)
        with open(sys.argv[2], 'wt') as outfile:
            for sent in sents:
                for chunk in sent:
                    predicate = search_pos(chunk, "動詞", fmt="morph")
                    if not predicate:
                        continue

                    # Particles and the chunks they came from are kept in
                    # two parallel lists so the output columns line up.
                    particles = []
                    source_chunks = []
                    for chunk2 in sent:
                        particle = search_pos(chunk2, "助詞", fmt="morph")
                        # Keep only chunks that both carry a particle and
                        # depend directly on the predicate's chunk.
                        if particle and chunk2.dst == chunk.idx:
                            particles.append(particle)
                            source_chunks.append(chunk2)

                    # Predicates with no particle-bearing dependents are
                    # skipped entirely.
                    if particles:
                        particle_field = " ".join(
                            particle.base for particle in particles)
                        argument_field = " ".join(
                            c.surface for c in source_chunks)
                        outfile.write("{}\t{}\t{}\n".format(
                            predicate.base, particle_field, argument_field))
・コーパス中で頻出する述語と助詞パターン ''' from no41 import Chunk, load_cabocha from no43 import search_pos import sys if __name__ == '__main__': infile = open(sys.argv[1], 'rt') sents = load_cabocha(infile) outfile = open(sys.argv[2], 'wt') # import pdb; pdb.set_trace() for sent in sents: sent2 = sent for chunk in sent: # want most left verb in chunk predicate = search_pos(chunk, pos="動詞", fmt='morph') if predicate: particles = [] chunk2s = [] for chunk2 in sent2: # want most right particle in chunk particle_chunk = search_pos(chunk2, "助詞", fmt='chunk') if particle_chunk and chunk2.dst == chunk.idx: particle = [ morph for morph in particle_chunk.morphs if morph.pos == '助詞' ][-1] particles.append(particle) chunk2s.append(chunk2) wo_case_chunk = search_pos(chunk2s, pos="助詞",