| select(lambda nums: [words[num] for num in nums]) \ | select(lambda words: ''.join(words)) \ | as_list short_phrase = ''.join(short_matches) first_index = matches | select(extract_nums) | chain | min last_index = matches | select(extract_nums) | chain | max complete_phrase = ''.join(words[first_index:last_index + 1]) index = cur_index + ( [len(words[i]) for i in range(first_index)] | add) yield (short_phrase, complete_phrase, index) cur_index += len(line) + 1 if __name__ == '__main__': from pynlpini import PosTagger from pynlpini import SegTagger with open("../../data/app/travel_comments/mafengwo_comments_raw.txt" ) as comment_file: index = 0 ie = ImpressionExtractor(PosTagger(SegTagger())) for line in comment_file: line = line.decode("utf-8") for item in ie.extract(line): print item index += 1 if index > 10: exit()
def setUp(self):
    """Prepare the extractor fixture used by the tests.

    Builds a SegTagger with default arguments, wraps it in a PosTagger,
    and stores an ImpressionExtractor built on that tagger as
    ``self.extractor``.
    """
    # NOTE(review): presumably a unittest.TestCase hook run before each
    # test; the enclosing class is not visible here -- confirm.
    segmenter = SegTagger()
    tagger = PosTagger(segmenter)
    self.extractor = ImpressionExtractor(tagger)
def setUp(self):
    """Prepare the POS tagger fixture used by the tests.

    Resolves the CRF model file relative to the directory containing this
    module and stores a PosTagger built from a default SegTagger and that
    model path as ``self.pos_tagger``.
    """
    # NOTE(review): presumably a unittest.TestCase hook run before each
    # test; the enclosing class is not visible here -- confirm.
    module_dir = os.path.dirname(__file__)
    segmenter = SegTagger()
    # Anchor the model path to this file so it does not depend on the
    # current working directory of the test runner.
    model_path = os.path.join(module_dir, "./model/pos.crf.model")
    self.pos_tagger = PosTagger(segmenter, model_path)