from itertools import izip

def create_sentences(list_of_lists, document, segstorage):
    '''Create sentence segments for the document and store them.'''
    # Join each sentence's words into plain text, then compute the character
    # offsets of every sentence within the space-joined document text.
    plain_sentences = [u' '.join(lists.word) for lists in list_of_lists]
    starts = compute_starts(plain_sentences, u' ')
    ends = compute_ends(plain_sentences, u' ')
    segments = []
    for sentence, start, end in izip(plain_sentences, starts, ends):
        segments.append(Segment(u'sentence', sentence, document, start, end))
    segstorage.save(segments)
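# The helpers compute_starts/compute_ends are used throughout this excerpt but
# not defined in it. Below is a minimal sketch of the contract they are assumed
# to satisfy, inferred from the calls above: given a list of tokens and the
# separator they are joined with, return the character offset at which each
# token starts (resp. ends, exclusive) in the joined text. The *_sketch names
# are hypothetical and not part of the original code.
def compute_starts_sketch(tokens, sep=u' '):
    starts, pos = [], 0
    for token in tokens:
        starts.append(pos)
        pos += len(token) + len(sep)
    return starts

def compute_ends_sketch(tokens, sep=u' '):
    return [start + len(token)
            for token, start in zip(tokens, compute_starts_sketch(tokens, sep))]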
def _create_sentence_segments(self, segmentstorage):
    '''Create segments that denote the sentences in the document and store them.'''
    starts = compute_starts(self._plain_sentences)
    ends = compute_ends(self._plain_sentences)
    segments = []
    for sentence, start, end in zip(self._plain_sentences, starts, ends):
        segments.append(Segment(u'sentence', sentence, self._document, start, end))
    segmentstorage.save(segments)
def _process_line(self, line, doc_idx):
    '''Create a document from a single line of text and store one segment per token.'''
    # Note: offsets assume the tokens in `line` are separated by single spaces,
    # so that they line up with the stored document text.
    tokens = line.split()
    starts = compute_starts(tokens, u" ")
    ends = compute_ends(tokens, u" ")
    document = Document(self._doc_prefix + u":" + str(doc_idx), line, {})
    segments = []
    for tok, start, end in zip(tokens, starts, ends):
        segments.append(Segment(u"token", tok, document, start, end))
    self._docstorage.save(document)
    self._segstorage.save(segments)
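# Illustrative contract of _process_line (values deduced from the code above,
# the prefix u'doc' is a hypothetical example): for
#   self._doc_prefix == u'doc', line == u'Hello world', doc_idx == 0
# it stores Document(u'doc:0', u'Hello world', {}) plus two u'token' segments,
# u'Hello' at offsets (0, 5) and u'world' at offsets (6, 11).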
def _end_of_document(self, sentences):
    '''Store the accumulated sentences as a single document together with its segments.'''
    lists = Lists.concatenate(sentences)
    # create document
    doc_text = u' '.join(lists.word)
    document = Document(self._name_prefix + unicode(self._doc_idx), doc_text)
    self._documentstorage.save(document)
    self._doc_idx += 1
    # create word-aligned segments for each annotation layer
    starts = compute_starts(lists.word, u' ')
    ends = compute_ends(lists.word, u' ')
    create_segments(u'word', lists.word, starts, ends, document, self._segmentstorage)
    create_segments(u'lemma', lists.lemma, starts, ends, document, self._segmentstorage)
    create_segments(u'case', lists.case, starts, ends, document, self._segmentstorage)
    create_segments(u'pos', lists.pos, starts, ends, document, self._segmentstorage)
    create_ne_segments(lists.ne_type, starts, ends, document, self._segmentstorage)
    # create sentence segments
    create_sentences(sentences, document, self._segmentstorage)
    # clear the buffer in place so the caller's list is reset
    sentences[:] = []
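# create_segments / create_ne_segments are not shown in this excerpt. Below is
# a sketch of the behaviour create_segments is assumed to have, based on how it
# is called above: one Segment of the given type per annotation value, aligned
# to the shared word offsets, saved as a single batch. The _sketch name is
# hypothetical.
def create_segments_sketch(segment_type, values, starts, ends, document, segmentstorage):
    segments = [Segment(segment_type, value, document, start, end)
                for value, start, end in zip(values, starts, ends)]
    segmentstorage.save(segments)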
def _create_word_sentences(self):
    '''Parse the words from tokens.'''
    # Decode the surface form ('sone') of every token, keeping sentence boundaries.
    self._word_sentences = [
        [token['sone'].decode('unicode_escape', 'replace') for token in sentence]
        for sentence in self._token_sentences]
    # Flatten into a single word list; the [] initialiser keeps empty input
    # from raising a TypeError.
    self._words = reduce(lambda x, y: x + y, self._word_sentences, [])
    # Character offsets of each word in the space-joined text.
    self._word_starts = compute_starts(self._words, u' ')
    self._word_ends = compute_ends(self._words, u' ')
def test_zero_starts(self):
    # An empty token list yields an empty list of offsets, with or without
    # an explicit separator.
    self.assertEqual(compute_starts([]), [])
    self.assertEqual(compute_starts([], sep=u' '), [])
def test_starts(self):
    self.assertEqual(compute_starts(self.tokens(), self.sep()), self.starts())
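# Illustrative values (not from the original fixtures) showing the contract
# test_starts exercises, assuming offsets refer to sep.join(tokens) and that
# end offsets are exclusive:
#   compute_starts([u'Hello', u'world'], u' ')  ->  [0, 6]
#   compute_ends([u'Hello', u'world'], u' ')    ->  [5, 11]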