Example #1
from itertools import izip

def create_sentences(list_of_lists, document, segstorage):
    '''Create sentence segments for the document and store them.'''
    plain_sentences = [u' '.join(lists.word) for lists in list_of_lists]
    segments = []
    starts, ends = compute_starts(plain_sentences, u' '), compute_ends(plain_sentences, u' ')
    for sentence, start, end in izip(plain_sentences, starts, ends):
        segments.append(Segment(u'sentence', sentence, document, start, end))
    segstorage.save(segments)
Example #2
def _create_sentence_segments(self, segmentstorage):
    '''Create segments that denote the sentences in the document and store them.'''
    starts = compute_starts(self._plain_sentences)
    ends = compute_ends(self._plain_sentences)
    segments = []
    for sentence, start, end in zip(self._plain_sentences, starts, ends):
        segments.append(Segment(u'sentence', sentence, self._document, start, end))
    segmentstorage.save(segments)
Example #3
def _process_line(self, line, doc_idx):
    '''Create a document from a single input line and store its token segments.'''
    tokens = line.split()
    # offsets assume the tokens are separated by single spaces in the line
    starts, ends = compute_starts(tokens, u" "), compute_ends(tokens, u" ")
    document = Document(self._doc_prefix + u":" + unicode(doc_idx), line, {})
    segments = []
    for tok, start, end in zip(tokens, starts, ends):
        segments.append(Segment(u"token", tok, document, start, end))
    self._docstorage.save(document)
    self._segstorage.save(segments)
Example #4
def _end_of_document(self, sentences):
    lists = Lists.concatenate(sentences)
    # create the document from the concatenated words
    doc_text = u' '.join(lists.word)
    document = Document(self._name_prefix + unicode(self._doc_idx), doc_text)
    self._documentstorage.save(document)
    self._doc_idx += 1

    # create word-level segments for each annotation layer
    starts, ends = compute_starts(lists.word, u' '), compute_ends(lists.word, u' ')
    create_segments(u'word', lists.word, starts, ends, document, self._segmentstorage)
    create_segments(u'lemma', lists.lemma, starts, ends, document, self._segmentstorage)
    create_segments(u'case', lists.case, starts, ends, document, self._segmentstorage)
    create_segments(u'pos', lists.pos, starts, ends, document, self._segmentstorage)
    create_ne_segments(lists.ne_type, starts, ends, document, self._segmentstorage)

    # create sentence segments
    create_sentences(sentences, document, self._segmentstorage)

    # clear the buffer in place so the caller's list is emptied too
    sentences[:] = []
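
Example #4 relies on two helpers this page does not show. Judging only from the call sites, `create_segments` appears to build one `Segment` per annotation value, reusing the shared word offsets; the sketch below is an assumption inferred from that usage, not the library's actual code, and `create_ne_segments` presumably does the same while merging consecutive tokens of one named entity into a single span.

def create_segments(segment_type, values, starts, ends, document, segmentstorage):
    # One Segment per annotated value, reusing the word-level offsets
    # computed in Example #4 (signature inferred from the call sites).
    segments = [Segment(segment_type, value, document, start, end)
                for value, start, end in zip(values, starts, ends)]
    segmentstorage.save(segments)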
Example #5
def _create_word_sentences(self):
    '''Parse the words from tokens.'''
    self._word_sentences = [[token['sone'].decode('unicode_escape', 'replace')
                             for token in sentence]
                            for sentence in self._token_sentences]
    # flatten the sentences into a single list of words
    self._words = reduce(lambda x, y: x + y, self._word_sentences)
    self._word_starts = compute_starts(self._words, u' ')
    self._word_ends = compute_ends(self._words, u' ')
Example #6
def test_zero_starts(self):
    self.assertEqual(compute_starts([]), [])
    self.assertEqual(compute_starts([], sep=u'   '), [])
Example #7
def test_starts(self):
    self.assertEqual(compute_starts(self.tokens(), self.sep()), self.starts())
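
None of the examples define `compute_starts` and `compute_ends` themselves. A minimal sketch consistent with every call site above (offsets into `sep.join(tokens)`, an empty list for empty input, and a `sep` keyword argument as in Example #6) might look like this; the default single-space separator and the exact signatures are assumptions:

def compute_starts(tokens, sep=u' '):
    # Start offset of each token inside sep.join(tokens).
    starts, pos = [], 0
    for token in tokens:
        starts.append(pos)
        pos += len(token) + len(sep)
    return starts

def compute_ends(tokens, sep=u' '):
    # End offset (exclusive) of each token inside sep.join(tokens).
    return [start + len(token)
            for start, token in zip(compute_starts(tokens, sep), tokens)]

For tokens [u'ab', u'c'] and the default separator this yields starts [0, 3] and ends [2, 4], matching the positions of the tokens in u'ab c'.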