Example #1
 def create_bundle(self):
     """Create a new bundle and add it at the end of the document."""
     self._highest_bundle_id += 1
     bundle = Bundle(document=self, bundle_id=str(self._highest_bundle_id))
     self.bundles.append(bundle)
     bundle.number = len(self.bundles)
     return bundle
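A minimal usage sketch for the method above, assuming the udapi-python package (Document lives in udapi.core.document) and that the internal id counter starts at 0:

 from udapi.core.document import Document

 doc = Document()
 first = doc.create_bundle()
 second = doc.create_bundle()
 print(first.bundle_id, first.number)    # 1 1
 print(second.bundle_id, second.number)  # 2 2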
Example #2
 def process_document(self, doc):
     """Tokenize, tag and/or parse each tree; resegmenting may add bundles."""
     # Assumes a module-level `from udapi.core.bundle import Bundle`.
     tok, tag, par = self.tokenize, self.tag, self.parse
     old_bundles = doc.bundles
     new_bundles = []
     for bundle in old_bundles:
         # Keep each original bundle exactly once, before any bundles
         # newly split off from its trees.
         new_bundles.append(bundle)
         for tree in bundle:
             if self._should_process_tree(tree):
                 if tok:
                     new_trees = self.tool.tokenize_tag_parse_tree(
                         tree,
                         resegment=self.resegment,
                         tag=tag,
                         parse=par)
                     if self.resegment and len(new_trees) > 1:
                         # The original tree keeps the first sentence, so
                         # its stored raw text no longer matches: clear it.
                         orig_bundle_id = bundle.bundle_id
                         bundle.bundle_id = orig_bundle_id + '-1'
                         tree.text = None
                         for i, new_tree in enumerate(new_trees[1:], 2):
                             new_bundle = Bundle(document=doc,
                                                 bundle_id=orig_bundle_id +
                                                 '-' + str(i))
                             new_tree.zone = tree.zone
                             new_bundle.add_tree(new_tree)
                             new_bundles.append(new_bundle)
                 elif tag and par:
                     # tok is known to be False in this branch.
                     self.tool.tag_parse_tree(tree)
                 else:
                     raise ValueError(
                         "Unimplemented tokenize=%s tag=%s parse=%s" %
                         (tok, tag, par))
     doc.bundles = new_bundles
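A hedged sketch of the bundle-id bookkeeping above, driven by hand instead of a UDPipe tool; it assumes Bundle is importable from udapi.core.bundle, as the direct construction in the code implies:

 from udapi.core.bundle import Bundle
 from udapi.core.document import Document

 doc = Document()
 bundle = doc.create_bundle()  # bundle_id "1", assuming the counter starts at 0
 orig_bundle_id = bundle.bundle_id
 bundle.bundle_id = orig_bundle_id + '-1'
 # One extra sentence split off from the original bundle, as in the loop above.
 new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-2')
 doc.bundles.append(new_bundle)
 print([b.bundle_id for b in doc.bundles])  # ['1-1', '1-2']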
Example #3
 def process_document(self, doc):
     """Split each untokenized tree's text into one bundle per sentence."""
     old_bundles = doc.bundles
     new_bundles = []
     for bundle in old_bundles:
         new_bundles.append(bundle)
         for tree in bundle:
             if self._should_process_tree(tree):
                 if tree.children:
                     raise ValueError("Segmenting already tokenized text is not supported.")
                 sentences = self.segment_string(tree.text)
                 if len(sentences) > 1:
                     # Rename the original bundle only when a split actually
                     # happens; it keeps the first sentence.
                     orig_bundle_id = bundle.bundle_id
                     bundle.bundle_id = orig_bundle_id + '-1'
                     tree.text = sentences[0]
                     for i, sentence in enumerate(sentences[1:], 2):
                         new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i))
                         new_bundle.create_tree(tree.zone).text = sentence
                         new_bundles.append(new_bundle)
     doc.bundles = new_bundles
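For context, a hedged sketch of what segment_string is expected to return: a list of sentence strings. The real splitting lives in a concrete segmenter subclass; this regex-based version is an illustration only:

 import re

 def segment_string(text):
     """Naive stand-in: split after sentence-final punctuation."""
     return [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]

 print(segment_string("Hello world. How are you?"))
 # ['Hello world.', 'How are you?']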