def genNETypeWordConcept(self, files, datasets, common_dataset,
                         default_dataset):
    """Yield (ne_type, text, states) triples for NE-typed words.

    Reads dialogue acts from `files`, generalizes them, and pairs each
    named-entity typed word with the semantic-tree states it maps to.

    :param files: input files handed to input.MultiReader
    :param datasets: datasets for input.InputGenerator
    :param common_dataset: dataset used by self.generalizer
    :param default_dataset: fallback dataset for input.InputGenerator
    """
    reader = input.MultiReader(files, input.DXMLReader)
    reader = input.InputGenerator(reader, datasets, default_dataset)
    reader = self.generalizer(reader, common_dataset)
    for da in reader:
        type_counts = ADict()

        # NOTE(review): the comprehension variable was named `g`, which
        # under Python 2 list-comprehension scoping leaked out and
        # shadowed the outer generator -- renamed to avoid the hazard.
        ne_typed = [(ne_type, ) + (gen, ) + w for ne_type, gen, w in zip(
            da['ne_typed'], da['generalized'], da['requested'])]

        filtered = self.filterNEtext(ne_typed)

        # Tally each non-None NE type (upper-cased) in this turn.
        for item in filtered:
            ne_type = item[0]
            if ne_type is not None:
                type_counts[ne_type.upper()] += 1

        semantics = da.get('semantics', '')
        tree = OrderedTree.fromString(semantics)
        tree_counts = tree.getConceptCounts()

        # An NE type conflicts when the number of typed words disagrees
        # with the number of matching concepts in the semantic tree.
        bad_ne = set()
        for concept, count in type_counts.iteritems():
            if tree_counts.get(concept, 0) != count:
                bad_ne.add(concept)

        ne_types = set(type_counts.keys()) - set(bad_ne)

        if not bad_ne:
            # Unambiguous: split the state vector per NE type and pair
            # each filtered word with its own slice of states.
            splits = tree.splitStateVector(*list(ne_types))
            for (ne_type, text), states in zip(filtered, splits):
                yield ne_type, text, states
        else:
            # Some conflicts or no named entities -- fall back to the
            # full state vector, narrowed per NE type when possible.
            states = tree.toStateVector()
            for (ne_type, text) in filtered:
                # NOTE(review): attribute `tree.conceptCounts` here vs
                # method `getConceptCounts()` above -- confirm both exist.
                if ne_type in tree.conceptCounts:
                    only_states = [i for i in states if ne_type in i]
                    yield ne_type, text, only_states
                else:
                    yield ne_type, text, states
    def genNETypeWordConcept(self, files, datasets, common_dataset, default_dataset):
        g = input.MultiReader(files, input.DXMLReader)
        g = input.InputGenerator(g, datasets, default_dataset)
        g = self.generalizer(g, common_dataset)
        for da in g:
            type_counts = ADict()

            ne_typed = [(ne_type,)+(g,)+w for ne_type, g, w in 
                            zip(da['ne_typed'], da['generalized'], da['requested'])]

            filtered = self.filterNEtext(ne_typed)

            for item in filtered:
                ne_type = item[0]
                if ne_type is not None:
                    type_counts[ne_type.upper()] += 1

            semantics = da.get('semantics', '')
            tree = OrderedTree.fromString(semantics)
            tree_counts = tree.getConceptCounts()
            bad_ne = set()
            for concept, count in type_counts.iteritems():
                if tree_counts.get(concept, 0) != count:
                    bad_ne.add(concept)

            ne_types = set(type_counts.keys()) - set(bad_ne)

            if not bad_ne:
                splits = tree.splitStateVector(*list(ne_types))
                for (ne_type, text), states in zip(filtered, splits):
                    yield ne_type, text, states
            else:
                # Some conflicts or no named entities
                states = tree.toStateVector()
                for (ne_type, text) in filtered:
                    if ne_type in tree.conceptCounts:
                        only_states = [i for i in states if ne_type in i]
                        yield ne_type, text, only_states
                    else:
                        yield ne_type, text, states
Exemplo n.º 3
0
 def flushLine(self):
     """Drain the buffered tokens into an OrderedTree.

     Drops a trailing separator, joins the accumulated pieces, empties
     this (list-like) buffer in place, and parses the joined text as a
     tree rooted at ROOT_CONCEPT.
     """
     self.removeSeparator()
     joined = ''.join(self)
     self[:] = []
     return OrderedTree.fromString(joined, label=ROOT_CONCEPT)
Exemplo n.º 4
0
 def mapFromMLF(cls, contents):
     """Build an OrderedTree from MLF content lines.

     :param contents: iterable of strings; each is stripped and the
         pieces are joined with single spaces before parsing.
     :return: OrderedTree parsed from the joined text, rooted at
         ROOT_CONCEPT.
     """
     # Renamed the local from `str`, which shadowed the builtin.
     joined = ' '.join(s.strip() for s in contents)
     return OrderedTree.fromString(joined, label=ROOT_CONCEPT)