def genNETypeWordConcept(self, files, datasets, common_dataset, default_dataset):
    """Yield ``(ne_type, text, states)`` triples for the items read from *files*.

    The input is read with ``input.MultiReader`` (using ``input.DXMLReader``),
    wrapped in ``input.InputGenerator`` and passed through
    ``self.generalizer``.  For every item the ``'ne_typed'``,
    ``'generalized'`` and ``'requested'`` sequences are merged element-wise
    and reduced by ``self.filterNEtext`` to ``(ne_type, text)`` pairs.

    The occurrences of each upper-cased, non-``None`` NE type are then
    compared with the concept counts of the tree parsed from the item's
    ``'semantics'`` string:

    * when every count agrees, the tree's state vector is split by the NE
      types and the pairs are zipped with the resulting splits;
    * otherwise every pair is yielded with the whole state vector, narrowed
      to the states containing ``ne_type`` when that type is present in
      ``tree.conceptCounts``.
    """
    reader = input.MultiReader(files, input.DXMLReader)
    source = input.InputGenerator(reader, datasets, default_dataset)
    generalized_source = self.generalizer(source, common_dataset)

    for da in generalized_source:
        # Glue NE type, generalized form and the requested tuple together.
        merged = [
            (kind, general) + requested
            for kind, general, requested in zip(
                da['ne_typed'], da['generalized'], da['requested'])
        ]
        filtered = self.filterNEtext(merged)

        # Count how often each (upper-cased) NE type occurs in the text.
        type_counts = ADict()
        for entry in filtered:
            label = entry[0]
            if label is None:
                continue
            type_counts[label.upper()] += 1

        tree = OrderedTree.fromString(da.get('semantics', ''))
        tree_counts = tree.getConceptCounts()

        # NE types whose count in the text differs from the count in the tree.
        bad_ne = set(
            concept
            for concept, count in type_counts.iteritems()
            if tree_counts.get(concept, 0) != count
        )
        ne_types = set(type_counts.keys()) - bad_ne

        if bad_ne:
            # At least one NE type disagrees with the tree: fall back to the
            # unsplit state vector.
            states = tree.toStateVector()
            for ne_type, text in filtered:
                if ne_type not in tree.conceptCounts:
                    yield ne_type, text, states
                    continue
                yield ne_type, text, [s for s in states if ne_type in s]
        else:
            # All counts agree: one split of the state vector per pair.
            splits = tree.splitStateVector(*ne_types)
            for (ne_type, text), states in zip(filtered, splits):
                yield ne_type, text, states
def genNETypeWordConcept(self, files, datasets, common_dataset, default_dataset):
    """Generate ``(ne_type, text, states)`` triples from the data in *files*.

    Items come from ``input.MultiReader`` / ``input.InputGenerator`` after
    being processed by ``self.generalizer``.  Per item, the ``'ne_typed'``,
    ``'generalized'`` and ``'requested'`` entries are combined position by
    position and handed to ``self.filterNEtext``, whose result is consumed
    as ``(ne_type, text)`` pairs.

    If the number of occurrences of every upper-cased NE type equals the
    matching concept count of the semantics tree, the pairs are zipped with
    ``tree.splitStateVector(...)``.  If any count differs, each pair is
    yielded with ``tree.toStateVector()``, restricted to the states that
    contain ``ne_type`` whenever ``ne_type`` is in ``tree.conceptCounts``.
    """
    stream = input.MultiReader(files, input.DXMLReader)
    stream = input.InputGenerator(stream, datasets, default_dataset)
    stream = self.generalizer(stream, common_dataset)

    for da in stream:
        columns = zip(da['ne_typed'], da['generalized'], da['requested'])
        combined = [(tag, gen_word) + rest for tag, gen_word, rest in columns]
        filtered = self.filterNEtext(combined)

        # Tally the non-None NE types, case-normalised to upper case.
        type_counts = ADict()
        for pair in filtered:
            tag = pair[0]
            if tag is None:
                continue
            type_counts[tag.upper()] += 1

        semantics = da.get('semantics', '')
        tree = OrderedTree.fromString(semantics)
        tree_counts = tree.getConceptCounts()

        # Collect the NE types whose tally does not match the tree.
        bad_ne = set(
            name
            for name, seen in type_counts.iteritems()
            if tree_counts.get(name, 0) != seen
        )
        ne_types = set(type_counts.keys()) - bad_ne

        if bad_ne:
            # Mismatch found: use the complete state vector for every pair.
            states = tree.toStateVector()
            for ne_type, text in filtered:
                if ne_type not in tree.conceptCounts:
                    yield ne_type, text, states
                    continue
                matching = [state for state in states if ne_type in state]
                yield ne_type, text, matching
        else:
            # Tallies match: pair each filtered entry with its own split.
            splits = tree.splitStateVector(*ne_types)
            for (ne_type, text), states in zip(filtered, splits):
                yield ne_type, text, states
def flushLine(self):
    """Turn the buffered fragments into a tree and empty the buffer.

    Calls ``self.removeSeparator()``, concatenates the items held in
    ``self``, clears ``self`` in place and returns the result of
    ``OrderedTree.fromString`` on the concatenated text, labelled with
    ``ROOT_CONCEPT``.
    """
    self.removeSeparator()
    # Snapshot the text before the buffer is cleared.
    line_text = ''.join(self)
    del self[:]
    tree = OrderedTree.fromString(line_text, label=ROOT_CONCEPT)
    return tree
def mapFromMLF(cls, contents):
    """Build a tree from the lines in *contents*.

    Each element of *contents* is stripped of surrounding whitespace, the
    pieces are joined with single spaces, and the resulting string is parsed
    by ``OrderedTree.fromString`` with ``ROOT_CONCEPT`` as the label.

    :param contents: iterable of strings
    :return: whatever ``OrderedTree.fromString`` returns for the joined text
    """
    # Named ``joined`` rather than ``str`` so the built-in type is not shadowed.
    joined = ' '.join(piece.strip() for piece in contents)
    return OrderedTree.fromString(joined, label=ROOT_CONCEPT)