def preprocess_xml(self, text):
    """Build a Document from an XML string.

    The text is parsed with ``ET.fromstring``. The value produced by
    ``self.concat_elements`` for the ``./text/p`` path becomes the single
    'content' component; the ``./title``, ``./headline`` and ``./byline``
    paths are stored as the 'title', 'headline' and 'author' metadata, and
    the root element's ``date`` attribute is stored as 'date'.

    Args:
        text: XML source to parse.

    Returns:
        The populated Document.

    Note:
        The ``date`` attribute is read with a plain subscript, so a root
        element without it makes the lookup fail (a ``KeyError`` if
        ``attrib`` is a regular mapping). Parse errors from
        ``ET.fromstring`` are not caught here.
    """
    result = Document()
    xml_root = ET.fromstring(text)

    body = self.concat_elements(xml_root, './text/p')
    result.add_component(DocumentComponent('content', body))

    # Metadata key -> element path, stored in this order.
    path_fields = (
        ('title', './title'),
        ('headline', './headline'),
        ('author', './byline'),
    )
    for key, path in path_fields:
        result.add_metadata(key, self.concat_elements(xml_root, path))

    result.add_metadata('date', xml_root.attrib['date'])
    return result
return True def retrieve_attribute(token): s = deque([token.lemma_]) cur = token while True: compound = next(filter(lambda x: x.dep_ == 'compound', cur.children), None) if compound is None or not is_valid_attribute_token(compound): break else: cur = compound s.appendleft(compound.lemma_) return " ".join(s) if __name__ == '__main__': vader = Vader() extractor = RuleBasedExtractor(vader) doc = Document() doc.add_component(DocumentComponent(type="text", text=input().strip())) doc = extractor.extract(doc) for ent in doc.entities: print(ent.text, [attr.text for attr in ent.attributes])
def preprocess_txt(self, text):
    """Wrap plain text in a new Document.

    Args:
        text: Value stored, unmodified, as the 'content' component.

    Returns:
        A Document holding a single 'content' component built from ``text``.
    """
    result = Document()
    content_part = DocumentComponent('content', text)
    result.add_component(content_part)
    return result