示例#1
0
    def preprocess_xml(self, text):
        """Parse an XML string and turn it into a ``Document``.

        The text gathered from ``./text/p`` becomes the document's
        ``'content'`` component.  The ``./title``, ``./headline`` and
        ``./byline`` paths are stored as the ``'title'``, ``'headline'`` and
        ``'author'`` metadata entries, and the root element's ``date``
        attribute is stored as ``'date'``.

        Args:
            text: XML source to parse.

        Returns:
            The populated ``Document``.

        Note:
            The ``date`` attribute is read by subscript, so a root element
            without it raises instead of yielding a default (a ``KeyError``
            if ``ET`` is ``xml.etree.ElementTree`` -- confirm against the
            file's imports).
        """
        document = Document()
        root = ET.fromstring(text)

        # Body: whatever concat_elements yields for the paragraph path.
        body = self.concat_elements(root, './text/p')
        document.add_component(DocumentComponent('content', body))

        # Metadata key -> element path handed to concat_elements.
        # Note that the 'author' entry is sourced from the byline element.
        metadata_paths = (
            ('title', './title'),
            ('headline', './headline'),
            ('author', './byline'),
        )
        for key, path in metadata_paths:
            document.add_metadata(key, self.concat_elements(root, path))

        # The date lives on the root element as an attribute, not a child.
        document.add_metadata('date', root.attrib['date'])

        return document
示例#2
0
    return True


def retrieve_attribute(token):
    """Return the lemmas of ``token`` and its chain of compound children.

    Starting at ``token``, the first child whose ``dep_`` equals
    ``'compound'`` is followed repeatedly.  The walk stops as soon as a node
    has no such child, or the child is rejected by
    ``is_valid_attribute_token``.  Each visited lemma is prepended, so the
    most deeply nested compound comes first and ``token``'s own lemma last.

    Args:
        token: Object exposing ``lemma_``, ``dep_`` and ``children``.

    Returns:
        The collected lemmas joined by single spaces.
    """
    lemmas = deque()
    node = token

    while True:
        # Prepend so that modifiers found later end up further to the left.
        lemmas.appendleft(node.lemma_)

        # Only the first 'compound' child is considered at each step.
        candidate = next(
            (child for child in node.children if child.dep_ == 'compound'),
            None,
        )

        if candidate is not None and is_valid_attribute_token(candidate):
            node = candidate
            continue

        return " ".join(lemmas)


if __name__ == '__main__':
    # Manual smoke test: read one line from stdin, run the rule-based
    # extractor over it, and print each entity with its attribute texts.
    analyzer = Vader()
    rule_extractor = RuleBasedExtractor(analyzer)

    # Wrap the stripped input line in a single "text" component.
    source_doc = Document()
    raw_line = input()
    component = DocumentComponent(type="text", text=raw_line.strip())
    source_doc.add_component(component)

    # extract() returns the document whose entities are reported below.
    extracted = rule_extractor.extract(source_doc)
    for entity in extracted.entities:
        attribute_texts = [attribute.text for attribute in entity.attributes]
        print(entity.text, attribute_texts)
示例#3
0
    def preprocess_txt(self, text):
        """Wrap plain text in a ``Document`` without further processing.

        Args:
            text: The raw text, used unchanged as the component payload.

        Returns:
            A new ``Document`` holding a single ``'content'`` component.
        """
        result = Document()
        body = DocumentComponent('content', text)
        result.add_component(body)
        return result