Example #1
    def to_triples(self,
                   doc,
                   content_fields=[
                       'author', 'language', 'issued', 'publisher', 'type',
                       'subject', 'description'
                   ]):
        triples = []
        for field in content_fields:
            if field in doc['content'] and doc['content'][field]:
                if field == 'author':
                    doc['content'][field] = self.format_author_name(
                        doc['content'][field])
                triples.append((Entity(doc['docid']), Entity(field),
                                Entity(doc['content'][field])))
        return triples
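A minimal, self-contained sketch of the same field-to-triple idea, runnable on its own; the namedtuple Entity stub, the fields_to_triples helper and the sample document are illustrative assumptions, not part of the original class:

from collections import namedtuple

# Illustrative stand-in for the project's Entity class (assumption).
Entity = namedtuple('Entity', ['label'])


def fields_to_triples(doc, content_fields=('author', 'language', 'publisher')):
    # Emit one (document, field, value) triple per non-empty metadata field.
    triples = []
    for field in content_fields:
        value = doc['content'].get(field)
        if value:
            triples.append((Entity(doc['docid']), Entity(field), Entity(value)))
    return triples


doc = {'docid': 'd1',
       'content': {'author': 'Jane Doe', 'language': 'en', 'publisher': None}}
print(fields_to_triples(doc))
# [(Entity(label='d1'), Entity(label='author'), Entity(label='Jane Doe')),
#  (Entity(label='d1'), Entity(label='language'), Entity(label='en'))]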
Example #2
    def to_triples(self, entity, html):
        soup = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('a'))

        triples = []
        for link in soup:
            if link.has_attr('relation'):
                triples.append((self.to_wikipedia_entity(entity),
                                Entity(link['relation']),
                                self.to_wikipedia_entity(link['title'])))

        return triples
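A hedged, standalone sketch of the same link-scanning pattern, run against inline sample HTML; the relation and title attributes mirror what the method above looks for, and the printed pairs stand in for the Entity objects it would build (requires beautifulsoup4):

from bs4 import BeautifulSoup, SoupStrainer

html = '''
<p>See <a href="/wiki/Graph" relation="related_to" title="Graph">graphs</a> and
<a href="/wiki/Set" title="Set">sets</a>.</p>
'''

# parse_only keeps just the <a> elements, so iterating the soup yields link tags only.
soup = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('a'))

for link in soup:
    if link.has_attr('relation'):
        print(link['relation'], '->', link['title'])
# related_to -> Graph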
Example #3
    def load_to_postgres(self, conn, doc):
        # Load entities and relations (knowledge base)
        for (e1, _, e2) in doc.triples:
            logger.debug("%s -[related_to]-> %s" % (e1.label, e2.label))
            source_vertex_id = self.get_or_create_entity_vertex(
                conn, e1, doc_id=doc.doc_id)
            target_vertex_id = self.get_or_create_entity_vertex(conn, e2)
            self.create_edge_postgres(conn, self.next_edge_id, 'related_to',
                                      source_vertex_id, target_vertex_id)
            self.next_edge_id += 1
            metadata = {'name': e1.label}
            if e1.uri:
                metadata['url'] = e1.uri
            # yield Document(doc_id=doc.doc_id, metadata=metadata)
            # The yield above was removed: entities never carry a doc_id of their own (only the
            # document they come from does), so we only yield the document itself at the end.

        tokens = GraphOfEntityBatch.analyze(doc.text)

        for i in range(len(tokens) - 1):
            # Load words, linking by sequential co-occurrence
            logger.debug("%s -[before]-> %s" % (tokens[i], tokens[i + 1]))
            source_vertex_id = self.get_or_create_term_vertex(conn, tokens[i])
            target_vertex_id = self.get_or_create_term_vertex(
                conn, tokens[i + 1])
            self.create_edge_postgres(conn, self.next_edge_id, 'before',
                                      source_vertex_id, target_vertex_id,
                                      {'doc_id': doc.doc_id})
            self.next_edge_id += 1

        doc_entity_labels = set([])
        for e1, _, e2 in doc.triples:
            doc_entity_labels.add(e1.label.lower())
            doc_entity_labels.add(e2.label.lower())

        # Order does not matter here, so a second pass over the unique tokens and
        # the document's unique entity labels is enough.
        for token in set(tokens):
            # Load word-entity occurrence
            source_vertex_id = self.get_or_create_term_vertex(conn, token)
            for entity_label in doc_entity_labels:
                if re.search(r'\b%s\b' % re.escape(token), entity_label):
                    logger.debug("%s -[contained_in]-> %s" %
                                 (token, entity_label))
                    entity_vertex_id = self.get_or_create_entity_vertex(
                        conn, Entity(entity_label))
                    self.create_edge_postgres(conn, self.next_edge_id,
                                              'contained_in', source_vertex_id,
                                              entity_vertex_id)

        conn.commit()

        yield doc
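The term-level part of the loader reduces to two small operations: linking each token to its successor ('before' edges) and checking whether a token occurs as a whole word inside an entity label ('contained_in' edges). A standalone sketch of both, with plain tuples standing in for the Postgres vertices and edges created above:

import re


def before_edges(tokens, doc_id):
    # Link consecutive tokens by a 'before' edge, as in the sequential co-occurrence step above.
    return [(tokens[i], 'before', tokens[i + 1], {'doc_id': doc_id})
            for i in range(len(tokens) - 1)]


def contained_in(token, entity_label):
    # Whole-word containment check used for the 'contained_in' edges above.
    return re.search(r'\b%s\b' % re.escape(token), entity_label) is not None


print(before_edges(['graph', 'of', 'entity'], doc_id='d1'))
# [('graph', 'before', 'of', {'doc_id': 'd1'}), ('of', 'before', 'entity', {'doc_id': 'd1'})]
print(contained_in('graph', 'graph of entity'))   # True
print(contained_in('graphs', 'graph of entity'))  # False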
Example #4
    def build_triples(self, doc):
        entities = set([])
        triples = set([])

        if not self.include_ae_doc_profile and not self.include_dbpedia:
            return list(entities), list(triples)

        if self.ac_ner is None:
            self.ac_ner = AhoCorasickEntityExtractor(
                "/opt/army-ant/gazetteers/all.txt")

        doc_id = doc['id']
        text = self.to_plain_text(doc, limit=3)
        entities = entities.union(
            Entity(entity_name) for entity_name in self.ac_ner.extract(text))
        entities.add(self.to_washington_post_author_entity(doc['author']))

        if self.include_ae_doc_profile:
            doc_features = self.features.loc[doc['id']]
            for feature_name in doc_features.index:
                if feature_name == 'Keywords':
                    # feature_values = sorted(self.parse_feature_array(
                    #     doc_features[feature_name]), key=lambda kw: kw[1])
                    # feature_values = [k for k, _ in feature_values[0:5]]
                    feature_values = self.parse_feature_array(
                        doc_features[feature_name], dicretized_version=True)
                elif feature_name == 'ReadingComplexity':
                    feature_values = [
                        doc_features[feature_name].split('~', 1)[0]
                    ]
                elif feature_name == 'EmotionCategories':
                    # feature_values = sorted(self.parse_feature_array(
                    #     doc_features[feature_name]), key=lambda kw: kw[1])
                    # feature_values = [k for k, w in feature_values if w >= 0.5]
                    feature_values = self.parse_feature_array(
                        doc_features[feature_name], dicretized_version=True)
                else:
                    feature_values = [doc_features[feature_name]]

                if feature_values is None or len(feature_values) == 0:
                    continue

                for feature_value in feature_values:
                    entities.add(Entity(feature_value))

        if self.include_dbpedia:
            logger.debug(
                "Fetching DBpedia triples for %d entities in document %s" %
                (len(entities), doc_id))

            max_retries = 10

            retries_left = max_retries
            retry_wait = 0

            while True:
                try:
                    dbpedia_triples = list(
                        fetch_dbpedia_triples(
                            [entity.label for entity in entities]))
                    break
                except Exception:
                    if retries_left > 0:
                        retry_wait += 10 * (max_retries - retries_left + 1)
                        logger.exception(
                            "Error retrieving triples for %d entities in document %s, retrying in %d seconds"
                            " (%d retries left)" %
                            (len(entities), doc_id, retry_wait, retries_left))
                        retries_left -= 1
                        time.sleep(retry_wait)
                    else:
                        logger.exception(
                            "Could not retrieve triples for %d entities in document %s, giving up (returning "
                            "%d cached triples)" %
                            (len(entities), doc_id, len(triples)))
                        dbpedia_triples = []
                        break

            for (s, sl), (p, pl), (o, ol) in dbpedia_triples:
                triples.add((Entity(sl, s), Entity(pl, p), Entity(ol, o)))

        return list(entities), list(triples)
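The DBpedia fetch above is wrapped in a retry loop whose wait grows with each failure (10 s, then 30 s, then 60 s, ...). A generic, hedged sketch of that pattern; call_with_retries is an illustrative helper, not a project function:

import time
import logging

logger = logging.getLogger(__name__)


def call_with_retries(fn, max_retries=10):
    # Retry fn(), sleeping 10 s after the first failure, 30 s after the second,
    # 60 s after the third, and so on; returns None once all retries are exhausted.
    retries_left = max_retries
    retry_wait = 0
    while True:
        try:
            return fn()
        except Exception:
            if retries_left <= 0:
                logger.exception("Giving up after %d retries", max_retries)
                return None
            retry_wait += 10 * (max_retries - retries_left + 1)
            logger.exception("Retrying in %d seconds (%d retries left)",
                             retry_wait, retries_left)
            retries_left -= 1
            time.sleep(retry_wait)

In the method above this would wrap the fetch_dbpedia_triples call, with a None result mapped to an empty list of triples.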
Example #5
    def to_washington_post_author_entity(self, author_name):
        return Entity(
            author_name, 'https://www.washingtonpost.com/people/%s' %
            (author_name.lower().replace(' ', '-')))
Example #6
    def to_wikipedia_entity(self, entity):
        return Entity(
            entity,
            "http://en.wikipedia.org/wiki/%s" % entity.replace(' ', '_'))
Example #7
    def build_triples(self, page_id, title, bdy):
        links = set([])
        entities = set([])
        triples = set([])

        for link in bdy.xpath('//link'):
            href = get_first(
                link.xpath(
                    '@xlink:href',
                    namespaces={'xlink': 'http://www.w3.org/1999/xlink'}))
            if href is None:
                continue

            link_match = self.href_to_page_id_re.match(href)
            if link_match:
                links.add(link_match.group(1))

            related_id = inex.xlink_to_page_id(href)

            link_text = get_first(link.xpath('text()'))
            if link_text and len(link_text) < 3:
                link_text = None

            related_title = self.title_index.get(related_id, link_text)
            if related_title is None:
                continue
            related_title = related_title.replace('\n', ' ').strip()

            subj = self.to_wikipedia_entity(page_id, title)
            pred = Entity('related_to')
            obj = self.to_wikipedia_entity(related_id, related_title)

            entities.add(subj)
            entities.add(obj)
            triples.add((subj, pred, obj))

        if self.include_dbpedia:
            logger.debug(
                "Fetching DBpedia triples for %d entities in document %s" %
                (len(entities), page_id))

            max_retries = 10

            retries_left = max_retries
            retry_wait = 0

            while True:
                try:
                    dbpedia_triples = list(
                        fetch_dbpedia_triples(
                            [entity.label for entity in entities]))
                    break
                except Exception:
                    if retries_left > 0:
                        retry_wait += 10 * (max_retries - retries_left + 1)
                        logger.exception(
                            "Error retrieving triples for %d entities in document %s, retrying in %d seconds"
                            " (%d retries left)" %
                            (len(entities), page_id, retry_wait, retries_left))
                        retries_left -= 1
                        time.sleep(retry_wait)
                    else:
                        logger.exception(
                            "Could not retrieve triples for %d entities in document %s, giving up (returning "
                            "%d cached triples)" %
                            (len(entities), page_id, len(triples)))
                        dbpedia_triples = []
                        break

            for (s, sl), (p, pl), (o, ol) in dbpedia_triples:
                triples.add((Entity(sl, s), Entity(pl, p), Entity(ol, o)))

        return list(links), list(entities), list(triples)
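A compact, runnable sketch of the xlink-based link extraction above, on a minimal inline XML body (requires lxml). The page-id regex and the sample href format are assumptions for illustration, standing in for the project's href_to_page_id_re and inex.xlink_to_page_id, and next(iter(...), None) plays the role of get_first:

import re
from lxml import etree

XLINK_NS = {'xlink': 'http://www.w3.org/1999/xlink'}
page_id_re = re.compile(r'.*/(\d+)\.xml$')  # assumed href format, illustrative only

bdy = etree.fromstring(
    '<bdy xmlns:xlink="http://www.w3.org/1999/xlink">'
    '<p><link xlink:href="../123/12345.xml">Information retrieval</link></p>'
    '</bdy>')

for link in bdy.xpath('//link'):
    href = next(iter(link.xpath('@xlink:href', namespaces=XLINK_NS)), None)
    if href is None:
        continue
    match = page_id_re.match(href)
    related_id = match.group(1) if match else None
    link_text = next(iter(link.xpath('text()')), None)
    print(related_id, link_text)
# 12345 Information retrieval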
Example #8
    def to_wikipedia_entity(self, page_id, label):
        # return Entity(label, "https://en.wikipedia.org/wiki/%s" % label.replace(' ', '_'))
        # return Entity(label, "https://en.wikipedia.org/?curid=%s" % page_id)

        # This is the required option for the evaluation module to work
        return Entity(label, "WP%s" % page_id)
    def to_wikipedia_entity(self, label):
        return Entity(
            label, "http://en.wikipedia.org/wiki/%s" % label.replace(" ", "_"))