def to_triples(self, doc, content_fields=None):
    # Avoid a mutable default argument; fall back to the standard metadata fields.
    if content_fields is None:
        content_fields = [
            'author', 'language', 'issued', 'publisher', 'type', 'subject',
            'description'
        ]

    triples = []
    for field in content_fields:
        if field in doc['content'] and doc['content'][field]:
            if field == 'author':
                doc['content'][field] = self.format_author_name(
                    doc['content'][field])
            triples.append((Entity(doc['docid']), Entity(field),
                            Entity(doc['content'][field])))
    return triples
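# Usage sketch (not part of the original source; names and values are
# illustrative). Shows the field-to-triple mapping above on a minimal fake
# document: falsy fields are skipped and the author name is normalized first.
#
#   doc = {'docid': 'd1',
#          'content': {'author': 'Doe, Jane', 'language': 'en', 'issued': None}}
#   reader.to_triples(doc)
#   # -> [(Entity('d1'), Entity('author'), Entity(format_author_name('Doe, Jane'))),
#   #     (Entity('d1'), Entity('language'), Entity('en'))]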
def to_triples(self, entity, html):
    soup = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('a'))

    triples = []
    for link in soup:
        if link.has_attr('relation'):
            triples.append((self.to_wikipedia_entity(entity),
                            Entity(link['relation']),
                            self.to_wikipedia_entity(link['title'])))
    return triples
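# Minimal, self-contained sketch (not from the original source) of the link
# extraction pattern above: parse only <a> tags and keep those carrying a
# 'relation' attribute.
from bs4 import BeautifulSoup, SoupStrainer

def example_extract_relations(html):
    soup = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('a'))
    return [(a['relation'], a.get('title')) for a in soup.find_all('a')
            if a.has_attr('relation')]

# example_extract_relations('<a relation="spouse" title="Ada Lovelace">Ada</a>')
# -> [('spouse', 'Ada Lovelace')]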
def load_to_postgres(self, conn, doc):
    # Load entities and relations (knowledge base)
    for (e1, _, e2) in doc.triples:
        logger.debug("%s -[related_to]-> %s" % (e1.label, e2.label))

        source_vertex_id = self.get_or_create_entity_vertex(
            conn, e1, doc_id=doc.doc_id)
        target_vertex_id = self.get_or_create_entity_vertex(conn, e2)

        self.create_edge_postgres(conn, self.next_edge_id, 'related_to',
                                  source_vertex_id, target_vertex_id)
        self.next_edge_id += 1

        metadata = {'name': e1.label}
        if e1.uri:
            metadata['url'] = e1.uri

        # yield Document(doc_id=doc.doc_id, metadata=metadata)
        # We're only indexing what has a doc_id / XXX this was wrong, because
        # entities never have a doc_id, unless they come from a doc, so just
        # return doc, right?

    tokens = GraphOfEntityBatch.analyze(doc.text)

    # Load words, linking by sequential co-occurrence
    for i in range(len(tokens) - 1):
        logger.debug("%s -[before]-> %s" % (tokens[i], tokens[i + 1]))

        source_vertex_id = self.get_or_create_term_vertex(conn, tokens[i])
        target_vertex_id = self.get_or_create_term_vertex(conn, tokens[i + 1])

        self.create_edge_postgres(conn, self.next_edge_id, 'before',
                                  source_vertex_id, target_vertex_id,
                                  {'doc_id': doc.doc_id})
        self.next_edge_id += 1

    doc_entity_labels = set([])
    for e1, _, e2 in doc.triples:
        doc_entity_labels.add(e1.label.lower())
        doc_entity_labels.add(e2.label.lower())

    # Load word-entity occurrence. Order does not matter: a second loop over
    # unique tokens and entities should help.
    for token in set(tokens):
        source_vertex_id = self.get_or_create_term_vertex(conn, token)

        for entity_label in doc_entity_labels:
            if re.search(r'\b%s\b' % re.escape(token), entity_label):
                logger.debug("%s -[contained_in]-> %s" %
                             (token, entity_label))

                entity_vertex_id = self.get_or_create_entity_vertex(
                    conn, Entity(entity_label))
                self.create_edge_postgres(conn, self.next_edge_id,
                                          'contained_in', source_vertex_id,
                                          entity_vertex_id)
                self.next_edge_id += 1  # keep edge IDs unique across edge types

    conn.commit()

    yield doc
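# Minimal sketch (not part of the original source) of the two graph-building
# steps above: consecutive tokens are linked by a 'before' edge, and unique
# tokens are linked to entity labels that contain them as a whole word.
import re

def example_graph_edges(tokens, entity_labels):
    before = [(tokens[i], 'before', tokens[i + 1])
              for i in range(len(tokens) - 1)]
    # sorted() only makes the example deterministic; the loader iterates a set.
    contained_in = [(t, 'contained_in', label) for t in sorted(set(tokens))
                    for label in entity_labels
                    if re.search(r'\b%s\b' % re.escape(t), label)]
    return before, contained_in

# example_graph_edges(['graph', 'of', 'entity'], ['entity linking'])
# -> ([('graph', 'before', 'of'), ('of', 'before', 'entity')],
#     [('entity', 'contained_in', 'entity linking')])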
def build_triples(self, doc):
    entities = set([])
    triples = set([])

    if not self.include_ae_doc_profile and not self.include_dbpedia:
        return list(entities), list(triples)

    if self.ac_ner is None:
        self.ac_ner = AhoCorasickEntityExtractor(
            "/opt/army-ant/gazetteers/all.txt")

    doc_id = doc['id']
    text = self.to_plain_text(doc, limit=3)

    entities = entities.union(
        Entity(entity_name) for entity_name in self.ac_ner.extract(text))
    entities.add(self.to_washington_post_author_entity(doc['author']))

    if self.include_ae_doc_profile:
        doc_features = self.features.loc[doc['id']]

        for feature_name in doc_features.index:
            if feature_name == 'Keywords':
                # feature_values = sorted(self.parse_feature_array(
                #     doc_features[feature_name]), key=lambda kw: kw[1])
                # feature_values = [k for k, _ in feature_values[0:5]]
                feature_values = self.parse_feature_array(
                    doc_features[feature_name], dicretized_version=True)
            elif feature_name == 'ReadingComplexity':
                feature_values = [
                    doc_features[feature_name].split('~', 1)[0]
                ]
            elif feature_name == 'EmotionCategories':
                # feature_values = sorted(self.parse_feature_array(
                #     doc_features[feature_name]), key=lambda kw: kw[1])
                # feature_values = [k for k, w in feature_values if w >= 0.5]
                feature_values = self.parse_feature_array(
                    doc_features[feature_name], dicretized_version=True)
            else:
                feature_values = [doc_features[feature_name]]

            if feature_values is None or len(feature_values) == 0:
                continue

            for feature_value in feature_values:
                entities.add(Entity(feature_value))

    if self.include_dbpedia:
        logger.debug(
            "Fetching DBpedia triples for %d entities in document %s" %
            (len(entities), doc_id))

        max_retries = 10
        retries_left = max_retries
        retry_wait = 0

        while True:
            try:
                dbpedia_triples = list(
                    fetch_dbpedia_triples(
                        [entity.label for entity in entities]))
                break
            except Exception:
                if retries_left > 0:
                    retry_wait += 10 * (max_retries - retries_left + 1)
                    logger.exception(
                        "Error retrieving triples for %d entities in document %s, retrying in %d seconds"
                        " (%d retries left)" %
                        (len(entities), doc_id, retry_wait, retries_left))
                    retries_left -= 1
                    time.sleep(retry_wait)
                else:
                    logger.exception(
                        "Could not retrieve triples for %d entities in document %s, giving up (returning "
                        "%d cached triples)" %
                        (len(entities), doc_id, len(triples)))
                    dbpedia_triples = []
                    break

        for (s, sl), (p, pl), (o, ol) in dbpedia_triples:
            triples.add((Entity(sl, s), Entity(pl, p), Entity(ol, o)))

    return list(entities), list(triples)
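# Sketch (not from the original source) of the retry schedule used above:
# each failed attempt adds 10 * attempt_number seconds to the previous wait,
# giving cumulative waits of 10, 30, 60, 100, 150, ... seconds before giving up.
def example_retry_schedule(max_retries=10):
    retry_wait = 0
    waits = []
    for attempt in range(1, max_retries + 1):
        retry_wait += 10 * attempt
        waits.append(retry_wait)
    return waits

# example_retry_schedule(5) -> [10, 30, 60, 100, 150]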
def to_washington_post_author_entity(self, author_name):
    return Entity(
        author_name, 'https://www.washingtonpost.com/people/%s' %
        author_name.lower().replace(' ', '-'))
def to_wikipedia_entity(self, entity):
    return Entity(
        entity, "http://en.wikipedia.org/wiki/%s" % entity.replace(' ', '_'))
def build_triples(self, page_id, title, bdy):
    links = set([])
    entities = set([])
    triples = set([])

    for link in bdy.xpath('//link'):
        href = get_first(
            link.xpath(
                '@xlink:href',
                namespaces={'xlink': 'http://www.w3.org/1999/xlink'}))
        if href is None:
            continue

        link_match = self.href_to_page_id_re.match(href)
        if link_match:
            links.add(link_match.group(1))

        related_id = inex.xlink_to_page_id(href)

        link_text = get_first(link.xpath('text()'))
        if link_text and len(link_text) < 3:
            link_text = None

        related_title = self.title_index.get(related_id, link_text)
        if related_title is None:
            continue
        related_title = related_title.replace('\n', ' ').strip()

        subj = self.to_wikipedia_entity(page_id, title)
        pred = Entity('related_to')
        obj = self.to_wikipedia_entity(related_id, related_title)

        entities.add(subj)
        entities.add(obj)
        triples.add((subj, pred, obj))

    if self.include_dbpedia:
        logger.debug(
            "Fetching DBpedia triples for %d entities in document %s" %
            (len(entities), page_id))

        max_retries = 10
        retries_left = max_retries
        retry_wait = 0

        while True:
            try:
                dbpedia_triples = list(
                    fetch_dbpedia_triples(
                        [entity.label for entity in entities]))
                break
            except Exception:
                if retries_left > 0:
                    retry_wait += 10 * (max_retries - retries_left + 1)
                    logger.exception(
                        "Error retrieving triples for %d entities in document %s, retrying in %d seconds"
                        " (%d retries left)" %
                        (len(entities), page_id, retry_wait, retries_left))
                    retries_left -= 1
                    time.sleep(retry_wait)
                else:
                    logger.exception(
                        "Could not retrieve triples for %d entities in document %s, giving up (returning "
                        "%d cached triples)" %
                        (len(entities), page_id, len(triples)))
                    dbpedia_triples = []
                    break

        for (s, sl), (p, pl), (o, ol) in dbpedia_triples:
            triples.add((Entity(sl, s), Entity(pl, p), Entity(ol, o)))

    return list(links), list(entities), list(triples)
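# Note on the DBpedia result format assumed above (not part of the original
# source; the values are illustrative): each row is a triple of (uri, label)
# pairs, which is unpacked into a triple of Entity(label, uri) objects.
#
#   row = (('dbr:Lisbon', 'Lisbon'),
#          ('dbo:country', 'country'),
#          ('dbr:Portugal', 'Portugal'))
#   (s, sl), (p, pl), (o, ol) = row
#   triples.add((Entity(sl, s), Entity(pl, p), Entity(ol, o)))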
def to_wikipedia_entity(self, page_id, label):
    # return Entity(label, "https://en.wikipedia.org/wiki/%s" % label.replace(' ', '_'))
    # return Entity(label, "https://en.wikipedia.org/?curid=%s" % page_id)
    # This is the required option for the evaluation module to work
    return Entity(label, "WP%s" % page_id)
def to_wikipedia_entity(self, label):
    return Entity(
        label, "http://en.wikipedia.org/wiki/%s" % label.replace(" ", "_"))