def _align_section_entity_types(df: pd.DataFrame) -> pd.DataFrame:
    """Align the types of section entities to the most common entity type aggregated by top-section."""
    section_types = {}
    for ts, s_df in df.groupby('TS_text'):
        section_ents = set(s_df['S_ent'].unique())
        # count how often each transitive type occurs among the entities of the top-section
        type_counter = defaultdict(int)
        for s_ent in section_ents:
            for t in dbp_store.get_transitive_types(dbp_util.name2resource(str(s_ent))):
                type_counter[t] += 1
        # pick one of the most frequent independent types and assign it to all entities that have it
        top_types = dbp_store.get_independent_types({t for t, cnt in type_counter.items() if cnt == max(type_counter.values())})
        if top_types:
            top_type = list(top_types)[0]
            section_types.update({(ts, se): dbp_util.type2name(top_type) for se in section_ents if top_type in dbp_store.get_transitive_types(dbp_util.name2resource(str(se)))})
    section_types = pd.Series(section_types, name='S_enttype_new')
    df = pd.merge(how='left', left=df, right=section_types, left_on=['TS_text', 'S_ent'], right_index=True)
    df['S_enttype_new'].fillna(df['S_enttype'], inplace=True)
    return df.drop(columns='S_enttype').rename(columns={'S_enttype_new': 'S_enttype'})
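
# Illustrative sketch (not part of the codebase; data and column values are made up):
# the merge above works because pandas turns the tuple keys (TS_text, S_ent) into a
# MultiIndex, which can then be joined against the two corresponding columns.
import pandas as pd

example_df = pd.DataFrame({'TS_text': ['History', 'History', 'Squads'],
                           'S_ent': ['Rome', 'Carthage', 'Italy'],
                           'S_enttype': ['Settlement', 'Settlement', 'Country']})
aligned = pd.Series({('History', 'Rome'): 'City', ('History', 'Carthage'): 'City'}, name='S_enttype_new')
example_df = pd.merge(how='left', left=example_df, right=aligned, left_on=['TS_text', 'S_ent'], right_index=True)
example_df['S_enttype_new'] = example_df['S_enttype_new'].fillna(example_df['S_enttype'])
print(example_df)  # the 'Squads' row keeps 'Country', the 'History' rows are aligned to 'City'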
def _get_resource_surface_scores(text):
    """Return resource lexicalisation scores for the given text."""
    resource_surface_scores = {}
    if not text:
        return resource_surface_scores
    # the text itself is always a perfect match
    resource_surface_scores[text] = 1
    # a resource whose name matches the text directly is a perfect match as well
    direct_match = dbp_store.resolve_redirect(dbp_util.name2resource(text))
    if direct_match in dbp_store.get_resources():
        resource_surface_scores[direct_match] = 1
    # add resources that the text is a known lexicalisation of, together with their frequencies
    for surface_match, frequency in sorted(dbp_store.get_inverse_lexicalisations(text.lower()).items(), key=operator.itemgetter(1)):
        resource_surface_scores[surface_match] = frequency
    return resource_surface_scores
def get_object_for_label(label: str) -> str:
    """Return the object that fits the given label."""
    global __RESOURCE_INVERSE_LABELS__
    global __ONTOLOGY_INVERSE_LABELS__
    if '__RESOURCE_INVERSE_LABELS__' not in globals():
        # lazily initialise the inverse label indices for resources and ontology classes
        __RESOURCE_INVERSE_LABELS__ = {v: k for k, v in _get_label_mapping().items()}
        ontology_labels = rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_LABEL)
        __ONTOLOGY_INVERSE_LABELS__ = {v: k for k, v in ontology_labels.items()}
    if label in __ONTOLOGY_INVERSE_LABELS__:
        return __ONTOLOGY_INVERSE_LABELS__[label]
    if label in __RESOURCE_INVERSE_LABELS__:
        return __RESOURCE_INVERSE_LABELS__[label]
    return dbp_util.name2resource(label)
def get_entity_for_wikilink(wikilink: wtp.WikiLink) -> Optional[str]:
    if not wikilink.target:
        return None
    link_target = _remove_language_tag(wikilink.target.strip())
    resource_uri = dbp_util.name2resource(str_util.capitalize(link_target))
    redirected_uri = dbp_store.resolve_spelling_redirect(resource_uri)
    if dbp_store.is_possible_resource(redirected_uri) and '#' not in redirected_uri:
        # use the redirected URI only if it is a Wikipedia article in its own right and does not point to an article section
        final_uri = redirected_uri
    else:
        final_uri = resource_uri
    return dbp_util.resource2name(final_uri)
def _assign_entity_types_for_section(df: pd.DataFrame, section_entity: str) -> pd.DataFrame:
    """Retrieve the types of section entities."""
    section_types = {}
    for ent in df[section_entity].unique():
        types = dbp_store.get_independent_types(dbp_store.get_types(dbp_util.name2resource(str(ent))))
        if types:
            section_types[ent] = dbp_util.type2name(list(types)[0])
    section_types = pd.Series(section_types, name=f'{section_entity}type')
    return pd.merge(how='left', left=df, right=section_types, left_on=section_entity, right_index=True)
def get_resource_provenance(self, resource: str) -> set:
    """Return provenance information of a resource (i.e. which categories and lists have been used to extract it)."""
    if not self._resource_provenance:
        for node in self.nodes:
            for cat in self.get_category_parts(node):
                for res in cat_store.get_resources(cat):
                    self._resource_provenance[clg_util.dbp_resource2clg_resource(res)].add(cat)
        if self.use_listing_resources:
            for res, res_data in listing.get_page_entities(self).items():
                self._resource_provenance[clg_util.name2clg_resource(res)].update({dbp_util.name2resource(o) for o in res_data['origins']})
    return self._resource_provenance[resource]
def get_resources_from_listings(self, node: str) -> set:
    if not self._node_listing_resources:
        for res, res_data in listing.get_page_entities(self).items():
            # an entity is attached to nodes derived from its types as well as to nodes derived from its origins
            res_nodes = {clg_util.name2clg_type(t) for t in res_data['types']}
            res_nodes.update({n for o in res_data['origins'] for n in self.get_nodes_for_part(dbp_util.name2resource(o))})
            res_uri = clg_util.name2clg_resource(res)
            for n in res_nodes:
                self._node_listing_resources[n].add(res_uri)
    return self._node_listing_resources[node]
def _retrieve_training_data_wle(nlp: Language):
    listpages = list_store.get_parsed_listpages(wikipedia.ARTICLE_TYPE_ENUM)
    lp_to_cat_mapping = {lp: list_mapping.get_equivalent_categories(lp) | list_mapping.get_parent_categories(lp) for lp in listpages}
    lp_to_cat_mapping = {lp: cats for lp, cats in lp_to_cat_mapping.items() if cats}

    training_data = []
    # extract entities
    for lp, cats in lp_to_cat_mapping.items():
        lp_data = listpages[lp]
        for section_data in lp_data['sections']:
            for enum_data in section_data['enums']:
                for entry_data in enum_data:
                    text = entry_data['text']
                    if not text:
                        continue
                    entities = entry_data['entities']
                    if not entities:
                        continue
                    valid_entities = []
                    for entity_data in entities:
                        entity_uri = dbp_util.name2resource(entity_data['name'])
                        entity_tag = _get_tag_for_types(dbp_store.get_types(entity_uri))
                        if not entity_tag:
                            continue
                        entity_text = entity_data['text']
                        start = int(entity_data['idx'])
                        end = start + len(entity_text)
                        if end > len(text) or text[start:end] != entity_text:
                            continue
                        valid_entities.append((start, end, entity_tag))
                    # use an entry only if all of its linked entities could be validated
                    if len(entities) == len(valid_entities):
                        training_data.append(Example.from_dict(nlp.make_doc(text), {'entities': valid_entities}))
    return training_data
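
# Illustrative sketch (not part of the codebase; text, offsets, and the 'GPE' tag are made up):
# how a single training example is built with spaCy's Example API from a text and
# character-level entity offsets like the (start, end, tag) triples collected above.
import spacy
from spacy.training import Example

example_nlp = spacy.blank('en')
example_text = 'Paris is the capital of France.'
example_entities = [(0, 5, 'GPE'), (24, 30, 'GPE')]
example = Example.from_dict(example_nlp.make_doc(example_text), {'entities': example_entities})
print(example.reference.ents)  # (Paris, France)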
def _extract_axioms(patterns: dict) -> set:
    """Return the axioms extracted by applying the patterns to Wikipedia categories."""
    axioms = {}

    for cat, (sub, pred, subcats) in patterns.items():
        if pred:  # simple mapping of label to predicate (case 1)
            if pred.lower() in predicate_names:
                axioms[cat] = (sub, predicate_names[pred.lower()], subcats)
        else:  # voting required to discover Z (case 2)
            predicate_counts = defaultdict(int)
            for subcat, value in subcats.items():
                value = normalize_val(value)
                for res in cat_store.get_resources(subcat):
                    for pred, values in dbp_store.get_properties(res).items():
                        normalized_values = {normalize_val(val) for val in values}
                        if value in normalized_values:
                            predicate_counts[pred] += 1
            if predicate_counts:
                pred = max(predicate_counts.items(), key=operator.itemgetter(1))[0]
                axioms[cat] = (sub, pred, subcats)

    # map values to DBpedia resources if necessary (only possible if we have an object property)
    valid_axioms = {}
    for cat in axioms:
        _, pred, subcats = axioms[cat]
        if dbp_store.is_object_property(pred):
            for subcat, obj in subcats.items():
                obj_uri = dbp_util.name2resource(obj)
                if obj_uri in dbp_store.get_resources():
                    if cat in valid_axioms:
                        valid_axioms[cat][1][subcat] = obj_uri
                    else:
                        valid_axioms[cat] = (pred, {subcat: obj_uri})
        else:
            valid_axioms[cat] = (pred, subcats)

    return {(cat, pred, val) for pred, cat_vals in valid_axioms.values() for cat, val in cat_vals.items()}
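
# Illustrative sketch (not part of the codebase; predicate names and counts are made up):
# the voting step in case 2 boils down to counting how often each predicate explains a
# subcategory value and keeping the predicate with the highest count.
import operator
from collections import defaultdict

predicate_counts = defaultdict(int)
for matched_pred in ['dbo:birthPlace', 'dbo:deathPlace', 'dbo:birthPlace']:
    predicate_counts[matched_pred] += 1
best_pred = max(predicate_counts.items(), key=operator.itemgetter(1))[0]
print(best_pred)  # dbo:birthPlace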
def _apply_rules(pattern_dict: dict, cat: str) -> set:
    """Apply rules from `pattern_dict` and return the implied axioms."""
    cat_words = cat_store.get_label(cat).split(' ')

    axiom_patterns, pattern_lengths = _detect_pattern(pattern_dict, cat_words)
    if not axiom_patterns:
        return set()

    (pred, pred_type), additional_axioms = axiom_patterns
    front_pattern_idx = pattern_lengths[0] or None
    back_pattern_idx = -1 * pattern_lengths[1] or None

    # the resource is whatever remains of the category label after stripping the front and back patterns
    resource = ' '.join(cat_words[front_pattern_idx:back_pattern_idx])
    if pred_type:
        resource = dbp_util.name2resource(resource)
        if resource not in dbp_store.get_resources() or pred_type not in dbp_store.get_transitive_types(resource):
            return set()
    return {(cat, pred, resource)} | {(cat, pred, val) for pred, val in additional_axioms}
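
# Illustrative sketch (not part of the codebase; the label and pattern lengths are made up):
# the `or None` idiom above maps a pattern length of 0 to None, so that slicing does not
# cut anything off on that side (a back length of 0 must not become the slice index -0).
cat_words = ['Songs', 'written', 'by', 'Bob', 'Dylan']
front_len, back_len = 3, 0
front_idx = front_len or None        # 3
back_idx = -1 * back_len or None     # None
print(' '.join(cat_words[front_idx:back_idx]))  # 'Bob Dylan'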
def _assign_pagetypes(df: pd.DataFrame) -> pd.DataFrame:
    """Assign (most basic and most specific) page types to the existing dataframe."""
    data = []
    for page_name in df['P'].unique():
        if page_name.startswith('List of'):
            data.append((page_name, 'List', 'List'))
            continue
        page_uri = dbp_util.name2resource(page_name)
        P_types = dbp_store.get_independent_types(dbp_store.get_types(page_uri))
        if not P_types:
            data.append((page_name, 'Other', 'Other'))
            continue
        P_type = sorted(P_types)[0]
        P_basetype = _get_basetype(P_type)
        data.append((page_name, dbp_util.type2name(P_type), dbp_util.type2name(P_basetype)))
    return pd.merge(left=df, right=pd.DataFrame(data, columns=['P', 'P_type', 'P_basetype']), on='P')
def _compute_labeled_entities_for_listpage(page_uri: str, page_data: dict, graph) -> tuple:
    positive_SEs, negative_SEs = dict(), set()
    # compute potential subject entities for list page
    page_potential_SEs = {dbp_util.resource2name(res) for cat in _get_category_descendants_for_list(page_uri) for res in cat_store.get_resources(cat)}
    # compute types of list page
    page_types = {t for n in graph.get_nodes_for_part(page_uri) for t in dbp_store.get_independent_types(graph.get_transitive_dbpedia_types(n))}
    page_disjoint_types = {dt for t in page_types for dt in dbp_heur.get_disjoint_types(t)}
    # collect all linked entities on the page
    page_entities = {ent['name'] for s in page_data['sections'] for enum in s['enums'] for entry in enum for ent in entry['entities']}
    page_entities.update({ent['name'] for s in page_data['sections'] for table in s['tables'] for row in table['data'] for cell in row for ent in cell['entities']})
    for ent in page_entities:
        ent_uri = dbp_util.name2resource(ent)
        if not dbp_store.is_possible_resource(ent_uri):
            negative_SEs.add(ent)
        elif ent in page_potential_SEs:
            positive_SEs[ent] = _compute_entity_label(ent_uri)
        elif page_disjoint_types.intersection(dbp_store.get_types(ent_uri)):
            negative_SEs.add(ent)
    return positive_SEs, negative_SEs
def extract_page_entities(graph) -> dict:
    utils.get_logger().info('LISTING/EXTRACT: Extracting types and relations for page entities..')
    page_entities = defaultdict(lambda: {'labels': set(), 'origins': set(), 'types': set(), 'in': set(), 'out': set()})

    df = context.retrieve_page_entity_context(graph)

    # extract list page entities
    utils.get_logger().info('LISTING/EXTRACT: Extracting types of list page entities..')
    df_lps = df[df['P_type'] == 'List']
    for lp, df_lp in df_lps.groupby(by='P'):
        clg_types = {clg_util.clg_type2name(t) for t in graph.get_nodes_for_part(dbp_util.name2resource(lp))}
        if clg_types:
            for _, row in df_lp.iterrows():
                name = row['E_ent']
                page_entities[name]['labels'].add(row['E_text'])
                page_entities[name]['origins'].add(lp)
                page_entities[name]['types'].update(clg_types)
    df = df.loc[df['P_type'] != 'List']  # ignore list pages in subsequent steps

    # compute valid combinations of types and NE tags
    df_types = context.get_entity_types(df, graph)
    dft = pd.merge(left=df, right=df_types, on='E_ent')
    valid_tags = context.get_valid_tags_for_entity_types(dft, graph, utils.get_config('listing.valid_tag_threshold'))

    # extract types
    utils.get_logger().info('LISTING/EXTRACT: Extracting types of page entities..')
    df_new_types = _compute_new_types(df, dft, df_types, valid_tags)
    for ent, df_ent in df_new_types.groupby(by='E_ent'):
        page_entities[ent]['labels'].update(set(df_ent['E_text'].unique()))
        page_entities[ent]['origins'].update(_get_origins_for_entity(df_ent))
        new_types = set(df_ent['E_enttype'].unique())
        transitive_types = {clg_util.clg_type2name(tt) for t in new_types for tt in graph.ancestors(clg_util.name2clg_type(t))}
        new_types = new_types.difference(transitive_types)  # remove transitive types
        page_entities[ent]['types'].update(new_types)

    # extract relations
    utils.get_logger().info('LISTING/EXTRACT: Extracting relations of page entities..')
    df_rels = context.get_entity_relations()
    df_new_relations = _compute_new_relations(df, df_rels, 'P', valid_tags)
    df_new_relations = pd.concat([df_new_relations, _compute_new_relations(df, df_rels, 'TS_ent', valid_tags)])
    df_new_relations = pd.concat([df_new_relations, _compute_new_relations(df, df_rels, 'S_ent', valid_tags)])
    for ent, df_ent in df_new_relations.groupby(by='E_ent'):
        page_entities[ent]['labels'].update(set(df_ent['E_text'].unique()))
        page_entities[ent]['origins'].update(_get_origins_for_entity(df_ent))
        rels_in = set(map(tuple, df_ent[~df_ent['inv']][['pred', 'target']].values))
        page_entities[ent]['in'].update(rels_in)
        rels_out = set(map(tuple, df_ent[df_ent['inv']][['pred', 'target']].values))
        page_entities[ent]['out'].update(rels_out)

    return dict(page_entities)
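
# Illustrative sketch (not part of the codebase; entity names and values are made up):
# the per-entity record above is created lazily by the defaultdict factory, and the final
# dict() call drops the lambda factory (which, for instance, could not be pickled).
from collections import defaultdict

example_entities = defaultdict(lambda: {'labels': set(), 'origins': set(), 'types': set(), 'in': set(), 'out': set()})
example_entities['Neil Armstrong']['types'].add('Astronaut')
example_entities['Neil Armstrong']['out'].add(('mission', 'Apollo 11'))
result = dict(example_entities)
print(result['Neil Armstrong']['types'])  # {'Astronaut'}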
def _parse_raw_markup_from_xml() -> dict:
    utils.get_logger().info('WIKIPEDIA/XML: Parsing raw markup from XML dump..')
    parser = etree.XMLParser(target=WikiPageParser())
    with bz2.open(utils.get_data_file('files.wikipedia.pages')) as dbp_pages_file:
        page_markup = etree.parse(dbp_pages_file, parser)
        return {dbp_util.name2resource(p): markup for p, markup in page_markup.items()}
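
# Illustrative sketch (not part of the codebase; the TitleCollector target and XML snippet
# are made up): with lxml, a parser target receives start/end/data events and parsing
# returns whatever the target's close() returns, which is presumably how WikiPageParser
# hands back its page-to-markup mapping above.
from lxml import etree

class TitleCollector:
    def __init__(self):
        self.titles, self._in_title, self._buf = [], False, []
    def start(self, tag, attrib):
        if tag == 'title':
            self._in_title, self._buf = True, []
    def data(self, text):
        if self._in_title:
            self._buf.append(text)
    def end(self, tag):
        if tag == 'title':
            self.titles.append(''.join(self._buf))
            self._in_title = False
    def close(self):
        return self.titles

example_parser = etree.XMLParser(target=TitleCollector())
titles = etree.fromstring('<pages><page><title>Berlin</title></page></pages>', example_parser)
print(titles)  # ['Berlin']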