def _find_Y(X: Span, subcat_uri: str):
    """Return Y if the category follows one of the patterns 'YX' or 'X <prep> Y'."""
    subcat_label = cat_store.get_label(subcat_uri)
    if X.text.lower() not in subcat_label.lower():
        return None

    subcat = nlp_util.parse(subcat_label)
    subcat_lower = subcat.text.lower()
    x_lower = X.text.lower()

    if subcat_lower.endswith(' ' + x_lower):  # "YX"
        # reject if X covers the whole label, or the word before X is a preposition
        if len(X) >= len(subcat):
            return None
        if subcat[-(len(X) + 1)].pos_ == 'ADP':
            return None
        return subcat[:-len(X)]

    if subcat_lower.startswith(x_lower + ' '):  # "X <prep> Y"
        # require exactly one preposition so Y is unambiguous
        adp_indices = [w.i for w in subcat if w.pos_ == 'ADP']
        if len(adp_indices) != 1:
            return None
        adp_index = adp_indices[0]
        candidate = subcat[adp_index + 1:]
        if subcat[adp_index].text == 'by':
            # "X by Y" categories are often mere partitionings — apply heuristics to filter them out
            childcats = cat_store.get_children(subcat_uri)
            resources = cat_store.get_resources(subcat_uri)
            predicate_labels = {dbp_store.get_label(pred) for res in resources for pred in dbp_store.get_properties(res)}
            if len(childcats) * 10 >= len(resources):
                return None
            if any(candidate.text.lower() in p for p in predicate_labels):
                return None
        return candidate

    return None
def _extract_axioms(category_graph, patterns):
    """Return axioms extracted from `category_graph` by applying `patterns` to all categories."""
    utils.get_logger().debug('CATEGORY/CAT2AX: Extracting axioms..')

    # process front/back/front+back patterns individually to reduce computational complexity
    # NOTE: `bp`/`ap` are bound as lambda defaults to avoid the late-binding closure
    # pitfall — otherwise every stored lambda would see the last loop iteration's values.
    front_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in _get_confidence_pattern_set(patterns, True, False).items():
        _fill_dict(front_pattern_dict, list(front_pattern),
                   lambda d, bp=back_pattern, ap=axiom_patterns: _fill_dict(d, list(reversed(bp)), ap))

    back_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in _get_confidence_pattern_set(patterns, False, True).items():
        _fill_dict(back_pattern_dict, list(front_pattern),
                   lambda d, bp=back_pattern, ap=axiom_patterns: _fill_dict(d, list(reversed(bp)), ap))

    enclosing_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in _get_confidence_pattern_set(patterns, True, True).items():
        _fill_dict(enclosing_pattern_dict, list(front_pattern),
                   lambda d, bp=back_pattern, ap=axiom_patterns: _fill_dict(d, list(reversed(bp)), ap))

    # prepare one self-contained context tuple per category so the work can be distributed to worker processes
    cat_contexts = [(
        cat,
        nlp_util.remove_by_phrase(cat_store.get_label(cat)),
        cat_store.get_statistics(cat),
        front_pattern_dict,
        back_pattern_dict,
        enclosing_pattern_dict
    ) for cat in category_graph.content_nodes]

    with mp.Pool(processes=utils.get_config('max_cpus')) as pool:
        category_axioms = {cat: axioms for cat, axioms in tqdm(pool.imap_unordered(_extract_axioms_for_cat, cat_contexts, chunksize=1000), total=len(cat_contexts), desc='CATEGORY/CAT2AX: Extracting axioms')}
    category_axioms = {cat: axioms for cat, axioms in category_axioms.items() if axioms}  # filter out empty axioms

    utils.get_logger().debug(f'CATEGORY/CAT2AX: Extracted {sum(len(axioms) for axioms in category_axioms.values())} axioms for {len(category_axioms)} categories.')
    return category_axioms
def make_conceptual(self):
    """Remove all nodes that are non-conceptual (i.e. that do not represent a class in a taxonomy)."""
    labels = [cat_store.get_label(cat) for cat in self.nodes]
    plural_flags = nlp_util.has_plural_lexhead_subjects(labels)
    conceptual_categories = {cat for cat, has_plural_lexhead in zip(self.nodes, plural_flags) if has_plural_lexhead}
    # clearing the graph of any invalid nodes (the root node is always kept)
    self._remove_all_nodes_except(conceptual_categories | {self.root_node})
    return self
def _compute_category_sets() -> dict:
    """Iterate over DBpedia categories and identify all category sets.

    1) Retrieve all usable categories (i.e. categories that are not used for maintenance/organisational purposes)
    2) Normalize their names by removing by-phrases (e.g. "X by genre", "Y by country")
    3) For each category, retrieve all its children and search for name patterns (see '_find_child_sets')
    """
    category_sets = {}
    for cat in cat_store.get_categories():
        # map each child to its by-phrase-free parsed label
        children_docs = {}
        for child in cat_store.get_children(cat):
            children_docs[child] = nlp_util.remove_by_phrase(cat_store.get_label(child))
        child_sets = _find_child_sets(cat, children_docs)
        if child_sets:
            category_sets[cat] = child_sets
    return category_sets
def _apply_rules(pattern_dict: dict, cat: str) -> set:
    """Apply rules from `pattern_dict` and return the implied axioms."""
    cat_words = cat_store.get_label(cat).split(' ')

    axiom_patterns, pattern_lengths = _detect_pattern(pattern_dict, cat_words)
    if not axiom_patterns:
        return set()

    (pred, pred_type), additional_axioms = axiom_patterns
    # slice the matched pattern words off the front/back; `or None` keeps the
    # full span when the respective pattern length is 0
    front_pattern_idx = pattern_lengths[0] or None
    back_pattern_idx = -1 * pattern_lengths[1] or None
    resource = ' '.join(cat_words[front_pattern_idx:back_pattern_idx])

    if pred_type:
        resource = dbp_util.name2resource(resource)
        # only keep typed axioms whose resource exists and actually has the expected type
        if resource not in dbp_store.get_resources() or pred_type not in dbp_store.get_transitive_types(resource):
            return set()
    # loop variables renamed so they no longer shadow `pred` from the main axiom
    return {(cat, pred, resource)} | {(cat, add_pred, add_val) for add_pred, add_val in additional_axioms}
def parse_category(category: str) -> Doc:
    """Return the category name as parsed Doc."""
    return nlp_util.parse(cat_store.get_label(category))
def _build_cdf_pattern_dict(cdf_patterns: dict) -> dict:
    """Build a nested lookup dict from (front, back) word patterns to their axiom patterns."""
    pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in cdf_patterns.items():
        # `bp`/`ap` bound as defaults to avoid late-binding closure issues
        _fill_dict(pattern_dict, list(front_pattern),
                   lambda d, bp=back_pattern, ap=axiom_patterns: _fill_dict(d, list(reversed(bp)), ap))
    return pattern_dict


def _extract_axioms_with_rules(cat_dfs: dict) -> set:
    """Return axioms generated by applying C-DF rules."""
    # generate rule candidates by extracting shared pre-/postfixes
    cdf_rule_candidates = defaultdict(lambda: defaultdict(int))
    for cat, (df, _) in cat_dfs.items():
        cat_label = cat_store.get_label(cat)
        for f in {f for f in df if f[0] != rdf_util.PREDICATE_TYPE}:
            if dbp_util.is_dbp_resource(f[1]):
                label_mapping = dbp_store._get_label_mapping()
                f_label = label_mapping[f[1]] if f[1] in label_mapping else dbp_util.object2name(f[1])
            else:
                f_label = f[1]
            if f_label in cat_label:
                # words of the category label before/after the fact's label form the pattern
                first_words = cat_label[:cat_label.index(f_label)].strip()
                first_words = tuple(first_words.split(' ')) if first_words else tuple()
                last_words = cat_label[cat_label.index(f_label) + len(f_label):].strip()
                last_words = tuple(last_words.split(' ')) if last_words else tuple()
                if first_words or last_words:
                    f_types = dbp_store.get_independent_types(dbp_store.get_types(f[1])) if dbp_util.is_dbp_resource(f[1]) else set()
                    f_type = f_types.pop() if f_types else None
                    cdf_rule_candidates[(first_words, last_words)][((f[0], f_type), tuple(set(df).difference({f})))] += 1

    # filter rules using the threshold parameters min_support and beta
    cdf_rules = {}
    min_support = util.get_config('cdf.min_support')
    beta = util.get_config('cdf.beta')
    for word_patterns, candidate_counts in cdf_rule_candidates.items():
        total_support = sum(candidate_counts.values())
        valid_axiom_patterns = [pattern for pattern, support in candidate_counts.items()
                                if support >= min_support and (support / total_support) >= beta]
        if len(valid_axiom_patterns) > 0:
            cdf_rules[word_patterns] = valid_axiom_patterns[0]

    # apply the patterns to all categories in order to extract axioms
    # (the rules are applied individually depending on whether the pattern is at the front, back, or front+back in order to reduce computational complexity)
    cdf_front_patterns = {word_patterns: axiom_pattern for word_patterns, axiom_pattern in cdf_rules.items()
                          if word_patterns[0] and not word_patterns[1]}
    cdf_back_patterns = {word_patterns: axiom_pattern for word_patterns, axiom_pattern in cdf_rules.items()
                         if not word_patterns[0] and word_patterns[1]}
    cdf_enclosing_patterns = {word_patterns: axiom_pattern for word_patterns, axiom_pattern in cdf_rules.items()
                              if word_patterns[0] and word_patterns[1]}
    cdf_front_pattern_dict = _build_cdf_pattern_dict(cdf_front_patterns)
    cdf_back_pattern_dict = _build_cdf_pattern_dict(cdf_back_patterns)
    cdf_enclosing_pattern_dict = _build_cdf_pattern_dict(cdf_enclosing_patterns)

    rule_axioms = set()
    for cat in cat_store.get_usable_cats():
        rule_axioms.update(_apply_rules(cdf_front_pattern_dict, cat))
        rule_axioms.update(_apply_rules(cdf_back_pattern_dict, cat))
        rule_axioms.update(_apply_rules(cdf_enclosing_pattern_dict, cat))
    return rule_axioms
def _get_match_for_category(category: str, first_words: tuple, last_words: tuple) -> str:
    """Return variable part of the category name."""
    doc = nlp_util.remove_by_phrase(cat_store.get_label(category))
    # strip the fixed pattern words from both ends; what remains is the match
    start = len(first_words)
    end = len(doc) - len(last_words)
    return doc[start:end].text