def find_qualifiers(self, fes):
    """
    Finds all FEs that could serve as qualifiers instead of full statements

    :param list fes: frame elements produced by the classifier; each is a
     dict with at least a 'fe' key and, depending on the type, 'literal',
     'link' and 'chunk' keys
    :returns: mapping from Wikidata qualifier property ID to list of values
    :rtype: defaultdict
    """
    qualifiers = defaultdict(list)
    for fe in fes:
        if fe['fe'] == 'Time':
            literal = fe['literal']
            value = wikidata.format_date(**literal)
            qualifiers['P585'].append(value)  # P585 = point in time
        elif fe['fe'] == 'Duration':
            literal = fe['literal']
            if 'start' in literal:
                value = wikidata.format_date(**literal['start'])
                qualifiers['P580'].append(value)  # P580 = start time
            if 'end' in literal:
                value = wikidata.format_date(**literal['end'])
                # fix: the end of a duration is P582 (end time);
                # it was previously appended under P580 (start time)
                qualifiers['P582'].append(value)
        elif fe['fe'] == 'Place':
            value = None
            if 'link' in fe:
                value = wikidata.wikidata_id_from_wikipedia_url(fe['link']['uri'])
            if not value:
                # fall back to resolving the raw text chunk
                value = wikidata.resolve('P276', fe['chunk'], self.language)
            if value:
                qualifiers['P276'].append(value)  # P276 = location
    return qualifiers
def find_qualifiers(self, fes):
    """
    Finds all FEs that could serve as qualifiers instead of full statements

    :param list fes: frame elements produced by the classifier; each is a
     dict with at least a "fe" key and, depending on the type, "literal",
     "link" and "chunk" keys
    :returns: mapping from Wikidata qualifier property ID to list of values
    :rtype: defaultdict
    """
    qualifiers = defaultdict(list)
    for fe in fes:
        if fe["fe"] == "Time":
            literal = fe["literal"]
            value = wikidata.format_date(**literal)
            qualifiers["P585"].append(value)  # P585 = point in time
        elif fe["fe"] == "Duration":
            literal = fe["literal"]
            if "start" in literal:
                value = wikidata.format_date(**literal["start"])
                qualifiers["P580"].append(value)  # P580 = start time
            if "end" in literal:
                value = wikidata.format_date(**literal["end"])
                # fix: the end of a duration is P582 (end time);
                # it was previously appended under P580 (start time)
                qualifiers["P582"].append(value)
        elif fe["fe"] == "Place":
            value = None
            if "link" in fe:
                value = wikidata.wikidata_id_from_wikipedia_url(fe["link"]["uri"])
            if not value:
                # fall back to resolving the raw text chunk
                value = wikidata.resolve("P276", fe["chunk"], self.language)
            if value:
                qualifiers["P276"].append(value)  # P276 = location
    return qualifiers
def to_statements(self, data, input_encoded=True):
    """
    Converts the classification results into quick statements

    :param data: Data from the classifier. Can be either str or dict
    :param bool input_encoded: Whether data is a str or a dict
    :returns: Tuples <success, item> where item is a statement if success
     is true else it is a named entity which could not be resolved
    :type: generator
    """
    data = json.loads(data) if input_encoded else data

    url = data.get('url')
    if not url:
        # fix: logger.warn is a deprecated alias of logger.warning
        logger.warning('skipping item without url')
        return

    for name, subj in self.get_subjects(data):
        if not subj:
            logger.warning("Could not resolve Wikidata Item ID of subject '%s'", name)
            yield False, {'chunk': name,
                          'additional': {'sentence': data['text'], 'url': url}}
            continue

        all_qualifiers = self.find_qualifiers(data['fes'])
        for fe in data['fes']:
            if fe['chunk'] == name:
                # do not add a statement for the current subject
                continue

            if fe['fe'] in ['Time', 'Duration']:
                for each in self.serialize_numerical(subj, fe, data):
                    yield True, each
            else:
                prop = self.lu_fe_map.get((data['lu'], fe['fe']), {}).get('wid')
                if not prop:
                    logger.debug('unknown fe type %s for LU %s, skipping',
                                 fe['fe'], data['lu'])
                    continue

                # fix: default to an empty list so a FE without a 'link'
                # (or a link without 'types') does not raise a TypeError
                # when iterating None
                chunk_types = set(t[len('http://dbpedia.org/ontology/'):]
                                  for t in fe.get('link', {}).get('types', []))
                fe_types = self.lu_fe_map.get((data['lu'], fe['fe']), {}).get('types', set())
                if fe_types and chunk_types and not fe_types & chunk_types:
                    logger.debug('skipping chunk "%s" of fe %s because types do not match, '
                                 'expected: %s actual %s',
                                 fe['chunk'], fe['fe'], fe_types, chunk_types)
                    continue

                val = None
                if 'link' in fe:
                    uri = fe['link']['uri']
                    val = wikidata.wikidata_id_from_wikipedia_url(uri)
                if not val:
                    val = wikidata.resolve(prop, fe['chunk'], self.language)
                if not val:
                    # fallback placeholder item used when nothing resolves
                    val = 'Q19798648'
                    logger.debug('could not resolve chunk "%s" of fe %s (property is %s), '
                                 'using default value of %s',
                                 fe['chunk'], fe['fe'], prop, val)

                # NOTE(review): extend() flattens into [prop, value, prop, value, ...];
                # confirm finalize_statement expects a flat list rather than pairs
                stmt_qualifiers = []
                for qualifier_property in self.lu_fe_map.get((data['lu'], fe['fe']),
                                                             {}).get('qualifiers', []):
                    for qualifier_value in all_qualifiers.get(qualifier_property, []):
                        stmt_qualifiers.extend((qualifier_property, qualifier_value))

                yield True, wikidata.finalize_statement(
                    subj, prop, val, self.language, url,
                    qualifiers=stmt_qualifiers, resolve_property=False,
                    resolve_value=False
                )
def to_statements(self, data, input_encoded=True):
    """
    Converts the classification results into quick statements

    :param data: Data from the classifier. Can be either str or dict
    :param bool input_encoded: Whether data is a str or a dict
    :returns: Tuples <success, item> where item is a statement if success
     is true else it is a named entity which could not be resolved
    :type: generator
    """
    data = json.loads(data) if input_encoded else data

    url = data.get("url")
    if not url:
        # fix: logger.warn is a deprecated alias of logger.warning
        logger.warning("skipping item without url")
        return

    for name, subj in self.get_subjects(data):
        if not subj:
            logger.warning("Could not resolve Wikidata Item ID of subject '%s'", name)
            yield False, {"chunk": name,
                          "additional": {"sentence": data["text"], "url": url}}
            continue

        all_qualifiers = self.find_qualifiers(data["fes"])
        for fe in data["fes"]:
            if fe["chunk"] == name:
                # do not add a statement for the current subject
                continue

            if fe["fe"] in ["Time", "Duration"]:
                for each in self.serialize_numerical(subj, fe, data):
                    yield True, each
            else:
                prop = self.lu_fe_map.get((data["lu"], fe["fe"]), {}).get("wid")
                if not prop:
                    logger.debug("unknown fe type %s for LU %s, skipping",
                                 fe["fe"], data["lu"])
                    continue

                # fix: default to an empty list so a FE without a "link"
                # (or a link without "types") does not raise a TypeError
                # when iterating None
                chunk_types = set(t[len("http://dbpedia.org/ontology/"):]
                                  for t in fe.get("link", {}).get("types", []))
                fe_types = self.lu_fe_map.get((data["lu"], fe["fe"]), {}).get("types", set())
                if fe_types and chunk_types and not fe_types & chunk_types:
                    logger.debug(
                        'skipping chunk "%s" of fe %s because types do not match, '
                        "expected: %s actual %s",
                        fe["chunk"], fe["fe"], fe_types, chunk_types,
                    )
                    continue

                val = None
                if "link" in fe:
                    uri = fe["link"]["uri"]
                    val = wikidata.wikidata_id_from_wikipedia_url(uri)
                if not val:
                    val = wikidata.resolve(prop, fe["chunk"], self.language)
                if not val:
                    # fallback placeholder item used when nothing resolves
                    val = "Q19798648"
                    logger.debug(
                        'could not resolve chunk "%s" of fe %s (property is %s), '
                        "using default value of %s",
                        fe["chunk"], fe["fe"], prop, val,
                    )

                # NOTE(review): extend() flattens into [prop, value, prop, value, ...];
                # confirm finalize_statement expects a flat list rather than pairs
                stmt_qualifiers = []
                for qualifier_property in self.lu_fe_map.get((data["lu"], fe["fe"]),
                                                             {}).get("qualifiers", []):
                    for qualifier_value in all_qualifiers.get(qualifier_property, []):
                        stmt_qualifiers.extend((qualifier_property, qualifier_value))

                yield True, wikidata.finalize_statement(
                    subj, prop, val, self.language, url,
                    qualifiers=stmt_qualifiers, resolve_property=False,
                    resolve_value=False,
                )