def serialize_numerical(self, subj, fe, url):
    """ Serializes a numerical FE found by the normalizer

        :param subj: Wikidata item ID of the statement subject
        :param dict fe: frame element, with type under 'fe' and the
         normalized value under 'literal'
        :param url: source URL, attached to the statement as reference
        :returns: generator of serialized statements
    """
    literal = fe['literal']
    if fe['fe'] == 'Time':
        # P585 = point in time
        value = wikidata.format_date(**literal)
        yield wikidata.finalize_statement(subj, 'P585', value, self.language, url,
                                          resolve_property=False, resolve_value=False)
    elif fe['fe'] == 'Duration':
        if 'start' in literal:
            # P580 = start time
            value = wikidata.format_date(**literal['start'])
            yield wikidata.finalize_statement(subj, 'P580', value, self.language, url,
                                              resolve_property=False, resolve_value=False)
        if 'end' in literal:
            # BUG FIX: the end of a duration must be emitted as P582 (end time);
            # the original copy-pasted P580 (start time) for both endpoints
            value = wikidata.format_date(**literal['end'])
            yield wikidata.finalize_statement(subj, 'P582', value, self.language, url,
                                              resolve_property=False, resolve_value=False)
def process_corpus(self, items, output_file, dump_unresolved_file=None, genealogics=None, processes=0):
    """ Serializes the given items in parallel and writes the resulting
        statements to `output_file`.

        :param items: iterable of corpus items to serialize
        :param output_file: binary file-like object statements are written to
        :param dump_unresolved_file: optional file receiving one JSON line per
         item that could not be resolved
        :param genealogics: when truthy, collect a genealogics URL -> subject map
        :param int processes: number of worker processes for parallel.map
        :returns: tuple (genealogics_url_to_id, produced, unresolved)
    """
    produced = 0
    unresolved = 0
    genealogics_url_to_id = {}

    results = parallel.map(self.serialize_item, items, processes, flatten=True)
    for success, item in results:
        if not success:
            unresolved += 1
            if dump_unresolved_file:
                dump_unresolved_file.write(json.dumps(item))
                dump_unresolved_file.write('\n')
            continue

        subj, prop, val, url = item
        statement = wikidata.finalize_statement(
            subj, prop, val, self.language, url,
            resolve_property=False, resolve_value=False
        )
        if not statement:
            continue

        output_file.write(statement.encode('utf8'))
        output_file.write('\n')

        # remember genealogics subjects for the second resolution pass
        if genealogics and url.startswith('http://www.genealogics.org/'):
            genealogics_url_to_id[url] = subj

        produced += 1
        if produced % 10000 == 0:
            logger.info('Produced %d statements so far, skipped %d names', produced, unresolved)

    logger.info('Produced %d statements so far, skipped %d names', produced, unresolved)
    return genealogics_url_to_id, produced, unresolved
def process_row(data):
    """ Resolves one input row to Wikidata and builds its statements.

        :param dict data: row mapping column names to raw (utf8-encoded) values;
         must contain the 'emergenza' subject column
        :returns: list of serialized statements, or None when the subject
         could not be resolved
    """
    resolved = defaultdict(list)
    for column, cell in data.iteritems():
        wid = COLUMN_TO_PROPERTY.get(column)
        if not wid:
            continue
        value = wikidata.resolve(wid, cell.decode('utf8'), 'it')
        if value:
            resolved[wid].append(value)

    # the resolved values double as hints for resolving the subject itself
    hints = dict(resolved)
    subject = wikidata.resolver_with_hints('ddd', data['emergenza'], 'it', **hints)
    if not subject:
        logger.warn('could not find the wikidata id of "%s"' % data['emergenza'])
        return None

    statements = []
    for prop, values in resolved.iteritems():
        stmt = wikidata.finalize_statement(subject, prop, values, 'it',
                                           resolve_property=False, resolve_value=False)
        if stmt is not None:
            statements.append(stmt)
    return statements
def serialize_numerical(self, subj, fe, data):
    """ Serializes a numerical FE found by the normalizer

        :param subj: Wikidata item ID of the statement subject
        :param dict fe: frame element, with type under 'fe' and the
         normalized value under 'literal'
        :param dict data: classifier output for the sentence; 'lu' and 'url'
         are read here
        :returns: generator of serialized statements
    """
    literal = fe['literal']
    mapping = self.lu_fe_map.get((data['lu'], fe['fe']), {})
    wikidata_property = mapping.get('wid')
    if not wikidata_property:
        logger.debug('skipping *numerical* FE of type "%s" and lu "%s"',
                     fe['fe'], data['lu'])
        return

    if fe['fe'] == 'Time':
        yield wikidata.finalize_statement(
            subj, wikidata_property, wikidata.format_date(**literal),
            self.language, data['url'],
            resolve_property=False, resolve_value=False)
    elif fe['fe'] == 'Duration':
        # a duration may carry either or both of its endpoints
        for endpoint in ('start', 'end'):
            if endpoint in literal:
                yield wikidata.finalize_statement(
                    subj, wikidata_property, wikidata.format_date(**literal[endpoint]),
                    self.language, data['url'],
                    resolve_property=False, resolve_value=False)
def resolve_genealogics_family(self, input_file, url_to_id):
    """ Performs a second pass on genealogics to resolve additional family members

        :param input_file: iterable of JSON lines, one scraped person each
        :param dict url_to_id: genealogics URL -> Wikidata subject ID map
        :returns: generator of (success, item) tuples; item is a statement
         when success is True, otherwise the unresolved member name
    """
    # genealogics relationship label -> Wikidata property
    family_properties = {
        'Family': 'P1038',
        'Father': 'P22',
        'Married': 'P26',
        'Mother': 'P25',
        u'Children\xa0': 'P40',  # label as scraped, with trailing nbsp
    }

    for row in input_file:
        data = json.loads(row)
        if 'url' not in data or data['url'] not in url_to_id:
            continue

        subj = url_to_id[data['url']]
        for relation, members in data.get('other', {}).iteritems():
            prop = family_properties.get(relation)
            if prop is None:
                continue

            if not isinstance(members, list):
                logger.debug('unexpected value "%s", property "%s" subject %s',
                             members, relation, subj)
                continue

            for member in members:
                for name, member_url in member.iteritems():
                    if member_url not in url_to_id:
                        logger.debug('skipping "%s" (%s), %s of/with %s',
                                     name.strip(), member_url, relation, subj)
                        yield False, name
                        continue

                    val = url_to_id[member_url]
                    logger.debug('resolved "%s", %s of/with %s to %s',
                                 name.strip(), relation, subj, val)
                    yield True, wikidata.finalize_statement(
                        subj, prop, val, self.language, data['url'],
                        resolve_property=False, resolve_value=False
                    )
def to_statements(self, data, input_encoded=True):
    """ Converts the classification results into quick statements

        :param data: Data from the classifier. Can be either str or dict
        :param bool input_encoded: Whether data is a str or a dict
        :returns: Tuples <success, item> where item is a statement if success
         is true else it is a named entity which could not be resolved
        :type: generator
    """
    if input_encoded:
        data = json.loads(data)

    url = data.get('url')
    if not url:
        logger.warn('skipping item without url')
        return

    for name, subj in self.get_subjects(data):
        if not subj:
            logger.warn("Could not resolve Wikidata Item ID of subject '%s'", name)
            yield False, {
                'chunk': name,
                'additional': {'sentence': data['text'], 'url': url},
            }
            continue

        for fe in data['fes']:
            # do not add a statement for the current subject
            if fe['chunk'] == name:
                continue

            if fe['fe'] in ('Time', 'Duration'):
                for statement in self.serialize_numerical(subj, fe, url):
                    yield True, statement
                continue

            prop = self.fe_to_wid.get(fe['fe'])
            if not prop:
                logger.debug('unknown fe type %s, skipping', fe['fe'])
                continue

            val = wikidata.resolve(prop, fe['chunk'], self.language)
            if val:
                yield True, wikidata.finalize_statement(
                    subj, prop, val, self.language, url,
                    resolve_property=False, resolve_value=False)
            else:
                logger.debug('could not resolve chunk "%s" of fe %s (property is %s)',
                             fe['chunk'], fe['fe'], prop)
                yield False, {
                    'chunk': fe['chunk'],
                    'additional': {'fe': fe, 'sentence': data['text'], 'url': url},
                }
def to_statements(self, data, input_encoded=True):
    """ Converts the classification results into quick statements

        :param data: Data from the classifier. Can be either str or dict
        :param bool input_encoded: Whether data is a str or a dict
        :returns: Tuples <success, item> where item is a statement if success
         is true else it is a named entity which could not be resolved
        :type: generator
    """
    data = json.loads(data) if input_encoded else data

    url = data.get('url')
    if not url:
        logger.warn('skipping item without url')
        return

    for name, subj in self.get_subjects(data):
        if not subj:
            logger.warn("Could not resolve Wikidata Item ID of subject '%s'", name)
            yield False, {'chunk': name,
                          'additional': {'sentence': data['text'], 'url': url}}
            continue

        all_qualifiers = self.find_qualifiers(data['fes'])
        for fe in data['fes']:
            if fe['chunk'] == name:
                # do not add a statement for the current subject
                continue

            if fe['fe'] in ['Time', 'Duration']:
                for each in self.serialize_numerical(subj, fe, data):
                    yield True, each
                continue

            prop = self.lu_fe_map.get((data['lu'], fe['fe']), {}).get('wid')
            if not prop:
                logger.debug('unknown fe type %s for LU %s, skipping',
                             fe['fe'], data['lu'])
                continue

            # BUG FIX: default the 'types' lookup to an empty list. Frame
            # elements without an entity link (explicitly supported below via
            # `if 'link' in fe`) made this generator expression iterate over
            # None and raise a TypeError.
            chunk_types = set(t[len('http://dbpedia.org/ontology/'):]
                              for t in fe.get('link', {}).get('types', []))
            fe_types = self.lu_fe_map.get((data['lu'], fe['fe']), {}).get('types', set())
            # when both sides declare types, require at least one in common
            if fe_types and chunk_types and not fe_types & chunk_types:
                logger.debug('skipping chunk "%s" of fe %s because types do not match, '
                             'expected: %s actual %s',
                             fe['chunk'], fe['fe'], fe_types, chunk_types)
                continue

            # prefer the entity linker's URI, then name resolution, then a
            # hard-coded fallback item
            val = None
            if 'link' in fe:
                uri = fe['link']['uri']
                val = wikidata.wikidata_id_from_wikipedia_url(uri)
            if not val:
                val = wikidata.resolve(prop, fe['chunk'], self.language)
            if not val:
                val = 'Q19798648'
                logger.debug('could not resolve chunk "%s" of fe %s (property is %s), '
                             'using default value of %s',
                             fe['chunk'], fe['fe'], prop, val)

            # NOTE(review): extend() flattens each (property, value) pair into
            # [p1, v1, p2, v2, ...]; confirm finalize_statement expects a flat
            # sequence rather than a list of pairs
            stmt_qualifiers = []
            for qualifier_property in self.lu_fe_map.get((data['lu'], fe['fe']),
                                                         {}).get('qualifiers', []):
                for qualifier_value in all_qualifiers.get(qualifier_property, []):
                    stmt_qualifiers.extend((qualifier_property, qualifier_value))

            yield True, wikidata.finalize_statement(
                subj, prop, val, self.language, url,
                qualifiers=stmt_qualifiers,
                resolve_property=False, resolve_value=False
            )