def link_to_military_units(graph, target_prop, source_prop):
    """
    Link military units to known matching military units in Warsa.

    :returns dict containing some statistics and a list of errors

    :type graph: rdflib.Graph
    :param target_prop: target property to use for new links
    :param source_prop: source property as URIRef
    """
    unit_arpa = Arpa('http://demo.seco.tkk.fi/arpa/menehtyneet_units')

    # Delegate the actual matching to the ARPA service; unit name
    # abbreviations are expanded by the preprocessor before querying.
    return arpafy(graph, target_prop, unit_arpa, source_prop,
                  preprocessor=_create_unit_abbreviations,
                  progress=True,
                  retry_amount=50)
def link_to_pnr(graph, graph_schema, target_prop, source_prop):
    """
    Link municipalities to Paikannimirekisteri.

    :returns dict containing some statistics and a list of errors

    :type graph: rdflib.Graph
    :param target_prop: target property to use for new links
    :param source_prop: source property as URIRef
    """
    def _municipality_label(uri, *args):
        """Fetch the skos:prefLabel of a municipality from the schema graph.

        :param uri: municipality URI
        """
        label = graph_schema.value(
            uri, URIRef('http://www.w3.org/2004/02/skos/core#prefLabel'))
        # Slashes would break the ARPA query, replace them with spaces.
        return str(label).replace('/', ' ')

    pnr_arpa = Arpa('http://demo.seco.tkk.fi/arpa/pnr_municipality')

    # Query the ARPA service and add the matches
    return arpafy(graph, target_prop, pnr_arpa, source_prop,
                  preprocessor=_municipality_label,
                  progress=True,
                  retry_amount=50)
def link_to_warsa_persons(graph_data, graph_schema, target_prop, source_rank_prop,
                          source_firstname_prop, source_lastname_prop, birthdate_prop,
                          deathdate_prop, preprocessor=None, validator=None,
                          endpoint='http://demo.seco.tkk.fi/arpa/warsa_actor_persons'):
    """
    Link a person to known Warsa persons.

    Candidates returned by the ARPA service (queried by last name) are scored
    against the person's rank, first names and birth/death dates; a candidate
    is accepted only if its score exceeds 200.

    :returns dict containing some statistics and a list of errors

    :param graph_data: RDF graph where the names and such are found
    :type graph_data: rdflib.Graph
    :param graph_schema: RDF graph where is the military rank label
    :type graph_schema: rdflib.Graph
    :param target_prop: target property to use for new links
    :param source_rank_prop: military rank property
    :param source_firstname_prop: first names property
    :param source_lastname_prop: last name property (used as the ARPA query text)
    :param birthdate_prop: birth date property
    :param deathdate_prop: death date property
    :param preprocessor: optional preprocessor passed through to arpafy
    :param validator: currently ignored — the internal scoring validator is
        always used (kept in the signature for backward compatibility)
    :param endpoint: ARPA service endpoint URL
    """
    # TODO: sotilasarvolabel --> rank_label, scoring needs to be redone (verify)
    # TODO: a person can have several military ranks, compare against all of them (verify this works)
    # TODO: instead of ARPA, candidates could be queried directly with SPARQL

    def _validator(graph, s):
        def _validate_name(text, results):
            """Score each ARPA candidate for subject s; return the accepted match(es)."""
            if not results:
                return results

            rank = graph.value(s, source_rank_prop)
            rank = str(graph_schema.value(
                rank, URIRef('http://www.w3.org/2004/02/skos/core#prefLabel'))).lower()
            firstnames = str(graph.value(s, source_firstname_prop)).replace('/', ' ').lower().split()
            lastname = text.lower()

            filtered = []
            _fuzzy_lastname_match_limit = 50
            _fuzzy_firstname_match_limit = 60

            for person in results:
                score = 0
                res_id = None
                try:
                    res_id = person['properties'].get('id')[0].replace('"', '')
                    res_ranks = [r.replace('"', '').lower()
                                 for r in person['properties'].get('rank_label', [''])]
                    res_lastname = person['properties'].get('sukunimi')[0].replace('"', '').lower()
                    res_firstnames = person['properties'].get('etunimet')[0].split('^')[0] \
                        .replace('"', '').lower()
                    res_firstnames = res_firstnames.split()

                    # Widest possible date intervals reported for the candidate.
                    res_birthdates = (
                        min(person['properties'].get('birth_start', [''])).split('^')[0].replace('"', ''),
                        max(person['properties'].get('birth_end', [''])).split('^')[0].replace('"', ''))
                    res_deathdates = (
                        min(person['properties'].get('death_start', [''])).split('^')[0].replace('"', ''),
                        max(person['properties'].get('death_end', [''])).split('^')[0].replace('"', ''))
                except TypeError:
                    log.info('Unable to read data for validation for {uri} , skipping result...'
                             .format(uri=res_id))
                    continue

                # BUG FIX: was ' '.join([res_ranks]) — joining a list that contains a
                # list raises TypeError for every candidate reaching this point.
                log.debug('Potential match for person {p1text} <{p1}> : {p2text} {p2}'.
                          format(p1text=' '.join([rank] + firstnames + [lastname]), p1=s,
                                 p2text=' '.join([' '.join(res_ranks)] + res_firstnames + [res_lastname]),
                                 p2=res_id))

                fuzzy_lastname_match = fuzz.token_set_ratio(lastname, res_lastname, force_ascii=False)
                if fuzzy_lastname_match >= _fuzzy_lastname_match_limit:
                    log.debug('Fuzzy last name match for {f1} and {f2}: {fuzzy}'
                              .format(f1=lastname, f2=res_lastname, fuzzy=fuzzy_lastname_match))
                    # Scale the part above the limit to a 0-100 contribution.
                    score += int((fuzzy_lastname_match - _fuzzy_lastname_match_limit) /
                                 (100 - _fuzzy_lastname_match_limit) * 100)

                if rank and res_ranks and rank != 'tuntematon':
                    if rank in res_ranks:
                        score += 25
                    else:
                        score -= 25

                birthdate = str(graph.value(s, birthdate_prop))
                deathdate = str(graph.value(s, deathdate_prop))

                # ISO dates compare correctly as strings. Exact boundary match is
                # rewarded; a date outside the candidate's interval is penalized.
                if res_birthdates[0] and birthdate:
                    if res_birthdates[0] <= birthdate:
                        if res_birthdates[0] == birthdate:
                            score += 50
                    else:
                        score -= 25
                if res_birthdates[1] and birthdate:
                    if birthdate <= res_birthdates[1]:
                        if res_birthdates[1] == birthdate:
                            score += 50
                    else:
                        score -= 25
                if res_deathdates[0] and deathdate:
                    if res_deathdates[0] <= deathdate:
                        if res_deathdates[0] == deathdate:
                            score += 50
                    else:
                        score -= 25
                if res_deathdates[1] and deathdate:
                    if deathdate <= res_deathdates[1]:
                        if deathdate == res_deathdates[1]:
                            score += 50
                    else:
                        score -= 25

                s_first1 = ' '.join(firstnames)
                s_first2 = ' '.join(res_firstnames)
                fuzzy_firstname_match = max(
                    fuzz.partial_ratio(s_first1, s_first2),
                    fuzz.token_sort_ratio(s_first1, s_first2, force_ascii=False),
                    fuzz.token_set_ratio(s_first1, s_first2, force_ascii=False))

                if fuzzy_firstname_match >= _fuzzy_firstname_match_limit:
                    log.debug('Fuzzy first name match for {f1} and {f2}: {fuzzy}'
                              .format(f1=firstnames, f2=res_firstnames, fuzzy=fuzzy_firstname_match))
                    score += int((fuzzy_firstname_match - _fuzzy_firstname_match_limit) /
                                 (100 - _fuzzy_firstname_match_limit) * 100)
                else:
                    log.debug('No fuzzy first name match for {f1} and {f2}: {fuzzy}'
                              .format(f1=firstnames, f2=res_firstnames, fuzzy=fuzzy_firstname_match))

                person['score'] = score

                if score > 200:
                    filtered.append(person)
                    log.info('Found matching Warsa person for {rank} {fn} {ln} {uri}: '
                             '{res_rank} {res_fn} {res_ln} {res_uri} [score: {score}]'.
                             format(rank=rank, fn=s_first1, ln=lastname, uri=s,
                                    res_rank=res_ranks, res_fn=s_first2, res_ln=res_lastname,
                                    res_uri=res_id, score=score))
                else:
                    log.info('Skipping potential match because of too low score [{score}]: '
                             '{p1} <<-->> {p2}'.format(p1=s, p2=res_id, score=score))

            if len(filtered) == 1:
                return filtered
            elif len(filtered) > 1:
                log.warning('Found several matches for Warsa person {s} ({text}): {ids}'.
                            format(s=s, text=text,
                                   ids=', '.join(p['properties'].get('id')[0].split('^')[0]
                                                 .replace('"', '') for p in filtered)))
                best_matches = sorted(filtered, key=lambda p: p['score'], reverse=True)
                # BUG FIX: was best_matches[0].get('id'), which always logged None —
                # the id is stored under the candidate's 'properties' key.
                log.warning('Choosing best match: {id}'.format(
                    id=best_matches[0]['properties'].get('id')[0].replace('"', '')))
                return [best_matches[0]]

            return []

        return _validate_name

    arpa = Arpa(endpoint)

    # NOTE: any validator passed by the caller is deliberately overridden;
    # the scoring validator above is always used.
    validator = _validator

    # Query the ARPA service and add the matches
    return arpafy(graph_data, target_prop, arpa, source_lastname_prop,
                  preprocessor=preprocessor, progress=True,
                  validator=validator, retry_amount=50)