def _get_dbpedia_nl_record(self, dbpedia_nl_identifier):
    """Look up a Dutch DBPedia record and store it on this object.

    The identifier is normalised before querying:

    * an ``http:/`` / ``https:/`` URI (or one whose slashes are encoded as
      ``%2F``) is reduced to its last path segment;
    * a leading ``dbp:`` / ``dbpedia_nl:`` prefix is removed;
    * an identifier starting with a lowercase letter is title-cased;
    * spaces and ``%20`` are replaced by underscores.

    The result is formatted into ``self.DBPEDIA_NL_URL`` (a format string
    taking two values: the identifier and an extra ``+OR+`` query part) and
    fetched with ``self.get``.

    When the response reports at least one hit, the first document is
    loaded through ``DBPedia``, stored on ``self`` under the record's first
    key, and its ``"sameAs"`` entry is replaced by the ``"same"`` value
    resolved through ``sameAs``.

    :param dbpedia_nl_identifier: URI, prefixed name or plain label.
    :returns: ``False`` when the lookup returned no data, otherwise ``None``.
    """
    if self.debug:
        self.log.info("getting.. %s " % (dbpedia_nl_identifier))

    if dbpedia_nl_identifier.lower().find("http") > -1:
        # Accept https as well as http; previously an https URI was passed
        # on unchanged and queried as-is.
        if (dbpedia_nl_identifier.find('http:/') > -1
                or dbpedia_nl_identifier.find('https:/') > -1):
            dbpedia_nl_identifier = dbpedia_nl_identifier.split(
                '/')[-1].strip()
        elif dbpedia_nl_identifier.find('%2F') > -1:
            dbpedia_nl_identifier = dbpedia_nl_identifier.split(
                '%2F')[-1].strip()

    if dbpedia_nl_identifier.lower().startswith(("dbp:", "dbpedia_nl:")):
        # Keep what follows the prefix. The previous ``split(':')[0]`` kept
        # the prefix itself and discarded the actual name.
        dbpedia_nl_identifier = dbpedia_nl_identifier.split(
            ':', 1)[1].strip()

    # Slice instead of index so an empty identifier does not raise.
    if dbpedia_nl_identifier[:1].islower():
        dbpedia_nl_identifier = dbpedia_nl_identifier.title()

    dbpedia_nl_identifier = dbpedia_nl_identifier.replace(" ", "_")
    dbpedia_nl_identifier = dbpedia_nl_identifier.replace("%20", "_")

    if '+' in dbpedia_nl_identifier or '_' in dbpedia_nl_identifier:
        # One prefLabel clause per '+'-separated part, followed by one per
        # '_'-separated part (same order as before).
        terms = (dbpedia_nl_identifier.split('+')
                 + dbpedia_nl_identifier.split('_'))
        q = "".join("+OR+prefLabel:" + term for term in terms)
        url = self.DBPEDIA_NL_URL % (dbpedia_nl_identifier.strip(), q)
    else:
        url = self.DBPEDIA_NL_URL % (dbpedia_nl_identifier.strip(),
                                     "+OR+" + dbpedia_nl_identifier.strip())

    data = self.get(url)
    if not data:
        if self.debug:
            self.log.info("No DBPedia_nl data for: %s @ %s (via %s)" %
                          (dbpedia_nl_identifier, url, self.backend))
        return False

    if isinstance(data, str):
        data = simplejson.loads(data)

    if data["response"]["numFound"] > 0:
        identifier = data["response"]["docs"][0]["id"].split('/')[-1]
        record = DBPedia([identifier],
                         backend=self.backend,
                         log_path=self.log_path,
                         debug=self.debug)
        record.execute()

        # list(...) works for both Python 2 lists and Python 3 views.
        key = list(record.keys())[0]
        self[key] = record[key]

        s = self[key].pop("sameAs")[0]
        rec = sameAs([s],
                     backend=self.backend,
                     log_path=self.log_path,
                     debug=self.debug)
        rec.execute()
        self[key]["same"] = list(rec.values())[0]["same"]
class PropertyMatcher(object):
    """Match question tokens against the DBPedia properties of a resource."""

    def __init__(self):
        """Create the DBPedia, synonym and tokenizer collaborators."""
        self.dbpedia = DBPedia()
        self.synonyms_base = SynonymsBase()
        self.tokenizer = Tokenizer()

    def match_tokens_with_properties(self, resource, tokens):
        """Find properties of ``resource`` whose name matches a token.

        Each token that survives ``_filter_tokens`` is expanded into
        synonyms; every cleaned synonym is compared with every property
        name of the resource (``sameAs`` and ``wasDerivedFrom`` excluded).

        :param resource: resource name passed to the DBPedia client.
        :param tokens: tokens of the question.
        :returns: ``(word_values, confidence)`` where ``word_values`` maps
            a token to a list of ``(synonym, property, values)`` tuples and
            ``confidence`` comes from ``_calculate_confidence``.
        """
        properties = self.dbpedia.get_properties(resource)
        word_values = defaultdict(list)
        # _filter_tokens already drops tokens contained in the resource
        # name, so no further containment check is needed in the loop.
        filtered_tokens = self._filter_tokens(tokens, resource)

        for word in filtered_tokens:
            synonyms = self.synonyms_base.find_synonyms(word, tokens)
            for synonym in self.tokenizer.clean_entities(synonyms):
                for prop in properties:
                    if prop in ('sameAs', 'wasDerivedFrom'):
                        continue
                    if self._word_matches_property(synonym, prop):
                        values = self.dbpedia.get_value(resource, prop)
                        word_values[word].append((synonym, prop, values))

        confidence = self._calculate_confidence(filtered_tokens, word_values)
        return word_values, confidence

    def _word_matches_property(self, word, property):
        """Return True when ``word`` is a substring of the lowercased name."""
        return word in property.lower()  # + levenshtein?

    def _calculate_confidence(self, tokens, word_values):
        """Score how well the tokens were covered by matched properties.

        :returns: ``0.9`` when the tokens contain both a when/where word
            and a birth/death word, ``1.0`` when there are no tokens,
            otherwise the ratio of matched tokens to all tokens.
        """
        lowered = [t.lower() for t in tokens]
        wh_q = any(t in ('when', 'where') for t in lowered)
        birth_death_q = any(
            t in ('born', 'birth', 'die', 'died') for t in lowered)

        if wh_q and birth_death_q:
            return 0.9
        if len(tokens) == 0:
            return 1.0
        # Explicit float: under Python 2 the plain ``/`` truncated the
        # ratio to 0 or 1.
        return float(len(word_values)) / len(tokens)

    def _filter_tokens(self, tokens, resource):
        """Clean ``tokens`` and drop those contained in the resource name."""
        resource_lower = resource.lower()
        return [
            w for w in self.tokenizer.clean_entities(tokens)
            if w.lower() not in resource_lower
        ]
def fetch_missing_data_from_dbpedia(db, filename):
    """Fill in DBPedia metadata for zones lacking population or area.

    Selects every document of ``db`` that has a non-null ``wikipedia``
    field and a missing or null ``population`` or ``area``, queries DBPedia
    for it and ``$set``s the DBPedia resource URL together with whatever
    ``fetch_population_or_area`` and ``fetch_flag_or_blazon`` return.

    :param db: collection object providing ``find`` and
        ``find_one_and_update``.
    :param filename: not used by this function.
    """
    info('Fetching DBPedia data')
    processed = 0
    query = {
        'wikipedia': {'$exists': True, '$ne': None},
        '$or': [
            {'population': None},
            {'population': {'$exists': False}},
            {'area': None},
            {'area': {'$exists': False}},
        ]
    }
    cursor = db.find(query, no_cursor_timeout=True)
    try:
        for zone in cursor:
            dbpedia = DBPedia(zone['wikipedia'])
            metadata = {
                'dbpedia': dbpedia.resource_url,
            }
            metadata.update(dbpedia.fetch_population_or_area())
            metadata.update(dbpedia.fetch_flag_or_blazon())
            if db.find_one_and_update({'_id': zone['_id']},
                                      {'$set': metadata}):
                processed += 1
    finally:
        # The cursor was opened with no_cursor_timeout=True, so it must be
        # closed explicitly, including when the loop is interrupted by an
        # exception.
        cursor.close()
    success('Fetched DBPedia data for {0} zones'.format(processed))
def _get_dbpedia_nl_record(self, dbpedia_nl_identifier):
    """Look up a Dutch DBPedia record and store it on this object.

    The identifier is normalised before querying:

    * an ``http:/`` / ``https:/`` URI (or one whose slashes are encoded as
      ``%2F``) is reduced to its last path segment;
    * a leading ``dbp:`` / ``dbpedia_nl:`` prefix is removed;
    * an identifier starting with a lowercase letter is title-cased;
    * spaces and ``%20`` are replaced by underscores.

    The result is formatted into ``self.DBPEDIA_NL_URL`` (a format string
    taking two values: the identifier and an extra ``+OR+`` query part) and
    fetched with ``self.get``.

    When the response reports at least one hit, the first document id
    (with ``'json'`` replaced by ``'jsond'``) is loaded through
    ``DBPedia``, stored on ``self`` under the record's first key, and its
    ``"sameAs"`` entry is replaced by the ``"same"`` value resolved through
    ``sameAs``.

    :param dbpedia_nl_identifier: URI, prefixed name or plain label.
    :returns: ``False`` when the lookup returned no data, otherwise ``None``.
    """
    if self.debug:
        self.log.info("getting.. %s " % (dbpedia_nl_identifier))

    if dbpedia_nl_identifier.lower().find("http") > -1:
        # Accept https as well as http; previously an https URI was passed
        # on unchanged and queried as-is.
        if (dbpedia_nl_identifier.find('http:/') > -1
                or dbpedia_nl_identifier.find('https:/') > -1):
            dbpedia_nl_identifier = dbpedia_nl_identifier.split(
                '/')[-1].strip()
        elif dbpedia_nl_identifier.find('%2F') > -1:
            dbpedia_nl_identifier = dbpedia_nl_identifier.split(
                '%2F')[-1].strip()

    if dbpedia_nl_identifier.lower().startswith(("dbp:", "dbpedia_nl:")):
        # Keep what follows the prefix. The previous ``split(':')[0]`` kept
        # the prefix itself and discarded the actual name.
        dbpedia_nl_identifier = dbpedia_nl_identifier.split(
            ':', 1)[1].strip()

    # Slice instead of index so an empty identifier does not raise.
    if dbpedia_nl_identifier[:1].islower():
        dbpedia_nl_identifier = dbpedia_nl_identifier.title()

    dbpedia_nl_identifier = dbpedia_nl_identifier.replace(" ", "_")
    dbpedia_nl_identifier = dbpedia_nl_identifier.replace("%20", "_")

    if '+' in dbpedia_nl_identifier or '_' in dbpedia_nl_identifier:
        # One prefLabel clause per '+'-separated part, followed by one per
        # '_'-separated part (same order as before).
        terms = (dbpedia_nl_identifier.split('+')
                 + dbpedia_nl_identifier.split('_'))
        q = "".join("+OR+prefLabel:" + term for term in terms)
        url = self.DBPEDIA_NL_URL % (dbpedia_nl_identifier.strip(), q)
    else:
        url = self.DBPEDIA_NL_URL % (dbpedia_nl_identifier.strip(),
                                     "+OR+" + dbpedia_nl_identifier.strip())

    data = self.get(url)
    if not data:
        if self.debug:
            self.log.info("No DBPedia_nl data for: %s @ %s (via %s)" %
                          (dbpedia_nl_identifier, url, self.backend))
        return False

    if isinstance(data, str):
        data = json.loads(data)

    if data["response"]["numFound"] > 0:
        identifier = data["response"]["docs"][0]["id"].split('/')[-1]
        record = DBPedia([identifier.replace('json', 'jsond')],
                         backend=self.backend,
                         log_path=self.log_path,
                         debug=self.debug)
        record.execute()

        # list(...) works for both Python 2 lists and Python 3 views.
        key = list(record.keys())[0]
        self[key] = record[key]

        s = self[key].pop("sameAs")[0]
        rec = sameAs([s],
                     backend=self.backend,
                     log_path=self.log_path,
                     debug=self.debug)
        rec.execute()
        self[key]["same"] = list(rec.values())[0]["same"]
def __init__(self):
    """Create the collaborators held by this instance.

    Instantiates, in this order, a ``DBPedia`` client, a ``SynonymsBase``
    and a ``Tokenizer`` (each with no arguments) and stores them as
    ``self.dbpedia``, ``self.synonyms_base`` and ``self.tokenizer``.
    """
    self.dbpedia = DBPedia()
    self.synonyms_base = SynonymsBase()
    self.tokenizer = Tokenizer()