def _get_dbpedia_nl_record(self, dbpedia_nl_identifier):
    """Look up a DBpedia-NL record and store the result on this object.

    The identifier is normalised before it is queried:

    * a resource URL (plain or with ``%2F``-encoded slashes) is reduced to
      its last path segment;
    * a leading ``dbp:`` / ``dbpedia_nl:`` prefix is removed;
    * if the first character is lowercase the identifier is title-cased;
    * spaces and ``%20`` are replaced by underscores.

    The normalised identifier and an ``+OR+...`` query suffix are
    interpolated into ``self.DBPEDIA_NL_URL`` (which must contain two
    ``%s`` placeholders) and fetched with ``self.get``.  When the response
    reports at least one hit, the first document is loaded through
    ``DBPedia``, copied into ``self`` under the record's first key, and
    its first ``sameAs`` entry is resolved through ``sameAs`` and stored
    under the ``"same"`` key.

    :param dbpedia_nl_identifier: resource name, prefixed name or URL.
    :returns: ``False`` when the normalised identifier is empty or when
        ``self.get`` yields no data; ``None`` in every other case (also
        when the search reports zero hits).
    """
    if self.debug:
        self.log.info("getting.. %s " % (dbpedia_nl_identifier))

    # Reduce a resource URL to its last path segment.  "https:/" is checked
    # as well because the substring "http:/" never occurs in an https URL.
    if dbpedia_nl_identifier.lower().find("http") > -1:
        if ("http:/" in dbpedia_nl_identifier or
                "https:/" in dbpedia_nl_identifier):
            dbpedia_nl_identifier = dbpedia_nl_identifier.split(
                '/')[-1].strip()
        elif '%2F' in dbpedia_nl_identifier:
            dbpedia_nl_identifier = dbpedia_nl_identifier.split(
                '%2F')[-1].strip()

    # Drop the namespace prefix and keep what follows it.  (Taking element
    # [0] of the split would keep the prefix and discard the name.)
    if dbpedia_nl_identifier.lower().startswith(("dbp:", "dbpedia_nl:")):
        dbpedia_nl_identifier = dbpedia_nl_identifier.split(':', 1)[1]

    # Nothing left to query, e.g. "dbp:" or a URL ending in "/".
    if not dbpedia_nl_identifier:
        return False

    if dbpedia_nl_identifier[0].islower():
        dbpedia_nl_identifier = dbpedia_nl_identifier.title()

    dbpedia_nl_identifier = dbpedia_nl_identifier.replace(" ", "_")
    dbpedia_nl_identifier = dbpedia_nl_identifier.replace("%20", "_")

    name = dbpedia_nl_identifier.strip()
    if '+' in dbpedia_nl_identifier or '_' in dbpedia_nl_identifier:
        # One prefLabel clause per '+'-separated part, followed by one per
        # '_'-separated part (same order and duplicates as before).
        terms = (dbpedia_nl_identifier.split('+') +
                 dbpedia_nl_identifier.split('_'))
        q = "".join("+OR+prefLabel:" + term for term in terms)
        url = self.DBPEDIA_NL_URL % (name, q)
    else:
        url = self.DBPEDIA_NL_URL % (name, "+OR+" + name)

    data = self.get(url)
    if not data:
        if self.debug:
            self.log.info("No DBPedia_nl data for: %s @ %s (via %s)" %
                          (dbpedia_nl_identifier, url, self.backend))
        return False

    # self.get may hand back raw JSON text; decode it before inspecting.
    if isinstance(data, str):
        data = simplejson.loads(data)

    if data["response"]["numFound"] > 0:
        identifier = data["response"]["docs"][0]["id"].split('/')[-1]
        record = DBPedia([identifier],
                         backend=self.backend,
                         log_path=self.log_path,
                         debug=self.debug)
        record.execute()

        # list(...) keeps this working whether keys()/values() return a
        # list or a view object.
        key = list(record.keys())[0]
        self[key] = record[key]

        same_as_uri = self[key].pop("sameAs")[0]
        rec = sameAs([same_as_uri],
                     backend=self.backend,
                     log_path=self.log_path,
                     debug=self.debug)
        rec.execute()
        self[key]["same"] = list(rec.values())[0]["same"]
def __init__(self):
    """Create the helper objects this instance works with.

    Instantiates ``DBPedia``, ``SynonymsBase`` and ``Tokenizer`` with their
    default arguments, in that order, and stores them on the instance as
    ``dbpedia``, ``synonyms_base`` and ``tokenizer``.
    """
    self.dbpedia = DBPedia()
    self.synonyms_base = SynonymsBase()
    self.tokenizer = Tokenizer()