def get_country_code(self, name):
    """Return the ``wd:``-prefixed code for a country name, or ``None``.

    The name is normalised with ``helpers.strip_spaces`` and lower-cased,
    mapped through a small alias table, and looked up in ``self.countries``.
    If there is no exact key, the closest key according to
    ``process.extractOne`` is used, provided its score is above 85.

    Args:
        name: Country name as found in the input data.

    Returns:
        ``'wd:' + self.countries[key]`` for the matched key, or ``None`` when
        nothing matches well enough (the normalised name is printed in that
        case).
    """
    name = helpers.strip_spaces(name).lower()

    # convert names from one form to another
    conversions = {
        "america": "unitedstatesofamerica",
        "unitedstates": "unitedstatesofamerica",
        "usa": "unitedstatesofamerica",
        "us": "unitedstatesofamerica",
        "russianfederation": "russia",
        "laopdr": "laos",
        "laopeople'sdemocraticrepublic": "laos",
        "côted'ivoire": "ivorycoast",
        "czechia": "czechrepublic",
        "caboverde": "capeverde",
        "timor-leste": "easttimor",
        "uae": "unitedarabemirates",
        "macao": "macau",
    }

    # Any name mentioning Taiwan collapses to the single "taiwan" key.
    if "taiwan" in name:
        name = "taiwan"
    name = conversions.get(name, name)

    try:
        return 'wd:' + self.countries[name]
    except KeyError:
        pass

    # No exact key: fall back to the closest key by fuzzy match.
    # NOTE(review): `process` is presumably fuzzywuzzy/thefuzz - confirm.
    best = process.extractOne(name, list(self.countries.keys()))
    # extractOne yields no result when there is nothing to choose from;
    # guard so that case is reported as "not found" instead of a TypeError.
    if best is not None and best[1] > 85:
        return 'wd:' + self.countries[best[0]]

    print(name)
    return None
def get_graph_spec(info):
    """Build the SPARQL update that tags a source's graph with a metasource.

    Args:
        info: Sequence whose first element is the raw source URL and whose
            second element is the metasource name.

    Returns:
        An ``INSERT ... WHERE`` statement that adds the ``wnp:metasource``
        triple to the URL's named graph when that graph already has content
        and does not yet carry this metasource. An empty string is returned
        when ``helpers.is_bad`` rejects the URL (the URL is printed) or when
        the URL contains no ``'.'``.
    """
    url_raw = info[0]
    metasource = info[1]

    if helpers.is_bad(url_raw):
        print(url_raw)
        return ''
    # A URL without a dot is treated as unusable.
    if url_raw.find('.') < 0:
        return ''

    quoted = urllib.parse.quote(url_raw)
    graph = """ GRAPH """ + '<http://' + quoted + '>'
    url_item = '<http://' + quoted + '/item>'
    ms = helpers.strip_spaces(metasource)

    fragments = [
        "INSERT {", graph, "{", url_item, "wnp:metasource wni:", ms,
        """}} WHERE {FILTER (EXISTS {""", graph,
        """{?s ?p ?o} } && NOT EXISTS {""", graph,
        "{ ?item wnp:metasource wni:", ms, "}})};",
    ]
    return "".join(fragments)
def get_graph_spec(source):
    """Build "insert if absent" SPARQL updates for one source record.

    Each usable field of ``source`` becomes one
    ``INSERT {...} WHERE {FILTER (NOT EXISTS {...})}`` statement against the
    named graph derived from the source URL, so existing values are kept.

    Args:
        source: Indexable record. Positions read here: 0 country, 1 url,
            2 title, 3 language, 4 type, 5 title (native language),
            6 paywall, 7 metasource, 8 state, 10 wikipedia name,
            11 redirect, 12 wikipedia link.

    Returns:
        The statements joined by single spaces, or an empty string when the
        URL is rejected by ``helpers.is_bad`` (it is printed) or contains no
        ``'.'``.
    """
    raw_url = source[1]
    if helpers.is_bad(raw_url):
        print(raw_url)
        return ''
    if raw_url.find('.') == -1:
        return ''

    quoted_url = urllib.parse.quote(raw_url)
    url = '<http://' + quoted_url + '>'
    url_item = '<http://' + quoted_url + '/item>'
    graph = " GRAPH " + url

    def insert_if_absent(predicate, obj, var):
        # One statement: add (item, predicate, obj) unless the graph already
        # holds some value for that same predicate.
        return ("INSERT { " + graph + " {" + url_item + " " + predicate + " "
                + obj + " }} WHERE {FILTER (NOT EXISTS {" + graph
                + "{ ?item " + predicate + " " + var + "}})} ;")

    def literal(text):
        return "'" + text + "'"

    # url
    statements = [insert_if_absent("wdt:P1896", literal(quoted_url), "?url")]

    # country
    if not helpers.is_bad(source[0]):
        country_code = get_country_code(source[0])
        if helpers.is_bad(country_code):
            country = helpers.clean(source[0])
        else:
            country = country_code
        statements.append(
            insert_if_absent("wdt:P17", literal(country), "?country"))

    # (index, predicate, variable used in the NOT EXISTS guard)
    fields = (
        (2, "wdt:P1448", "?title"),              # title
        (3, "wdt:P37", "?lang"),                 # language
        (4, "wdt:P31", "?type"),                 # type
        # title (native language): the guard must test P1704 itself; testing
        # P1448 blocked this insert whenever a title was present.
        (5, "wdt:P1704", "?title_native"),
        (6, "wnp:paywalled", "?pw"),             # paywall
        (7, "wnp:metasource", "?ms"),            # metasource
        (8, "wdt:P131", "?state"),               # state
        (10, "wnp:wikipedia-name", "?wp_name"),  # wikipedia name
        (11, "wnp:redirect", "?rd"),             # redirects?
        (12, "wnp:wikipedia-page", "?wp_page"),  # wikipedia link
    )
    for index, predicate, var in fields:
        value = source[index]
        if helpers.is_bad(value):
            continue
        if predicate == "wnp:metasource":
            # Metasources are written as wni: names, not as string literals.
            obj = "wni:" + helpers.strip_spaces(value).lower()
        else:
            obj = literal(helpers.clean(value))
        statements.append(insert_if_absent(predicate, obj, var))

    return " ".join(statements)
def get_country_code(name):
    """Return the ``wd:``-prefixed code for a country name.

    The name is normalised with ``helpers.strip_spaces`` and lower-cased
    before being looked up in the module-level ``countries`` mapping.

    Args:
        name: Country name as found in the input data.

    Returns:
        ``'wd:' + countries[key]`` on success, or the quoted placeholder
        ``"'TODO'"`` when the key is missing.
    """
    try:
        return 'wd:' + countries[helpers.strip_spaces(name).lower()]
    except KeyError as e:
        # Report the unknown key before falling back; this print used to sit
        # after the return and therefore never ran.
        print(e)
        return "'TODO'"
def overwrite(self, source):
    """Build SPARQL updates that replace stored values for one source record.

    For every usable field a ``DELETE {...} INSERT {...} WHERE {...}``
    statement is produced against the named graph derived from the source
    URL: existing values of the predicate are removed and the new value is
    written.

    Args:
        source: Indexable record. Positions read here: 0 country, 1 url,
            2 title, 3 language, 4 type, 5 title (native language),
            6 paywall, 7 iterable of metasources, 8 state, 10 wikipedia name,
            11 redirect, 12 wikipedia link, 14 description (optional; rows
            too short to have it are accepted).

    Returns:
        The concatenated statements, or an empty string when the URL is
        rejected by ``helpers.is_bad`` or contains no ``'.'``.
    """
    # check for a valid url
    if helpers.is_bad(source[1]) or source[1].find('.') == -1:
        return ''

    # Strip the scheme once so the graph IRI and the item IRI are built from
    # the same text (the item IRI used to keep a scheme if one was present).
    bare_url = urllib.parse.quote(
        source[1].replace("http://", "").replace("https://", ""))
    url = '<http://' + bare_url + '>'
    url_item = '<http://' + bare_url + '/item>'
    graph = " GRAPH " + url

    def replace_stmt(predicate, var, obj):
        # Variables in a DELETE template are only bound by the WHERE clause;
        # with an empty WHERE nothing was ever deleted. OPTIONAL binds the
        # existing values when there are any and still yields one solution
        # when there are none, so the INSERT always takes effect.
        match = "{" + graph + "{ ?item " + predicate + " " + var + "}}"
        return ("DELETE" + match + " INSERT { " + graph + " {" + url_item
                + " " + predicate + " " + obj
                + " }} WHERE {OPTIONAL " + match + "} ;")

    def literal(text):
        return "'" + text + "'"

    # add url
    statements = [replace_stmt("wdt:P1896", "?url", url)]

    # add country
    if not helpers.is_bad(source[0]):
        country_code = self.get_country_code(source[0])
        if helpers.is_bad(country_code):
            # No code found: store the cleaned name, quoted so the statement
            # stays well-formed.
            country = literal(helpers.clean(source[0]))
        else:
            country = country_code
        statements.append(replace_stmt("wdt:P17", "?country", country))

    # (index, predicate, variable) for plain string-valued fields that come
    # before the metasources
    leading_fields = (
        (2, "wdt:P1448", "?title"),    # title
        (3, "wdt:P37", "?lang"),       # language
        (4, "wdt:P31", "?type"),       # type
        (5, "wdt:P1704", "?title"),    # title (native language)
        (6, "wnp:paywalled", "?pw"),   # paywall (previously read source[2])
    )
    for index, predicate, var in leading_fields:
        if not helpers.is_bad(source[index]):
            statements.append(replace_stmt(
                predicate, var, literal(helpers.clean_string(source[index]))))

    # add metasources: one statement for all of them, because each statement
    # clears the predicate first and per-item statements would erase the
    # metasources written just before.
    metasources = [
        "wni:" + helpers.strip_spaces(ms).lower()
        for ms in source[7]
        if not helpers.is_bad(ms)
    ]
    if metasources:
        statements.append(
            replace_stmt("wnp:metasource", "?ms", ", ".join(metasources)))

    trailing_fields = (
        (8, "wdt:P131", "?state"),               # state
        (10, "wnp:wikipedia-name", "?wp_name"),  # wikipedia name
        (11, "wnp:redirect", "?rd"),             # redirects?
        (12, "wnp:wikipedia-page", "?wp_page"),  # wikipedia link
    )
    for index, predicate, var in trailing_fields:
        if not helpers.is_bad(source[index]):
            statements.append(replace_stmt(
                predicate, var, literal(helpers.clean_string(source[index]))))

    # add description
    try:
        description = source[14]
    except IndexError:
        # Shorter rows simply carry no description.
        pass
    else:
        if not helpers.is_bad(description):
            statements.append(replace_stmt(
                "wnp:description", "?desc",
                literal(helpers.clean_string(description))))

    return "".join(statements)
def get_ms(self, metasources):
    """Render metasources as a chain of ``; wnp:metasource wni:<name>`` parts.

    Args:
        metasources: Iterable of metasource names.

    Returns:
        One ``; wnp:metasource wni:`` fragment per entry, each followed by
        the entry passed through ``helpers.strip_spaces`` and lower-cased,
        concatenated in input order. Empty input gives an empty string.
    """
    prefix = """; wnp:metasource wni:"""
    return ''.join(
        prefix + helpers.strip_spaces(entry).lower() for entry in metasources
    )
def no_overwrite(self, source):
    """Build "insert if absent" SPARQL updates for one source record.

    Each usable field of ``source`` becomes one
    ``INSERT {...} WHERE {FILTER (NOT EXISTS {...})}`` statement against the
    named graph derived from the source URL, so values already stored for a
    predicate are left untouched.

    Args:
        source: Indexable record. Positions read here: 0 country, 1 url,
            2 title, 3 language, 4 source type, 5 title in native language,
            6 paywall, 7 metasource, 8 state, 10 wikipedia name, 11 redirect,
            12 wikipedia link, 14 description (optional; rows too short to
            have it are accepted).

    Returns:
        The statements joined by single spaces, or an empty string when the
        URL is rejected by ``helpers.is_bad`` (it is printed) or contains no
        ``'.'``.
    """
    raw_url = source[1]
    if helpers.is_bad(raw_url):
        print(raw_url)
        return ''
    # this means our url is not valid
    if raw_url.find('.') == -1:
        return ''

    # begin constructing graph spec
    quoted_url = urllib.parse.quote(raw_url)
    item = '<http://' + quoted_url + '/item>'
    url = '<http://' + quoted_url + '>'
    graph = " GRAPH " + url

    def insert_if_absent(predicate, obj, var):
        # One statement: add (item, predicate, obj) unless the graph already
        # holds some value for that same predicate.
        return ("INSERT { " + graph + " {" + item + " " + predicate + " "
                + obj + " }} WHERE {FILTER (NOT EXISTS {" + graph
                + "{ ?item " + predicate + " " + var + "}})} ;")

    def literal(text):
        return "'" + text + "'"

    # add URL
    statements = [insert_if_absent("wdt:P1896", url, "?url")]

    # add country
    if not helpers.is_bad(source[0]):
        country_code = self.get_country_code(source[0])
        if helpers.is_bad(country_code):
            country = helpers.clean(source[0])
        else:
            country = country_code
        statements.append(
            insert_if_absent("wdt:P17", literal(country), "?country"))

    # (index, predicate, variable used in the NOT EXISTS guard)
    fields = (
        (2, "wdt:P1448", "?title"),              # title
        (3, "wdt:P37", "?lang"),                 # language
        (4, "wdt:P31", "?type"),                 # source type
        # title in native language: the guard must test P1704 itself; testing
        # P1448 blocked this insert whenever a title was present.
        (5, "wdt:P1704", "?title_native"),
        (6, "wnp:paywalled", "?pw"),             # paywall (Yes or No)
        # NOTE(review): treated as a single string here, while overwrite()
        # iterates source[7] as a collection - confirm the expected shape.
        (7, "wnp:metasource", "?ms"),            # metasource
        (8, "wdt:P131", "?state"),               # state
        (10, "wnp:wikipedia-name", "?wp_name"),  # wikipedia name
        (11, "wnp:redirect", "?rd"),             # redirect
        (12, "wnp:wikipedia-page", "?wp_page"),  # wikipedia link
    )
    for index, predicate, var in fields:
        value = source[index]
        if helpers.is_bad(value):
            continue
        if predicate == "wnp:metasource":
            # Metasources are written as wni: names, not as string literals.
            obj = "wni:" + helpers.strip_spaces(value).lower()
        else:
            obj = literal(helpers.clean(value))
        statements.append(insert_if_absent(predicate, obj, var))

    # add description
    try:
        description = source[14]
    except IndexError:
        # Shorter rows simply carry no description, as overwrite() allows.
        pass
    else:
        if not helpers.is_bad(description):
            statements.append(insert_if_absent(
                "wnp:description", literal(helpers.clean(description)),
                "?desc"))

    return " ".join(statements)