def get_q_list():
    """Return a list of ``(Q-id, latin_label)`` tuples from wikidata-terminator.

    Queries the tool for items that are instances of Q202444 (personal name)
    or of any subclass of it, then scrapes the Q-ids and the parenthesized
    latin labels out of the returned HTML table.

    Returns:
        list[tuple[str, str]]: pairs like ``('Q463035', 'John')``.
    """
    # Relevant classes: Q202444 - personal name, Q12308941 - male given name,
    # Q11879590 - female given name, Q3409032 - unisex given name.
    # The original code first built a narrower query (direct instances of the
    # four classes) and immediately overwrote it with this broader
    # subclass-aware one; only the broader query was ever used.
    url = ('http://tools.wmflabs.org/wikidata-terminator/'
           '?list&lang=ru&mode=t1000'
           '&q=claim[31:(claim[279:202444])]%20OR%20claim[31:202444]')
    # Context manager closes the connection; HTTPResponse exposes read(),
    # not readall(), in modern Python 3.
    with urllib.request.urlopen(url) as response:
        str_response = response.read().decode('utf-8')
    # Raw string so the regex escapes are not interpreted by Python first.
    q_list = re.findall(
        r"<tr><td><a href='//www.wikidata.org/wiki/(Q\d+?)'"
        r".*?<small>\((.+?)\)</small>",
        str_response)
    return q_list
def get_ru_labels(ids, q):
    """Fetch Russian labels for a pipe-separated list of Wikidata entity ids.

    Args:
        ids: pipe-separated entity ids, e.g. ``'Q42|Q1'``.
        q: the id of the "primary" entity whose label is reported separately.

    Returns:
        tuple[str, list[str]]: ``(existing_title, ru_labels)`` where
        ``existing_title`` is the Russian label of *q* (``''`` if absent) and
        ``ru_labels`` holds the Russian labels of the other entities, with
        *q*'s own label removed from the list.
    """
    # e.g. https://www.wikidata.org/w/api.php?action=wbgetentities&ids=Q42|Q1&props=labels&languages=ru
    url_ru_labels = ('https://www.wikidata.org/w/api.php?format=json'
                     '&action=wbgetentities&ids=' + ids
                     + '&props=labels&languages=ru')
    # Context manager closes the connection; read() is the supported
    # HTTPResponse call (readall() is not available in modern Python 3).
    with urllib.request.urlopen(url_ru_labels) as response:
        item_json = json.loads(response.read().decode('utf-8'))
    ru_labels = []
    for ent in item_json["entities"]:
        try:
            ru_labels.append(item_json["entities"][ent]["labels"]["ru"]["value"])
        except KeyError:
            pass  # entity has no Russian label — skip it
    try:
        existing_title = item_json["entities"][q]["labels"]["ru"]["value"]
        # q's own label was also collected above; drop it from the list.
        ru_labels.remove(existing_title)
    except (KeyError, ValueError):
        # KeyError: q has no Russian label; ValueError: label not in list.
        existing_title = ''
    return existing_title, ru_labels
# Loop-body fragment: ``qel`` is a (Q-id, latin label) pair — presumably one
# element of get_q_list()'s result; the enclosing ``for`` header is outside
# this chunk (note the bare ``continue`` below).
print('--------')
latin_title = qel[1]
q = qel[0]
# cp1251/'replace' keeps the output printable on a Windows-1251 console.
print(q, latin_title.encode('cp1251', 'replace'))
# Skip multi-word names.
if " " in latin_title: continue
# url = "http://www.wikidata.org/w/api.php?format=json&action=wbgetentities&ids="+q+"&props=labels&languages=ru|en"
# Example WDQ query: http://wdq.wmflabs.org/api?q=claim[735:7451984]%20AND%20link[ruwiki]
# Items whose given-name claim (P735) points at q, limited to those that
# have a ruwiki sitelink.
url_what_have_this_name = 'http://wdq.wmflabs.org/api?q=claim[735:' + q.replace('Q', '') + ']%20AND%20link[ruwiki]'
response = urllib.request.urlopen(url_what_have_this_name)
# NOTE(review): HTTPResponse has no ``readall`` in modern Python 3 — this
# looks like it should be ``read()``; confirm against the interpreter used.
str_response = response.readall().decode('utf-8')
item_json = json.loads(str_response)
# Build a pipe-separated id list starting with q itself, then every item
# id returned by the WDQ query.
ids = q
for i in item_json["items"]:
    ids += '|Q' + str(i)
print(ids)
existing_title, ru_labels = get_ru_labels(ids, q)
print(existing_title.encode('cp1251', 'replace'))
# Printing the labels may fail (e.g. UnicodeEncodeError on a cp1251
# console); errors are deliberately swallowed.
try:
    print(ru_labels)
except:
    ''
def get_text_by_url(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: the URL to fetch.

    Returns:
        str: the decoded response body.
    """
    # HTTPResponse exposes read(), not readall(), in modern Python 3; the
    # context manager also closes the underlying connection.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')