def main(ENTITY_TYPE):
    """Add Wikidata URL relationships to entities that only have a Wikipedia one.

    Runs a query for up to 500 entities of ``ENTITY_TYPE`` that have a
    Wikipedia URL relationship without pending edits, no Wikidata
    relationship, and whose Wikipedia URL is not shared with another entity.
    For each row the page is loaded with ``WikiPage.fetch``; when it carries
    a ``wikidata_id`` the corresponding URL is submitted through
    ``mb.add_url``. Every row looked at is recorded in
    ``bot_wp_wikidata_links`` and the totals are stored in ``stats``.

    Args:
        ENTITY_TYPE: entity type name. Dashes are replaced with underscores
            to obtain the table name. Must be a key of both
            ``WIKIPEDIA_RELATIONSHIP_TYPES`` and
            ``WIKIDATA_RELATIONSHIP_TYPES`` (``KeyError`` otherwise).

    Returns:
        None.
    """
    # NOTE(review): the reviewed copy of this function had lost its URL
    # literals and several SQL column references (e.g. "SELECT DISTINCT AS
    # entity_id", "ON l.entity0 = AND IN", "'' % page.wikidata_id"), leaving
    # invalid SQL and format strings that raise TypeError. They are restored
    # below from the surrounding aliases -- confirm the column names, the
    # LIKE pattern and the URL prefixes against the schema before deploying.
    entity_type_table = ENTITY_TYPE.replace('-', '_')
    if ENTITY_TYPE != 'work':
        url_relationship_table = 'l_%s_url' % entity_type_table
        main_entity_entity_point = 'entity0'
        url_entity_point = 'entity1'
    else:
        # For works the relationship table is l_url_work, so the URL is
        # entity0 and the work is entity1.
        url_relationship_table = 'l_url_%s' % entity_type_table
        main_entity_entity_point = 'entity1'
        url_entity_point = 'entity0'
    wikipedia_link_type = str(WIKIPEDIA_RELATIONSHIP_TYPES[ENTITY_TYPE])
    wikidata_link_type = str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE])

    # Table and column names cannot be bound as parameters, so they are
    # formatted into the statement. "%%" is a plain "%" wildcard if the
    # driver unescapes it and an equivalent double wildcard if it does not.
    query = """
        WITH
            entities_wo_wikidata AS (
                SELECT DISTINCT e.id AS entity_id, e.gid AS entity_gid, u.url AS wp_url,
                       substring(u.url from '//(([a-z]|-)+)\\.') as wp_lang
                FROM {entity_table} e
                    JOIN {rel_table} l ON l.{entity_col} = e.id
                        AND l.link IN (SELECT id FROM link WHERE link_type = {wikipedia_type})
                    JOIN url u ON u.id = l.{url_col}
                        AND u.url LIKE 'http://%%.wikipedia.org/wiki/%%'
                WHERE
                    /* No existing WikiData relationship for this entity */
                    NOT EXISTS (SELECT 1 FROM {rel_table} ol
                                WHERE ol.{entity_col} = e.id
                                  AND ol.link IN (SELECT id FROM link WHERE link_type = {wikidata_type}))
                    /* WP link should only be linked to this entity */
                    AND NOT EXISTS (SELECT 1 FROM {rel_table} ol
                                    WHERE ol.{url_col} = u.id AND ol.{entity_col} <> e.id)
                    AND l.edits_pending = 0
            )
        SELECT e.id, e.gid, e.name, ewf.wp_url, b.processed
        FROM entities_wo_wikidata ewf
        JOIN {entity_table} e ON ewf.entity_id = e.id
        LEFT JOIN bot_wp_wikidata_links b ON e.gid = b.gid AND b.lang = ewf.wp_lang
        ORDER BY b.processed NULLS FIRST, e.id
        LIMIT 500
    """.format(
        entity_table=entity_type_table,
        rel_table=url_relationship_table,
        entity_col=main_entity_entity_point,
        url_col=url_entity_point,
        wikipedia_type=wikipedia_link_type,
        wikidata_type=wikidata_link_type,
    )

    seen = set()
    matched = set()
    for entity in db.execute(query):
        gid = entity['gid']
        # An entity can appear once per Wikipedia link; one match is enough.
        if gid in matched:
            continue
        colored_out(
            bcolors.OKBLUE,
            'Looking up entity "%s" http://musicbrainz.org/%s/%s'
            % (entity['name'], ENTITY_TYPE, gid))
        out(' * wiki:', entity['wp_url'])
        page = WikiPage.fetch(entity['wp_url'], False)
        if page.wikidata_id:
            wikidata_url = 'http://www.wikidata.org/wiki/%s' % page.wikidata_id.upper()
            edit_note = 'From %s' % (entity['wp_url'],)
            colored_out(bcolors.OKGREEN, ' * found WikiData identifier:', wikidata_url)
            time.sleep(1)
            out(' * edit note:', edit_note.replace('\n', ' '))
            mb.add_url(entity_type_table, gid, wikidata_link_type,
                       wikidata_url, edit_note, True)
            matched.add(gid)
        # Record the visit whether or not a match was found. The first visit
        # inserts the tracking row, later ones only refresh its timestamp.
        if entity['processed'] is None and gid not in seen:
            db.execute(
                "INSERT INTO bot_wp_wikidata_links (gid, lang) VALUES (%s, %s)",
                (gid, page.lang))
        else:
            db.execute(
                "UPDATE bot_wp_wikidata_links SET processed = now() WHERE (gid, lang) = (%s, %s)",
                (gid, page.lang))
        seen.add(gid)
    # NOTE(review): indentation was lost in the reviewed text; the counters
    # are assumed to be written once, after the loop.
    stats['seen'][ENTITY_TYPE] = len(seen)
    stats['matched'][ENTITY_TYPE] = len(matched)
def main():
    """Add VIAF URL relationships to artists using their wiki page.

    Iterates over the rows returned by the module-level ``query``, loads each
    artist's page with ``WikiPage.fetch`` and extracts authority identifiers
    with ``determine_authority_identifiers``. When exactly one non-empty VIAF
    identifier is found and the VIAF record still resolves, the URL is
    submitted through ``mb.add_url``. Every row looked at is recorded in
    ``bot_wp_artist_viaf``.

    Returns:
        None.
    """
    # NOTE(review): the reviewed copy had lost its URL literals
    # ("'' % identifiers['VIAF']" and a log format with one placeholder for
    # two arguments), both of which raise TypeError. The prefixes below are
    # restored from context -- confirm them before deploying.
    seen = set()
    matched = set()
    for artist in db.execute(query):
        gid = artist['gid']
        # An artist can appear once per wiki link; one match is enough.
        if gid in matched:
            continue
        colored_out(
            bcolors.OKBLUE,
            'Looking up artist "%s" http://musicbrainz.org/artist/%s'
            % (artist['name'], gid))
        out(' * wiki:', artist['wp_url'])
        page = WikiPage.fetch(artist['wp_url'], False)
        identifiers = determine_authority_identifiers(page)
        if 'VIAF' in identifiers:
            viaf = identifiers['VIAF']
            # Test for None/empty first: None is not a string, so it used to
            # fall into the "multiple" branch and crash in ', '.join(None).
            if viaf is None or viaf == '':
                colored_out(bcolors.FAIL, ' * invalid empty VIAF found')
            elif not isinstance(viaf, basestring):
                colored_out(
                    bcolors.FAIL,
                    ' * multiple VIAF found: %s' % ', '.join(viaf))
            else:
                viaf_url = 'http://viaf.org/viaf/%s' % viaf
                edit_note = 'From %s' % (artist['wp_url'],)
                colored_out(bcolors.OKGREEN, ' * found VIAF:', viaf_url)
                # Check that this VIAF record has not been deleted.
                skip = False
                try:
                    resp, content = httplib2.Http().request(viaf_url)
                except socket.error:
                    colored_out(bcolors.FAIL, ' * timeout!')
                    skip = True
                else:
                    # httplib2 exposes the status as an int; the previous
                    # comparison against the string '404' never matched.
                    if resp.status == 404 or 'abandonedViafRecord' in content:
                        colored_out(bcolors.FAIL, ' * deleted VIAF!')
                        skip = True
                if not skip:
                    time.sleep(3)
                    out(' * edit note:', edit_note.replace('\n', ' '))
                    mb.add_url('artist', gid,
                               str(VIAF_RELATIONSHIP_TYPES['artist']),
                               viaf_url, edit_note)
                    matched.add(gid)
        # Record the visit whether or not a match was found. The first visit
        # inserts the tracking row, later ones only refresh its timestamp.
        if artist['processed'] is None and gid not in seen:
            db.execute(
                "INSERT INTO bot_wp_artist_viaf (gid, lang) VALUES (%s, %s)",
                (gid, page.lang))
        else:
            db.execute(
                "UPDATE bot_wp_artist_viaf SET processed = now() WHERE (gid, lang) = (%s, %s)",
                (gid, page.lang))
        seen.add(gid)
def main():
    """Submit VIAF links for artists whose wiki page lists a VIAF identifier.

    For each row of the module-level ``query`` the artist's page is loaded
    (``WikiPage.fetch``) and scanned by ``determine_authority_identifiers``.
    A single, non-empty VIAF identifier whose VIAF page is still live is
    added with ``mb.add_url``; every visited artist is tracked in the
    ``bot_wp_artist_viaf`` table.

    Returns:
        None.
    """
    # NOTE(review): URL literals were missing from the reviewed copy (empty
    # format string for the VIAF URL, log message with too few placeholders),
    # so both expressions raised TypeError. The restored prefixes are
    # inferred from context -- please confirm.
    seen = set()
    matched = set()
    for artist in db.execute(query):
        artist_gid = artist['gid']
        if artist_gid in matched:
            continue  # already linked through another of its wiki URLs

        colored_out(bcolors.OKBLUE,
                    'Looking up artist "%s" http://musicbrainz.org/artist/%s'
                    % (artist['name'], artist_gid))
        out(' * wiki:', artist['wp_url'])
        page = WikiPage.fetch(artist['wp_url'], False)
        identifiers = determine_authority_identifiers(page)

        if 'VIAF' in identifiers:
            viaf_id = identifiers['VIAF']
            if viaf_id is None or viaf_id == '':
                # Checked before the type test: None is not a basestring and
                # previously crashed in the ', '.join() of the next branch.
                colored_out(bcolors.FAIL, ' * invalid empty VIAF found')
            elif not isinstance(viaf_id, basestring):
                colored_out(bcolors.FAIL,
                            ' * multiple VIAF found: %s' % ', '.join(viaf_id))
            else:
                viaf_url = 'http://viaf.org/viaf/%s' % viaf_id
                edit_note = 'From %s' % (artist['wp_url'],)
                colored_out(bcolors.OKGREEN, ' * found VIAF:', viaf_url)

                # Check that this VIAF record has not been deleted.
                usable = True
                try:
                    resp, content = httplib2.Http().request(viaf_url)
                except socket.error:
                    colored_out(bcolors.FAIL, ' * timeout!')
                    usable = False
                else:
                    # resp.status is an integer in httplib2, so it must be
                    # compared with 404, not with the string '404'.
                    if resp.status == 404 or 'abandonedViafRecord' in content:
                        colored_out(bcolors.FAIL, ' * deleted VIAF!')
                        usable = False

                if usable:
                    time.sleep(3)
                    out(' * edit note:', edit_note.replace('\n', ' '))
                    mb.add_url('artist', artist_gid,
                               str(VIAF_RELATIONSHIP_TYPES['artist']),
                               viaf_url, edit_note)
                    matched.add(artist_gid)

        # Track the visit: insert on first sight, refresh the timestamp after.
        if artist['processed'] is None and artist_gid not in seen:
            db.execute("INSERT INTO bot_wp_artist_viaf (gid, lang) VALUES (%s, %s)",
                       (artist_gid, page.lang))
        else:
            db.execute("UPDATE bot_wp_artist_viaf SET processed = now() WHERE (gid, lang) = (%s, %s)",
                       (artist_gid, page.lang))
        seen.add(artist_gid)
def main(ENTITY_TYPE):
    """Add Wikidata URL relationships to entities that only have a Wikipedia one.

    Queries up to 500 entities of ``ENTITY_TYPE`` whose Wikipedia URL
    (matched with the regex ``^https?://[a-z-]+\\.wikipedia\\.org/wiki/``)
    has no pending edits, is not attached to another entity, and which have
    no Wikidata relationship yet. Each page is loaded with
    ``WikiPage.fetch``; when it carries a ``wikidata_id`` the URL is
    submitted with ``mb.add_url``. Visits are tracked in
    ``bot_wp_wikidata_links`` and the totals are stored in ``stats``.

    Args:
        ENTITY_TYPE: entity type name. Dashes are replaced with underscores
            to obtain the table name. Must be a key of both
            ``WIKIPEDIA_RELATIONSHIP_TYPES`` and
            ``WIKIDATA_RELATIONSHIP_TYPES`` (``KeyError`` otherwise).

    Returns:
        None.
    """
    # NOTE(review): the reviewed copy of this function had lost its URL
    # literals and several SQL column references ("SELECT DISTINCT AS
    # entity_id", "JOIN url u ON = l.entity1", "'' % page.wikidata_id"),
    # leaving invalid SQL and format strings that raise TypeError. They are
    # restored below from the surrounding aliases -- confirm the column names
    # and the URL prefixes before deploying.
    entity_type_table = ENTITY_TYPE.replace('-', '_')
    is_work = ENTITY_TYPE == 'work'
    # Works use table l_url_work, where the URL is entity0 and the work is
    # entity1; every other type uses l_<type>_url with the roles swapped.
    url_relationship_table = (
        'l_url_%s' % entity_type_table if is_work
        else 'l_%s_url' % entity_type_table)
    main_entity_entity_point = 'entity1' if is_work else 'entity0'
    url_entity_point = 'entity0' if is_work else 'entity1'
    wikipedia_link_type = str(WIKIPEDIA_RELATIONSHIP_TYPES[ENTITY_TYPE])
    wikidata_link_type = str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE])

    # Identifiers cannot be bound as parameters, hence the formatting. The
    # backslashes are doubled so that the SQL text contains "\." without
    # relying on Python passing unknown escapes through.
    query = """
        WITH
            entities_wo_wikidata AS (
                SELECT DISTINCT e.id AS entity_id, e.gid AS entity_gid, u.url AS wp_url,
                       substring(u.url from '//(([a-z]|-)+)\\.') as wp_lang
                FROM {entity_table} e
                    JOIN {rel_table} l ON l.{entity_col} = e.id
                        AND l.link IN (SELECT id FROM link WHERE link_type = {wikipedia_type})
                    JOIN url u ON u.id = l.{url_col}
                        AND u.url ~ '^https?://[a-z-]+\\.wikipedia\\.org/wiki/'
                WHERE
                    /* No existing Wikidata relationship for this entity */
                    NOT EXISTS (SELECT 1 FROM {rel_table} ol
                                WHERE ol.{entity_col} = e.id
                                  AND ol.link IN (SELECT id FROM link WHERE link_type = {wikidata_type}))
                    /* WP link should only be linked to this entity */
                    AND NOT EXISTS (SELECT 1 FROM {rel_table} ol
                                    WHERE ol.{url_col} = u.id AND ol.{entity_col} <> e.id)
                    AND l.edits_pending = 0
            )
        SELECT e.id, e.gid, e.name, ewf.wp_url, b.processed
        FROM entities_wo_wikidata ewf
        JOIN {entity_table} e ON ewf.entity_id = e.id
        LEFT JOIN bot_wp_wikidata_links b ON e.gid = b.gid AND b.lang = ewf.wp_lang
        ORDER BY b.processed NULLS FIRST, e.id
        LIMIT 500
    """.format(
        entity_table=entity_type_table,
        rel_table=url_relationship_table,
        entity_col=main_entity_entity_point,
        url_col=url_entity_point,
        wikipedia_type=wikipedia_link_type,
        wikidata_type=wikidata_link_type,
    )

    seen = set()
    matched = set()
    for entity in db.execute(query):
        gid = entity['gid']
        # An entity can appear once per Wikipedia link; one match is enough.
        if gid in matched:
            continue
        colored_out(
            bcolors.OKBLUE,
            'Looking up entity "%s" http://musicbrainz.org/%s/%s'
            % (entity['name'], ENTITY_TYPE, gid))
        out(' * wiki:', entity['wp_url'])
        page = WikiPage.fetch(entity['wp_url'], False)
        if page.wikidata_id:
            wikidata_url = 'https://www.wikidata.org/wiki/%s' % page.wikidata_id.upper()
            edit_note = 'From %s' % (entity['wp_url'],)
            colored_out(bcolors.OKGREEN, ' * found Wikidata identifier:', wikidata_url)
            time.sleep(1)
            out(' * edit note:', edit_note.replace('\n', ' '))
            mb.add_url(entity_type_table, gid, wikidata_link_type,
                       wikidata_url, edit_note, True)
            matched.add(gid)
        # Record the visit whether or not a match was found. The first visit
        # inserts the tracking row, later ones only refresh its timestamp.
        if entity['processed'] is None and gid not in seen:
            db.execute(
                "INSERT INTO bot_wp_wikidata_links (gid, lang) VALUES (%s, %s)",
                (gid, page.lang))
        else:
            db.execute(
                "UPDATE bot_wp_wikidata_links SET processed = now() WHERE (gid, lang) = (%s, %s)",
                (gid, page.lang))
        seen.add(gid)
    # NOTE(review): indentation was lost in the reviewed text; the counters
    # are assumed to be written once, after the loop.
    stats['seen'][ENTITY_TYPE] = len(seen)
    stats['matched'][ENTITY_TYPE] = len(matched)
category_re['en'] = re.compile(r'\[\[Category:(.+?)(?:\|.*?)?\]\]') category_re['fr'] = re.compile(r'\[\[Cat\xe9gorie:(.+?)\]\]') for rg_id, rg_gid, rg_name, ac_name, rg_sec_types, processed in db.execute(query, query_params): colored_out(bcolors.OKBLUE, 'Looking up release group "%s"' % (rg_name, rg_gid)) matches = wps.query(escape_query(rg_name), defType='dismax', qf='name', rows=100).results last_wp_request = time.time() for match in matches: title = match['name'] if mangle_name(re.sub(' \(.+\)$', '', title)) != mangle_name(rg_name) and mangle_name(title) != mangle_name(rg_name): continue delay = time.time() - last_wp_request if delay < 1.0: time.sleep(1.0 - delay) last_wp_request = time.time() wikipage = WikiPage.fetch('' % (wp_lang, title)) page_orig = wikipage.text if not page_orig: continue page_title = title colored_out(bcolors.HEADER, ' * trying article %s' % (title,)) page = mangle_name(page_orig) is_canonical, reason = wp_is_canonical_page(title, page_orig) if (not is_canonical): out(' * %s, skipping' % reason) continue categories = category_re[wp_lang].findall(page_orig) is_album_page = False for category in categories:
qf='name', rows=50).results last_wp_request = time.time() for match in matches: title = match['name'] if title.endswith('album)') or title.endswith('song)'): continue if mangle_name(re.sub(' \(.+\)$', '', title)) != mangle_name( artist['name']) and mangle_name(title) != mangle_name( artist['name']): continue delay = time.time() - last_wp_request if delay < 1.0: time.sleep(1.0 - delay) last_wp_request = time.time() wikipage = WikiPage.fetch('' % (wp_lang, title)) page_orig = wikipage.text if not page_orig: continue out(' * trying article "%s"' % (title, )) page = mangle_name(page_orig) is_canonical, reason = wp_is_canonical_page(title, page_orig) if (not is_canonical): out(' * %s, skipping' % reason) continue if 'infoboxalbum' in page: out(' * album page, skipping') continue page_title = title
def main(ENTITY_TYPE):
    """Add Wikidata URL relationships to entities with an en/fr Wikipedia link.

    Queries up to 250 entities of ``ENTITY_TYPE`` that have a Wikipedia URL
    relationship without pending edits whose two-letter language code
    (characters 8-9 of the URL) is ``en`` or ``fr``, that have no Wikidata
    relationship yet, and whose Wikipedia URL is not attached to another
    entity. Each page is loaded with ``WikiPage.fetch``; when it carries a
    ``wikidata_id`` the URL is submitted with ``mb.add_url``. Visits are
    tracked in ``bot_wp_wikidata_links``.

    Args:
        ENTITY_TYPE: entity type name. Dashes are replaced with underscores
            to obtain the table name. Must be a key of both
            ``WIKIPEDIA_RELATIONSHIP_TYPES`` and
            ``WIKIDATA_RELATIONSHIP_TYPES`` (``KeyError`` otherwise).

    Returns:
        None.
    """
    # NOTE(review): the reviewed copy of this function had lost its URL
    # literals and several SQL column references ("SELECT DISTINCT AS
    # entity_id", "u.url LIKE ''", '"" % page.wikidata_id'), leaving invalid
    # SQL and format strings that raise TypeError. They are restored below
    # from the surrounding aliases; the "http://" prefix in the LIKE pattern
    # is implied by "substring(u.url from 8 for 2)" reading the language
    # code. Confirm all restored text before deploying.
    entity_type_table = ENTITY_TYPE.replace("-", "_")
    if ENTITY_TYPE != "work":
        url_relationship_table = "l_%s_url" % entity_type_table
        main_entity_entity_point = "entity0"
        url_entity_point = "entity1"
    else:
        # For works the relationship table is l_url_work, so the URL is
        # entity0 and the work is entity1.
        url_relationship_table = "l_url_%s" % entity_type_table
        main_entity_entity_point = "entity1"
        url_entity_point = "entity0"
    wikipedia_link_type = str(WIKIPEDIA_RELATIONSHIP_TYPES[ENTITY_TYPE])
    wikidata_link_type = str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE])

    # Identifiers cannot be bound as parameters, hence the formatting. "%%"
    # is a plain "%" wildcard if the driver unescapes it and an equivalent
    # double wildcard if it does not.
    query = """
        WITH
            entities_wo_wikidata AS (
                SELECT DISTINCT e.id AS entity_id, e.gid AS entity_gid, u.url AS wp_url
                FROM {entity_table} e
                    JOIN {rel_table} l ON l.{entity_col} = e.id
                        AND l.link IN (SELECT id FROM link WHERE link_type = {wikipedia_type})
                    JOIN url u ON u.id = l.{url_col}
                        AND u.url LIKE 'http://%%.wikipedia.org/wiki/%%'
                        AND substring(u.url from 8 for 2) IN ('en', 'fr')
                WHERE
                    /* No existing WikiData relationship for this entity */
                    NOT EXISTS (SELECT 1 FROM {rel_table} ol
                                WHERE ol.{entity_col} = e.id
                                  AND ol.link IN (SELECT id FROM link WHERE link_type = {wikidata_type}))
                    /* WP link should only be linked to this entity */
                    AND NOT EXISTS (SELECT 1 FROM {rel_table} ol
                                    WHERE ol.{url_col} = u.id AND ol.{entity_col} <> e.id)
                    AND l.edits_pending = 0
            )
        SELECT e.id, e.gid, e.name, ewf.wp_url, b.processed
        FROM entities_wo_wikidata ewf
        JOIN s_{entity_table} e ON ewf.entity_id = e.id
        LEFT JOIN bot_wp_wikidata_links b ON e.gid = b.gid AND b.lang = substring(ewf.wp_url from 8 for 2)
        ORDER BY b.processed NULLS FIRST, e.id
        LIMIT 250
    """.format(
        entity_table=entity_type_table,
        rel_table=url_relationship_table,
        entity_col=main_entity_entity_point,
        url_col=url_entity_point,
        wikipedia_type=wikipedia_link_type,
        wikidata_type=wikidata_link_type,
    )

    seen = set()
    matched = set()
    for entity in db.execute(query):
        gid = entity["gid"]
        # An entity can appear once per Wikipedia link; one match is enough.
        if gid in matched:
            continue
        colored_out(
            bcolors.OKBLUE,
            'Looking up entity "%s" http://musicbrainz.org/%s/%s'
            % (entity["name"], ENTITY_TYPE, gid),
        )
        out(" * wiki:", entity["wp_url"])
        page = WikiPage.fetch(entity["wp_url"], False)
        if page.wikidata_id:
            wikidata_url = "http://www.wikidata.org/wiki/%s" % page.wikidata_id.upper()
            edit_note = "From %s" % (entity["wp_url"],)
            colored_out(bcolors.OKGREEN, " * found WikiData identifier:", wikidata_url)
            time.sleep(3)
            out(" * edit note:", edit_note.replace("\n", " "))
            mb.add_url(
                entity_type_table,
                gid,
                wikidata_link_type,
                wikidata_url,
                edit_note,
                True,
            )
            matched.add(gid)
        # Record the visit whether or not a match was found. The first visit
        # inserts the tracking row, later ones only refresh its timestamp.
        if entity["processed"] is None and gid not in seen:
            db.execute(
                "INSERT INTO bot_wp_wikidata_links (gid, lang) VALUES (%s, %s)",
                (gid, page.lang),
            )
        else:
            db.execute(
                "UPDATE bot_wp_wikidata_links SET processed = now() WHERE (gid, lang) = (%s, %s)",
                (gid, page.lang),
            )
        seen.add(gid)
def main():
    """Fill in missing artist fields from the artist's wiki page.

    For each row of the module-level ``query`` (each artist id is handled
    once) the page at ``artist['url']`` is loaded with ``WikiPage.fetch``.
    Area, type, gender, begin date and end date are looked up only when the
    row has no value for them. If anything was found, the artist is edited
    through ``mb.edit_artist`` with an edit note listing the reasons. The
    visit is then recorded in ``bot_wp_artist_data``.

    Returns:
        None.
    """
    seen = set()
    for artist in db.execute(query):
        if artist['id'] in seen:
            continue
        seen.add(artist['id'])
        # NOTE(review): the reviewed copy had one placeholder for two
        # arguments here (TypeError); the URL prefix is restored from
        # context -- confirm.
        colored_out(
            bcolors.OKBLUE,
            'Looking up artist "%s" http://musicbrainz.org/artist/%s'
            % (artist['name'], artist['gid']))
        out(' * wiki:', artist['url'])

        # Work on a plain dict so the new values can be assigned to it.
        artist = dict(artist)
        update = set()
        reasons = []
        page = WikiPage.fetch(artist['url'], False)

        if not artist['area']:
            country, country_reasons = determine_country(page)
            if country:
                artist['area'] = country_ids[country]
                update.add('area')
                reasons.append(('COUNTRY', country_reasons))

        if not artist['type']:
            # Named artist_type so the builtin ``type`` is not shadowed.
            artist_type, type_reasons = determine_type(page)
            if artist_type:
                artist['type'] = artist_type_ids[artist_type]
                update.add('type')
                reasons.append(('TYPE', type_reasons))

        # Gender is only looked up for type id 1 -- presumably "person";
        # confirm against artist_type_ids.
        if not artist['gender'] and artist['type'] == 1:
            gender, gender_reasons = determine_gender(page)
            if gender:
                artist['gender'] = gender_ids[gender]
                update.add('gender')
                reasons.append(('GENDER', gender_reasons))

        is_performance_name = False
        if artist['type'] == 1 and CHECK_PERFORMANCE_NAME:
            is_performance_name = db.execute(
                performance_name_query, artist['id']).scalar() > 0
            out(" * checking for performance name", is_performance_name)

        # Begin date is handled before end date, so the end-date lookup sees
        # a begin date that has just been filled in.
        for field, determine_date in (('begin_date', determine_begin_date),
                                      ('end_date', determine_end_date)):
            if artist[field + '_year']:
                continue
            date, date_reasons = determine_date(artist, page, is_performance_name)
            if date['year']:
                label = field.replace('_', ' ')  # "begin date" / "end date"
                colored_out(bcolors.OKGREEN, " * new %s:" % label, date)
                for part in ('year', 'month', 'day'):
                    artist['%s_%s' % (field, part)] = date[part]
                update.add(field)
                reasons.append((label.upper(), date_reasons))

        if update:
            edit_note = 'From %s' % (artist['url'],) + ''.join(
                '\n\n%s:\n%s' % (field, ' '.join(reason))
                for field, reason in reasons)
            out(' * edit note:', edit_note.replace('\n', ' '))
            # Pause before each edit (presumably rate limiting).
            time.sleep(10)
            mb.edit_artist(artist, update, edit_note)

        # Track the visit: insert on first sight, refresh the timestamp after.
        if artist['processed'] is None:
            db.execute(
                "INSERT INTO bot_wp_artist_data (gid, lang) VALUES (%s, %s)",
                (artist['gid'], wp_lang))
        else:
            db.execute(
                "UPDATE bot_wp_artist_data SET processed = now() WHERE (gid, lang) = (%s, %s)",
                (artist['gid'], wp_lang))
def main():
    """Fill in missing artist fields from the artist's wiki page.

    For each row of the module-level ``query`` (each artist id is handled
    once) the page at ``artist['url']`` is loaded with ``WikiPage.fetch``.
    Country, type, gender, begin date and end date are looked up only when
    the row has no value for them. If anything was found, the artist is
    edited through ``mb.edit_artist`` with an edit note listing the reasons.
    A row is then inserted into ``bot_wp_artist_data`` for the artist.

    Returns:
        None.
    """
    seen = set()
    for artist in db.execute(query):
        if artist['id'] in seen:
            continue
        seen.add(artist['id'])
        # NOTE(review): the reviewed copy had one placeholder for two
        # arguments here (TypeError); the URL prefix is restored from
        # context -- confirm.
        colored_out(bcolors.OKBLUE,
                    'Looking up artist "%s" http://musicbrainz.org/artist/%s'
                    % (artist['name'], artist['gid']))
        out(' * wiki:', artist['url'])

        # Work on a plain dict so the new values can be assigned to it.
        artist = dict(artist)
        update = set()
        reasons = []
        page = WikiPage.fetch(artist['url'])

        if not artist['country']:
            country, country_reasons = determine_country(page)
            if country:
                artist['country'] = country_ids[country]
                update.add('country')
                reasons.append(('COUNTRY', country_reasons))

        if not artist['type']:
            # Named artist_type so the builtin ``type`` is not shadowed.
            artist_type, type_reasons = determine_type(page)
            if artist_type:
                artist['type'] = artist_type_ids[artist_type]
                update.add('type')
                reasons.append(('TYPE', type_reasons))

        # Gender is only looked up for type id 1 -- presumably "person";
        # confirm against artist_type_ids.
        if not artist['gender'] and artist['type'] == 1:
            gender, gender_reasons = determine_gender(page)
            if gender:
                artist['gender'] = gender_ids[gender]
                update.add('gender')
                reasons.append(('GENDER', gender_reasons))

        is_performance_name = False
        if artist['type'] == 1 and CHECK_PERFORMANCE_NAME:
            is_performance_name = db.execute(
                performance_name_query, artist['id']).scalar() > 0
            out(" * checking for performance name", is_performance_name)

        if not artist['begin_date_year']:
            begin_date, begin_date_reasons = determine_begin_date(
                artist, page, is_performance_name)
            if begin_date['year']:
                colored_out(bcolors.OKGREEN, " * new begin date:", begin_date)
                artist['begin_date_year'] = begin_date['year']
                artist['begin_date_month'] = begin_date['month']
                artist['begin_date_day'] = begin_date['day']
                update.add('begin_date')
                reasons.append(('BEGIN DATE', begin_date_reasons))

        if not artist['end_date_year']:
            end_date, end_date_reasons = determine_end_date(
                artist, page, is_performance_name)
            if end_date['year']:
                colored_out(bcolors.OKGREEN, " * new end date:", end_date)
                artist['end_date_year'] = end_date['year']
                artist['end_date_month'] = end_date['month']
                artist['end_date_day'] = end_date['day']
                update.add('end_date')
                reasons.append(('END DATE', end_date_reasons))

        if update:
            note_parts = ['From %s' % (artist['url'],)]
            for field, reason in reasons:
                note_parts.append('\n\n%s:\n%s' % (field, ' '.join(reason)))
            edit_note = ''.join(note_parts)
            out(' * edit note:', edit_note.replace('\n', ' '))
            # Pause before each edit (presumably rate limiting).
            time.sleep(10)
            mb.edit_artist(artist, update, edit_note)

        # The insert is unconditional in this version. NOTE(review): this
        # presumably relies on ``query`` excluding artists that are already
        # tracked -- verify.
        db.execute("INSERT INTO bot_wp_artist_data (gid, lang) VALUES (%s, %s)",
                   (artist['gid'], wp_lang))
        # NOTE(review): indentation was lost in the reviewed text; the blank
        # output line is assumed to be emitted once per artist.
        out()