def determine_type_from_page(page):
    """Classify a page as describing a person and/or a group.

    Evidence is collected, in this order, from the language-specific
    "background" infobox field, the page's persondata and its categories.

    Returns a ``(types, reason)`` tuple: ``types`` is a set that may hold
    "person" and/or "group"; ``reason`` is the space-joined list of
    sentences describing each piece of evidence that was found.
    """
    field_name = infobox_fields["background"][page.lang]
    field_value = page.infobox.get(field_name, "")

    def infobox_evidence():
        # Built on demand so the concatenation only runs for recognised values.
        return 'Infobox has "' + field_name + " = " + field_value + '".'

    kinds = set()
    evidence = []

    if field_value in ("solo_singer", "vocal", "instrumentiste"):
        kinds.add("person")
        evidence.append(infobox_evidence())
    if page.persondata.get("name"):
        kinds.add("person")
        evidence.append('Contains the "Persondata" infobox.')
    if field_value in ("group_or_band", "groupe"):
        kinds.add("group")
        evidence.append(infobox_evidence())

    # French pages are matched on a "Groupe" prefix; every other language
    # falls back to the English category naming patterns.
    if page.lang == "fr":
        def is_group_category(title):
            return title.startswith("Groupe")
    else:
        def is_group_category(title):
            return title.endswith("groups") or title.startswith("Musical groups")

    group_categories = [c for c in page.categories if is_group_category(c)]
    if group_categories:
        kinds.add("group")
        evidence.append("Belongs to %s." % join_names("category", group_categories))

    return kinds, " ".join(evidence)
def determine_type_from_page(page):
    """Work out whether a page is about a person, a group, or both.

    Looks at the language-specific "background" infobox field, at the
    page's persondata and at its categories, in that order.

    Returns ``(types, reason)`` where ``types`` is a set drawn from
    'person' / 'group' and ``reason`` joins the explanatory sentences
    with single spaces.
    """
    person_backgrounds = ('solo_singer', 'vocal', 'instrumentiste')
    group_backgrounds = ('group_or_band', 'groupe')

    lang = page.lang
    bg_field = infobox_fields['background'][lang]
    bg_value = page.infobox.get(bg_field, '')

    detected = set()
    explanations = []

    if bg_value in person_backgrounds:
        detected.add('person')
        explanations.append('Infobox has "' + bg_field + ' = ' + bg_value + '".')

    if page.persondata.get('name'):
        detected.add('person')
        explanations.append('Contains the "Persondata" infobox.')

    if bg_value in group_backgrounds:
        detected.add('group')
        explanations.append('Infobox has "' + bg_field + ' = ' + bg_value + '".')

    # Category test: "Groupe..." prefix on French pages, the English
    # "...groups" / "Musical groups..." patterns everywhere else.
    group_categories = []
    for title in page.categories:
        if lang == 'fr':
            matches = title.startswith('Groupe')
        else:
            matches = title.endswith('groups') or title.startswith('Musical groups')
        if matches:
            detected.add('group')
            group_categories.append(title)

    if group_categories:
        explanations.append(
            'Belongs to %s.' % join_names('category', group_categories))

    return detected, ' '.join(explanations)
def determine_country_from_text(page):
    """Collect countries linked from the page abstract.

    ``find_countries_in_text`` is handed an empty set and an empty list to
    fill from ``page.abstract`` (with ``page.lang`` as the language).

    Returns ``(countries, reason)``; ``reason`` is always built, even when
    nothing was found.
    """
    found = set()
    links = []
    find_countries_in_text(found, links, page.abstract, page.lang)
    return found, 'The first paragraph links to %s.' % join_names('', links)
def determine_country_from_infobox(infobox):
    """Collect countries linked from the origin/birth fields of an infobox.

    The 'origin', 'born' and 'birth_place' values (empty string when a
    field is missing) are passed one by one to ``find_countries_in_text``,
    which fills the shared result set and link list.

    Returns ``(countries, reason)``.
    """
    found = set()
    links = []
    for key in ('origin', 'born', 'birth_place'):
        find_countries_in_text(found, links, infobox.get(key, ''))
    return found, 'Infobox links to %s.' % join_names('', links)
def determine_country_from_infobox(page):
    """Collect countries linked from the page's country-related infobox fields.

    The field names come from ``infobox_fields['country'][page.lang]`` and
    are decoded from UTF-8 before being looked up in ``page.infobox``
    (missing fields yield an empty string).

    Returns ``(countries, reason)``.
    """
    found = set()
    links = []
    for raw_name in infobox_fields['country'][page.lang]:
        value = page.infobox.get(raw_name.decode('utf8'), '')
        # (A debug print of non-empty field values used to live here.)
        find_countries_in_text(found, links, value, page.lang)
    return found, 'Infobox links to %s.' % join_names('', links)
def determine_gender_from_categories(categories):
    """Infer genders from category names containing "male" or "female".

    Each category is tested case-insensitively for the whole words "male"
    and "female"; a category is recorded once per word it matches.

    Returns ``(genders, reason)``.
    """
    word_patterns = (
        (r'\bmale\b', 'male'),
        (r'\bfemale\b', 'female'),
    )
    detected = set()
    matched = []
    for title in categories:
        for pattern, gender in word_patterns:
            if re.search(pattern, title, re.I):
                detected.add(gender)
                matched.append(title)
    return detected, 'Belongs to %s.' % join_names('category', matched)
def determine_country_from_categories(categories):
    """Infer countries from category names.

    Underscores are replaced by spaces, then each category is compared with
    every ``category_countries`` name (as a prefix followed by a space) and
    with every ``link_us_states`` entry (as a "from <state>" suffix, which
    maps to 'US'). A category is recorded once per rule it satisfies.

    Returns ``(countries, reason, number_of_recorded_categories)``.
    """
    codes = set()
    matched = []
    for raw_title in categories:
        title = raw_title.replace('_', ' ')
        for prefix, code in category_countries.iteritems():
            if not title.startswith(prefix + ' '):
                continue
            codes.add(code)
            matched.append(title)
        for state in link_us_states:
            if not title.endswith('from ' + state):
                continue
            codes.add('US')
            matched.append(title)
    explanation = 'Belongs to %s.' % join_names('category', matched)
    return codes, explanation, len(matched)
def determine_country_from_categories(page):
    """Infer countries from the page's category names.

    After replacing underscores with spaces, a category matches a country
    when any UTF-8-decoded demonym for ``page.lang`` occurs anywhere in it,
    and matches 'US' when it ends with "from <state>" for an entry of
    ``wp_us_states_links``. A category is recorded once per match.

    Returns ``(countries, reason, number_of_recorded_categories)``.
    """
    codes = set()
    matched = []
    for raw_title in page.categories:
        title = raw_title.replace('_', ' ')
        for demonym, code in demonyms[page.lang].iteritems():
            if demonym.decode('utf8') not in title:
                continue
            codes.add(code)
            matched.append(title)
        for state in wp_us_states_links:
            if not title.endswith('from ' + state):
                continue
            codes.add('US')
            matched.append(title)
    explanation = 'Belongs to %s.' % join_names('category', matched)
    return codes, explanation, len(matched)
def determine_country_from_categories(page):
    """Derive country codes from the categories a page belongs to.

    Category names have underscores turned into spaces. Every demonym of
    ``demonyms[page.lang]`` (decoded from UTF-8) that occurs in the name
    contributes its country code; every ``wp_us_states_links`` entry that
    the name ends with (as "from <state>") contributes "US". The category
    is appended to the evidence list for each individual hit.

    Returns ``(countries, reason, evidence_count)``.
    """
    country_codes = set()
    evidence = []

    def record(code, category_name):
        country_codes.add(code)
        evidence.append(category_name)

    for original_name in page.categories:
        category_name = original_name.replace("_", " ")
        for demonym, code in demonyms[page.lang].iteritems():
            if demonym.decode("utf8") in category_name:
                record(code, category_name)
        for state in wp_us_states_links:
            if category_name.endswith("from " + state):
                record("US", category_name)

    reason = "Belongs to %s." % join_names("category", evidence)
    return country_codes, reason, len(evidence)
def determine_gender_from_categories(page):
    """Infer genders from the page's category names.

    Four case-insensitive patterns are tried on every category, in order:
    the whole words "male" and "female", then the French prefixes
    Chanteur/Acteur/Animateur (male) and Chanteuse/Actrice/Animatrice
    (female). A category is recorded once per pattern it matches.

    Returns ``(genders, reason)``.
    """
    rules = (
        (r"\bmale\b", "male"),
        (r"\bfemale\b", "female"),
        (r"^(Chanteur|Acteur|Animateur)\b", "male"),
        (r"^(Chanteuse|Actrice|Animatrice)\b", "female"),
    )
    detected = set()
    matched = []
    for title in page.categories:
        for pattern, gender in rules:
            if re.search(pattern, title, re.I):
                detected.add(gender)
                matched.append(title)
    return detected, "Belongs to %s." % join_names("category", matched)
def determine_gender_from_categories(page):
    """Guess genders from the categories listed on a page.

    Every category is checked (ignoring case) against the words "male" and
    "female" and against the French masculine/feminine occupation prefixes.
    Each successful check adds the gender and appends the category to the
    evidence list, so one category may appear several times.

    Returns ``(genders, reason)``.
    """
    gender_patterns = [
        ('male', r'\bmale\b'),
        ('female', r'\bfemale\b'),
        ('male', r'^(Chanteur|Acteur|Animateur)\b'),
        ('female', r'^(Chanteuse|Actrice|Animatrice)\b'),
    ]
    genders_found = set()
    evidence = []
    for category_name in page.categories:
        hits = [gender for gender, pattern in gender_patterns
                if re.search(pattern, category_name, re.I)]
        genders_found.update(hits)
        evidence.extend([category_name] * len(hits))
    reason = 'Belongs to %s.' % join_names('category', evidence)
    return genders_found, reason
def determine_type_from_page(page):
    """Classify a page as describing a person and/or a group.

    Evidence comes, in order, from the infobox "background" value, the
    page's persondata and categories ending in "groups" or starting with
    "Musical groups".

    Returns ``(types, reason)``: a set drawn from 'person' / 'group' and
    the space-joined explanatory sentences (empty string when none apply).
    """
    kinds = set()
    evidence = []
    infobox_background = page.infobox.get('background', '')

    if infobox_background == 'solo_singer':
        kinds.add('person')
        evidence.append('Infobox has "background = solo_singer".')
    if page.persondata.get('name'):
        kinds.add('person')
        evidence.append('Contains the "Persondata" infobox.')
    if infobox_background == 'group_or_band':
        kinds.add('group')
        evidence.append('Infobox has "background = group_or_band".')

    group_categories = [
        title for title in page.categories
        if title.endswith('groups') or title.startswith('Musical groups')
    ]
    if group_categories:
        kinds.add('group')
        evidence.append(
            'Belongs to %s.' % join_names('category', group_categories))

    return kinds, ' '.join(evidence)
# Fragment of the record-label matching loop; the enclosing loop and function are outside this view.
# Skips the candidate when "disambiguationpages" occurs in `page` or "recordlabels" does not.
# Loads the label's artist names (minus `name` itself), skips when none remain, and collects the
# artists whose mangled name is longer than 5 characters and occurs in `page`.
# With fewer than two hits the candidate is skipped; otherwise it sleeps 60 seconds, submits the
# English Wikipedia URL through mb.add_url("label", gid, 216, ...) and breaks out of the loop.
# The trailing db.execute inserts `gid` into bot_wp_label (presumably after the page loop — TODO confirm).
# NOTE(review): `ratio` is only printed, never compared against a threshold here.
# NOTE(review): this code is collapsed onto one physical line; newlines/indentation must be restored before it can run.
if "disambiguationpages" in page: print " * disambiguation or album page, skipping" continue if "recordlabels" not in page: print " * not a record label page, skipping" continue page_title = pages[0]["title"] print ' * trying article "%s"' % (page_title,) artists = set([r[0] for r in db.execute(query_label_artists, (id,))]) if name in artists: artists.remove(name) if not artists: continue found_artists = [] for artist in artists: mangled_artist = mangle_name(artist) if len(mangled_artist) > 5 and mangled_artist in page: found_artists.append(artist) ratio = len(found_artists) * 1.0 / len(artists) print " * ratio: %s, has artists: %s, found artists: %s" % (ratio, len(artists), len(found_artists)) if len(found_artists) < 2: continue url = "https://en.wikipedia.org/wiki/%s" % (quote_page_title(page_title),) text = "Matched based on the name. The page mentions %s." % (join_names("artist", found_artists),) print " * linking to %s" % (url,) print " * edit note: %s" % (text,) time.sleep(60) mb.add_url("label", gid, 216, url, text) break db.execute("INSERT INTO bot_wp_label (gid) VALUES (%s)", (gid,))
# Fragment of the artist matching loop; the enclosing loop and function are outside this view.
# Albums: loads the artist's album names (the artist id is passed twice as query parameters), drops
# albums whose mangled name contains the mangled artist name, skips the candidate when none remain,
# and collects albums whose mangled name is longer than 6 characters and occurs in `page`.
# Works: rebinds `page` to mangle_name(page_orig) and collects works whose mangled name occurs in it
# (no minimum length is applied to works).
# Each non-empty result list is appended to `reasons` via join_names and the counts are logged with out().
# NOTE(review): this code is collapsed onto one physical line, so everything after the leading
# "# Examine albums" is currently read as a comment; newlines/indentation must be restored.
# Examine albums found_albums = [] albums = set([r[0] for r in db.execute(query_artist_albums, (artist['id'],) * 2)]) albums_to_ignore = set() for album in albums: if mangle_name(artist['name']) in mangle_name(album): albums_to_ignore.add(album) albums -= albums_to_ignore if not albums: continue for album in albums: mangled_album = mangle_name(album) if len(mangled_album) > 6 and mangled_album in page: found_albums.append(album) if (found_albums): reasons.append(join_names('album', found_albums)) out(' * has albums: %s, found albums: %s' % (len(albums), len(found_albums))) # Examine works found_works = [] page = mangle_name(page_orig) works = set([r[0] for r in db.execute(query_artist_works, (artist['id'],) * 2)]) for work in works: mangled_work = mangle_name(work) if mangled_work in page: found_works.append(work) if (found_works): reasons.append(join_names('work', found_works)) out(' * has works: %s, found works: %s' % (len(works), len(found_works))) # Examine urls
# Fragment of the artist matching loop; the enclosing loop/function and the initialisation of
# `found_albums` and `reasons` are outside this view.
# Albums: loads the artist's album names (the artist id is passed twice as query parameters), drops
# albums whose mangled name contains the mangled artist name, skips the candidate when none remain,
# and collects albums whose mangled name is longer than 6 characters and occurs in `page`.
# Works: rebinds `page` to mangle_name(page_orig) and collects works whose mangled name occurs in it
# (no minimum length is applied to works).
# Each non-empty result list is appended to `reasons` via join_names.
# NOTE(review): this code is collapsed onto one physical line, so the text after the inline
# "# Examine works" is currently read as a comment; newlines/indentation must be restored.
albums = set([ r[0] for r in db.execute(query_artist_albums, (artist['id'], ) * 2) ]) albums_to_ignore = set() for album in albums: if mangle_name(artist['name']) in mangle_name(album): albums_to_ignore.add(album) albums -= albums_to_ignore if not albums: continue for album in albums: mangled_album = mangle_name(album) if len(mangled_album) > 6 and mangled_album in page: found_albums.append(album) if (found_albums): reasons.append(join_names('album', found_albums)) out(' * has albums: %s, found albums: %s' % (len(albums), len(found_albums))) # Examine works found_works = [] page = mangle_name(page_orig) works = set([ r[0] for r in db.execute(query_artist_works, (artist['id'], ) * 2) ]) for work in works: mangled_work = mangle_name(work) if mangled_work in page: found_works.append(work) if (found_works): reasons.append(join_names('work', found_works))
# Fragment of the record-label matching loop; it starts inside a condition whose header is outside this view.
# Skips the candidate when 'recordlabels' does not occur in `page`, loads the label's artist names
# (minus `name` itself), skips when none remain, and collects the artists whose mangled name is
# longer than 5 characters and occurs in `page`.
# With fewer than two hits the candidate is skipped; otherwise it sleeps 60 seconds, submits the
# English Wikipedia URL through mb.add_url("label", gid, 216, ...) and breaks out of the loop.
# The URL is built by hand here: the title is UTF-8 encoded, spaces become underscores, then urllib.quote is applied.
# The trailing db.execute inserts `gid` into bot_wp_label (presumably after the page loop — TODO confirm).
# NOTE(review): this code is collapsed onto one physical line; newlines/indentation must be restored before it can run.
print ' * disambiguation or album page, skipping' continue if 'recordlabels' not in page: print ' * not a record label page, skipping' continue page_title = pages[0]['title'] print ' * trying article "%s"' % (page_title,) artists = set([r[0] for r in db.execute(query_label_artists, (id,))]) if name in artists: artists.remove(name) if not artists: continue found_artists = [] for artist in artists: mangled_artist = mangle_name(artist) if len(mangled_artist) > 5 and mangled_artist in page: found_artists.append(artist) ratio = len(found_artists) * 1.0 / len(artists) print ' * ratio: %s, has artists: %s, found artists: %s' % (ratio, len(artists), len(found_artists)) if len(found_artists) < 2: continue url = 'http://en.wikipedia.org/wiki/%s' % (urllib.quote(page_title.encode('utf8').replace(' ', '_')),) text = 'Matched based on the name. The page mentions %s.' % (join_names('artist', found_artists),) print ' * linking to %s' % (url,) print ' * edit note: %s' % (text,) time.sleep(60) mb.add_url("label", gid, 216, url, text) break db.execute("INSERT INTO bot_wp_label (gid) VALUES (%s)", (gid,))
# Fragment of the record-label matching loop; the enclosing loop and function are outside this view.
# Skips the candidate when 'recordlabels' does not occur in `page`, loads the label's artist names
# (minus `name` itself), skips when none remain, and collects the artists whose mangled name is
# longer than 5 characters and occurs in `page`.
# With fewer than two hits the candidate is skipped; otherwise it sleeps 60 seconds, submits the
# English Wikipedia URL (built with quote_page_title) through mb.add_url("label", gid, 216, ...)
# and breaks out of the loop.
# The trailing db.execute inserts `gid` into bot_wp_label (presumably after the page loop — TODO confirm).
# NOTE(review): this code is collapsed onto one physical line; newlines/indentation must be restored before it can run.
if 'recordlabels' not in page: print ' * not a record label page, skipping' continue page_title = pages[0]['title'] print ' * trying article "%s"' % (page_title, ) artists = set([r[0] for r in db.execute(query_label_artists, (id, ))]) if name in artists: artists.remove(name) if not artists: continue found_artists = [] for artist in artists: mangled_artist = mangle_name(artist) if len(mangled_artist) > 5 and mangled_artist in page: found_artists.append(artist) ratio = len(found_artists) * 1.0 / len(artists) print ' * ratio: %s, has artists: %s, found artists: %s' % ( ratio, len(artists), len(found_artists)) if len(found_artists) < 2: continue url = 'https://en.wikipedia.org/wiki/%s' % ( quote_page_title(page_title), ) text = 'Matched based on the name. The page mentions %s.' % ( join_names('artist', found_artists), ) print ' * linking to %s' % (url, ) print ' * edit note: %s' % (text, ) time.sleep(60) mb.add_url("label", gid, 216, url, text) break db.execute("INSERT INTO bot_wp_label (gid) VALUES (%s)", (gid, ))
# Fragment of the Japanese-Wikipedia artist matching loop; the enclosing loop and function are outside this view.
# Loads the artist's album names (`id` is passed twice as query parameters), drops albums whose
# mangled name contains the mangled artist name, skips the candidate when none remain, and collects
# albums whose mangled name is longer than 4 characters and occurs in `page`.
# The candidate is skipped when fewer than two albums were found or when found/total is below 0.2.
# Otherwise the ja.wikipedia.org URL is submitted through mb.add_url("artist", gid, 179, ...) and the loop is left.
# The trailing statements insert `gid` into bot_wp_artist_ja and print `processed, skipped`
# (presumably outside the page loop — TODO confirm).
# NOTE(review): this code is collapsed onto one physical line; newlines/indentation must be restored before it can run.
page_title = pages[0]["title"] found_albums = [] albums = set([r[0] for r in db.execute(query_artist_albums, (id, id))]) albums_to_ignore = set() for album in albums: if mangle_name(name) in mangle_name(album): albums_to_ignore.add(album) albums -= albums_to_ignore if not albums: continue for album in albums: mangled_album = mangle_name(album) if len(mangled_album) > 4 and mangled_album in page: found_albums.append(album) ratio = len(found_albums) * 1.0 / len(albums) print " * ratio: %s, has albums: %s, found albums: %s" % (ratio, len(albums), len(found_albums)) min_ratio = 0.2 if len(found_albums) < 2: continue if ratio < min_ratio: continue url = "http://ja.wikipedia.org/wiki/%s" % (quote_page_title(page_title),) text = "Matched based on the name. The page mentions %s." % (join_names("album", found_albums),) print " * linking to %s" % (url,) print " * edit note: %s" % (text,) mb.add_url("artist", gid, 179, url, text) break db.execute("INSERT INTO bot_wp_artist_ja (gid) VALUES (%s)", (gid,)) print processed, skipped
# Fragment of the release-group / Wikipedia matching loop; the enclosing loop/function and the
# definitions of `found_tracks`, `url`, `page` and `processed` are outside this view.
# Loads the release group's track names and ignores tracks whose mangled name has 4 characters or
# fewer, or contains the mangled release-group name; the candidate is skipped when fewer than 5 tracks remain.
# Collects tracks whose mangled name occurs in `page` and computes found/total as `ratio`.
# The required ratio is 0.7 when rg_name is longer than 4 characters and 1.0 otherwise.
# `auto` is true when ratio exceeds 0.75 and rg_sec_types is None or contains neither
# 'Compilation' nor 'Soundtrack'.
# After a 5 second sleep the URL is submitted through mb.add_url("release_group", rg_gid, 89, ...).
# Finally bot_wp_rg_link gets a new (gid, lang) row when `processed` is None, otherwise its
# `processed` timestamp is updated.
# NOTE(review): this code is collapsed onto one physical line; newlines/indentation must be restored before it can run.
tracks = set([r[0] for r in db.execute(query_album_tracks, (rg_id,))]) tracks_to_ignore = set() for track in tracks: mangled_track = mangle_name(track) if len(mangled_track) <= 4 or mangle_name(rg_name) in mangle_name(track): tracks_to_ignore.add(track) tracks -= tracks_to_ignore if len(tracks) < 5: continue for track in tracks: mangled_track = mangle_name(track) if len(mangled_track) > 4 and mangled_track in page: found_tracks.append(track) ratio = len(found_tracks) * 1.0 / len(tracks) out(' * ratio: %s, has tracks: %s, found tracks: %s' % (ratio, len(tracks), len(found_tracks))) min_ratio = 0.7 if len(rg_name) > 4 else 1.0 if ratio < min_ratio: colored_out(bcolors.WARNING, ' => ratio too low (min = %s)' % min_ratio) continue auto = ratio > 0.75 and (rg_sec_types is None or ('Compilation' not in rg_sec_types and 'Soundtrack' not in rg_sec_types)) text = 'Matched based on the name. The page mentions artist "%s" and %s.' % (ac_name, join_names('track', found_tracks),) colored_out(bcolors.OKGREEN, ' * linking to %s' % (url,)) out(' * edit note: %s' % (text,)) time.sleep(5) mb.add_url("release_group", rg_gid, 89, url, text, auto=auto) break if processed is None: db.execute("INSERT INTO bot_wp_rg_link (gid, lang) VALUES (%s, %s)", (rg_gid, wp_lang)) else: db.execute("UPDATE bot_wp_rg_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (rg_gid, wp_lang))
# Fragment of the release-group / Wikipedia matching loop; it starts inside a per-track loop whose
# header, like the definitions of `tracks`, `found_tracks`, `url` and `processed`, is outside this view.
# Collects tracks whose mangled name is longer than 4 characters and occurs in `page`, then computes
# found/total as `ratio`.
# The required ratio is 0.7 when rg_name is longer than 4 characters and 1.0 otherwise.
# `auto` is true when ratio exceeds 0.75 and rg_sec_types is None or contains neither
# 'Compilation' nor 'Soundtrack'.
# After a 5 second sleep the URL is submitted through mb.add_url("release_group", rg_gid, 89, ...).
# Finally bot_wp_rg_link gets a new (gid, lang) row when `processed` is None, otherwise its
# `processed` timestamp is updated.
# NOTE(review): this code is collapsed onto one physical line; newlines/indentation must be restored before it can run.
mangled_track = mangle_name(track) if len(mangled_track) > 4 and mangled_track in page: found_tracks.append(track) ratio = len(found_tracks) * 1.0 / len(tracks) out(' * ratio: %s, has tracks: %s, found tracks: %s' % (ratio, len(tracks), len(found_tracks))) min_ratio = 0.7 if len(rg_name) > 4 else 1.0 if ratio < min_ratio: colored_out(bcolors.WARNING, ' => ratio too low (min = %s)' % min_ratio) continue auto = ratio > 0.75 and (rg_sec_types is None or ('Compilation' not in rg_sec_types and 'Soundtrack' not in rg_sec_types)) text = 'Matched based on the name. The page mentions artist "%s" and %s.' % ( ac_name, join_names('track', found_tracks), ) colored_out(bcolors.OKGREEN, ' * linking to %s' % (url, )) out(' * edit note: %s' % (text, )) time.sleep(5) mb.add_url("release_group", rg_gid, 89, url, text, auto=auto) break if processed is None: db.execute("INSERT INTO bot_wp_rg_link (gid, lang) VALUES (%s, %s)", (rg_gid, wp_lang)) else: db.execute( "UPDATE bot_wp_rg_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (rg_gid, wp_lang))
# Fragment of the Korean-Wikipedia artist matching loop; the enclosing loop/function and the
# definition of `page_title` are outside this view.
# Loads the artist's album names (`id` is passed twice as query parameters), drops albums whose
# mangled name contains the mangled artist name, skips the candidate when none remain, and collects
# albums whose mangled name is longer than 4 characters and occurs in `page`.
# The candidate is skipped when fewer than two albums were found; `min_ratio` is assigned but the
# ratio check that would use it is commented out.
# Otherwise the ko.wikipedia.org URL is submitted through mb.add_url("artist", gid, 179, ...) and the loop is left.
# The trailing statements insert `gid` into bot_wp_artist_ko and print `processed, skipped`
# (presumably outside the page loop — TODO confirm).
# NOTE(review): this code is collapsed onto one physical line, so everything after the inline
# "#if ratio < min_ratio:" is currently read as a comment; newlines/indentation must be restored.
found_albums = [] albums = set([r[0] for r in db.execute(query_artist_albums, (id, id))]) albums_to_ignore = set() for album in albums: if mangle_name(name) in mangle_name(album): albums_to_ignore.add(album) albums -= albums_to_ignore if not albums: continue for album in albums: mangled_album = mangle_name(album) if len(mangled_album) > 4 and mangled_album in page: found_albums.append(album) ratio = len(found_albums) * 1.0 / len(albums) print ' * ratio: %s, has albums: %s, found albums: %s' % (ratio, len(albums), len(found_albums)) min_ratio = 0.2 if len(found_albums) < 2: continue #if ratio < min_ratio: # continue url = 'http://ko.wikipedia.org/wiki/%s' % (quote_page_title(page_title),) text = 'Matched based on the name. The page mentions %s.' % (join_names('album', found_albums),) print ' * linking to %s' % (url,) print ' * edit note: %s' % (text,) mb.add_url("artist", gid, 179, url, text) break db.execute("INSERT INTO bot_wp_artist_ko (gid) VALUES (%s)", (gid,)) print processed, skipped
# Fragment of the record-label matching loop; it starts inside a condition whose header is outside this view.
# Skips the candidate when 'recordlabels' does not occur in `page`, loads the label's artist names
# (minus `name` itself), skips when none remain, and collects the artists whose mangled name is
# longer than 5 characters and occurs in `page`.
# With fewer than two hits the candidate is skipped; otherwise it sleeps 60 seconds, submits the
# English Wikipedia URL (built with quote_page_title, plain http scheme) through
# mb.add_url("label", gid, 216, ...) and breaks out of the loop.
# The trailing db.execute inserts `gid` into bot_wp_label (presumably after the page loop — TODO confirm).
# NOTE(review): this code is collapsed onto one physical line; newlines/indentation must be restored before it can run.
print ' * disambiguation or album page, skipping' continue if 'recordlabels' not in page: print ' * not a record label page, skipping' continue page_title = pages[0]['title'] print ' * trying article "%s"' % (page_title,) artists = set([r[0] for r in db.execute(query_label_artists, (id,))]) if name in artists: artists.remove(name) if not artists: continue found_artists = [] for artist in artists: mangled_artist = mangle_name(artist) if len(mangled_artist) > 5 and mangled_artist in page: found_artists.append(artist) ratio = len(found_artists) * 1.0 / len(artists) print ' * ratio: %s, has artists: %s, found artists: %s' % (ratio, len(artists), len(found_artists)) if len(found_artists) < 2: continue url = 'http://en.wikipedia.org/wiki/%s' % (quote_page_title(page_title),) text = 'Matched based on the name. The page mentions %s.' % (join_names('artist', found_artists),) print ' * linking to %s' % (url,) print ' * edit note: %s' % (text,) time.sleep(60) mb.add_url("label", gid, 216, url, text) break db.execute("INSERT INTO bot_wp_label (gid) VALUES (%s)", (gid,))
# Fragment of the Korean-Wikipedia artist matching loop; the enclosing loop/function and the
# definitions of `albums`, `found_albums` and `page_title` are outside this view.
# Drops albums whose mangled name contains the mangled artist name, skips the candidate when none
# remain, and collects albums whose mangled name is longer than 4 characters and occurs in `page`.
# The candidate is skipped when fewer than two albums were found; `min_ratio` is assigned but the
# ratio check that would use it is commented out.
# Otherwise the ko.wikipedia.org URL is submitted through mb.add_url("artist", gid, 179, ...) and the loop is left.
# The trailing statements insert `gid` into bot_wp_artist_ko and print `processed, skipped`
# (presumably outside the page loop — TODO confirm).
# NOTE(review): this code is collapsed onto one physical line, so everything after the inline
# "#if ratio < min_ratio:" is currently read as a comment; newlines/indentation must be restored.
albums_to_ignore = set() for album in albums: if mangle_name(name) in mangle_name(album): albums_to_ignore.add(album) albums -= albums_to_ignore if not albums: continue for album in albums: mangled_album = mangle_name(album) if len(mangled_album) > 4 and mangled_album in page: found_albums.append(album) ratio = len(found_albums) * 1.0 / len(albums) print ' * ratio: %s, has albums: %s, found albums: %s' % ( ratio, len(albums), len(found_albums)) min_ratio = 0.2 if len(found_albums) < 2: continue #if ratio < min_ratio: # continue url = 'https://ko.wikipedia.org/wiki/%s' % ( quote_page_title(page_title), ) text = 'Matched based on the name. The page mentions %s.' % ( join_names('album', found_albums), ) print ' * linking to %s' % (url, ) print ' * edit note: %s' % (text, ) mb.add_url("artist", gid, 179, url, text) break db.execute("INSERT INTO bot_wp_artist_ko (gid) VALUES (%s)", (gid, )) print processed, skipped
# Fragment of the release-group / Wikidata matching loop; it starts inside a condition whose header,
# like the definitions of `ratio`, `found_tracks`, `page_title`, `wikipage` and `processed`, is outside this view.
# `auto` is true when ratio exceeds 0.75 and rg_sec_types is None or contains neither
# 'Compilation' nor 'Soundtrack'.
# Builds the Wikipedia URL for the edit note and the Wikidata URL from the upper-cased
# wikipage.wikidata_id, sleeps 5 seconds, then submits the Wikidata URL through
# mb.add_url("release_group", rg_gid, 353, ...).
# Finally bot_wp_rg_link gets a new (gid, lang) row when `processed` is None, otherwise its
# `processed` timestamp is updated.
# NOTE(review): this code is collapsed onto one physical line; newlines/indentation must be restored before it can run.
colored_out(bcolors.WARNING, ' => ratio too low (min = %s)' % min_ratio) continue auto = ratio > 0.75 and (rg_sec_types is None or ('Compilation' not in rg_sec_types and 'Soundtrack' not in rg_sec_types)) wp_url = 'https://%s.wikipedia.org/wiki/%s' % ( wp_lang, quote_page_title(page_title), ) wd_url = 'https://www.wikidata.org/wiki/%s' % wikipage.wikidata_id.upper( ) text = 'Wikidata identifier found from matching Wikipedia page %s. The page mentions artist "%s" and %s.' % ( wp_url, ac_name, join_names('track', found_tracks), ) colored_out(bcolors.OKGREEN, ' * linking to %s' % (wd_url, )) out(' * edit note: %s' % (text, )) time.sleep(5) mb.add_url("release_group", rg_gid, 353, wd_url, text, auto=auto) break if processed is None: db.execute("INSERT INTO bot_wp_rg_link (gid, lang) VALUES (%s, %s)", (rg_gid, wp_lang)) else: db.execute( "UPDATE bot_wp_rg_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (rg_gid, wp_lang))
# Fragment of the release-group / Wikidata matching loop; it starts inside a per-track loop whose
# header, like the definitions of `tracks`, `tracks_to_ignore`, `found_tracks`, `page_title`,
# `wikipage` and `processed`, is outside this view.
# Ignores tracks whose mangled name has 4 characters or fewer, or contains the mangled release-group
# name; the candidate is skipped when fewer than 5 tracks remain.
# Collects tracks whose mangled name occurs in `page` and computes found/total as `ratio`.
# The required ratio is 0.7 when rg_name is longer than 4 characters and 1.0 otherwise.
# `auto` is true when ratio exceeds 0.75 and rg_sec_types is None or contains neither
# 'Compilation' nor 'Soundtrack'.
# Builds the Wikipedia URL for the edit note and the Wikidata URL from the upper-cased
# wikipage.wikidata_id, sleeps 5 seconds, then submits the Wikidata URL through
# mb.add_url("release_group", rg_gid, 353, ...).
# Finally bot_wp_rg_link gets a new (gid, lang) row when `processed` is None, otherwise its
# `processed` timestamp is updated.
# NOTE(review): this code is collapsed onto one physical line; newlines/indentation must be restored before it can run.
mangled_track = mangle_name(track) if len(mangled_track) <= 4 or mangle_name(rg_name) in mangle_name(track): tracks_to_ignore.add(track) tracks -= tracks_to_ignore if len(tracks) < 5: continue for track in tracks: mangled_track = mangle_name(track) if len(mangled_track) > 4 and mangled_track in page: found_tracks.append(track) ratio = len(found_tracks) * 1.0 / len(tracks) out(' * ratio: %s, has tracks: %s, found tracks: %s' % (ratio, len(tracks), len(found_tracks))) min_ratio = 0.7 if len(rg_name) > 4 else 1.0 if ratio < min_ratio: colored_out(bcolors.WARNING, ' => ratio too low (min = %s)' % min_ratio) continue auto = ratio > 0.75 and (rg_sec_types is None or ('Compilation' not in rg_sec_types and 'Soundtrack' not in rg_sec_types)) wp_url = 'https://%s.wikipedia.org/wiki/%s' % (wp_lang, quote_page_title(page_title),) wd_url = 'https://www.wikidata.org/wiki/%s' % wikipage.wikidata_id.upper() text = 'Wikidata identifier found from matching Wikipedia page %s. The page mentions artist "%s" and %s.' % (wp_url, ac_name, join_names('track', found_tracks),) colored_out(bcolors.OKGREEN, ' * linking to %s' % (wd_url,)) out(' * edit note: %s' % (text,)) time.sleep(5) mb.add_url("release_group", rg_gid, 353, wd_url, text, auto=auto) break if processed is None: db.execute("INSERT INTO bot_wp_rg_link (gid, lang) VALUES (%s, %s)", (rg_gid, wp_lang)) else: db.execute("UPDATE bot_wp_rg_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (rg_gid, wp_lang))
def determine_country_from_text(page):
    """Collect countries linked from the page abstract.

    ``find_countries_in_text`` is handed an empty set and an empty list to
    fill from ``page.abstract``; no language argument is passed.

    Returns ``(countries, reason)``; ``reason`` is always built, even when
    nothing was found.
    """
    found = set()
    links = []
    find_countries_in_text(found, links, page.abstract)
    return found, 'The first paragraph links to %s.' % join_names('', links)