def submit_cover_art(release, url, types):
    """Fetch the image at *url* and submit it as cover art for *release*.

    types -- list of cover art type names to attach to the edit.
    Discogs-hosted images are fetched through the authenticated OAuth
    client (Discogs rejects anonymous requests); anything else is a
    plain HTTP GET.  URLs recorded as already processed are skipped.
    """
    if already_processed(release, url):
        colored_out(bcolors.NONE, " * skipping already submitted image '%s'" % (url, ))
    else:
        colored_out(
            bcolors.OKGREEN,
            " * Adding " + ",".join(types) + (" " if len(types) > 0 else "") + "cover art '%s'" % (url, ))
        if 'discogs' in url:
            headers = {
                'user-agent': 'MusicBrainzBot/0.1 +https://github.com/murdos/musicbrainz-bot'
            }
            resp, content = discogs_oauth_client.request(url, 'GET', headers=headers)
        else:
            content = urllib2.urlopen(url).read()
        # mb.add_cover_art() uploads from a file path, so spool the
        # downloaded bytes to a named temporary file first.
        image_file = tempfile.NamedTemporaryFile(delete=False)
        image_file.write(content)
        image_file.close()
        try:
            im = Image.open(image_file.name)
            edit_note = "'''Dimension''': %sx%s\n'''Source''': %s" % (
                im.size[0], im.size[1], url)
            time.sleep(5)
            mb.add_cover_art(release, image_file.name, types, None, u'',
                             edit_note, False)
        finally:
            # BUG FIX: remove the temp file even when Image.open() or the
            # upload raises, instead of leaking it on every failure.
            os.remove(image_file.name)
        save_processed(release, url)
def main(ENTITY_TYPE):
    """Attach WikiData URL relationships to MusicBrainz entities of ENTITY_TYPE.

    Finds entities that have a Wikipedia URL relationship but no WikiData
    relationship yet, fetches each Wikipedia page, reads its WikiData
    identifier and submits the matching wikidata.org URL as a new
    relationship.  Progress is tracked in bot_wp_wikidata_links so the
    same (gid, lang) pair is not fetched repeatedly.
    """
    entity_type_table = ENTITY_TYPE.replace('-', '_')
    # For works the URL relationship table has the url entity first
    # (l_url_work), so the entity0/entity1 endpoints are swapped.
    url_relationship_table = 'l_%s_url' % entity_type_table if ENTITY_TYPE != 'work' else 'l_url_%s' % entity_type_table
    main_entity_entity_point = "entity0" if ENTITY_TYPE != 'work' else "entity1"
    url_entity_point = "entity1" if ENTITY_TYPE != 'work' else "entity0"
    # Batch of up to 500 entities with a Wikipedia URL that is linked to
    # this entity only, no pending edits, and no WikiData relationship.
    # wp_lang is the language subdomain of the Wikipedia URL.
    query = """ WITH entities_wo_wikidata AS ( SELECT DISTINCT e.id AS entity_id, e.gid AS entity_gid, u.url AS wp_url, substring(u.url from '//(([a-z]|-)+)\\.') as wp_lang FROM """ + entity_type_table + """ e JOIN """ + url_relationship_table + """ l ON l.""" + main_entity_entity_point + """ = e.id AND l.link IN (SELECT id FROM link WHERE link_type = """ + str(WIKIPEDIA_RELATIONSHIP_TYPES[ENTITY_TYPE]) + """) JOIN url u ON u.id = l.""" + url_entity_point + """ AND u.url LIKE 'http://%%.wikipedia.org/wiki/%%' WHERE /* No existing WikiData relationship for this entity */ NOT EXISTS (SELECT 1 FROM """ + url_relationship_table + """ ol WHERE ol.""" + main_entity_entity_point + """ = e.id AND ol.link IN (SELECT id FROM link WHERE link_type = """ + str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE]) + """)) /* WP link should only be linked to this entity */ AND NOT EXISTS (SELECT 1 FROM """ + url_relationship_table + """ ol WHERE ol.""" + url_entity_point + """ = u.id AND ol.""" + main_entity_entity_point + """ <> e.id) AND l.edits_pending = 0 ) SELECT e.id, e.gid, e.name, ewf.wp_url, b.processed FROM entities_wo_wikidata ewf JOIN """ + entity_type_table + """ e ON ewf.entity_id = e.id LEFT JOIN bot_wp_wikidata_links b ON e.gid = b.gid AND b.lang = ewf.wp_lang ORDER BY b.processed NULLS FIRST, e.id LIMIT 500 """
    seen = set()      # every entity examined in this run
    matched = set()   # entities for which a WikiData URL was submitted
    for entity in db.execute(query):
        # An entity can appear once per Wikipedia URL; edit it only once.
        if entity['gid'] in matched:
            continue
        colored_out(bcolors.OKBLUE, 'Looking up entity "%s" http://musicbrainz.org/%s/%s' % (entity['name'], ENTITY_TYPE, entity['gid']))
        out(' * wiki:', entity['wp_url'])
        page = WikiPage.fetch(entity['wp_url'], False)
        if page.wikidata_id:
            wikidata_url = 'http://www.wikidata.org/wiki/%s' % page.wikidata_id.upper()
            edit_note = 'From %s' % (entity['wp_url'],)
            colored_out(bcolors.OKGREEN, ' * found WikiData identifier:', wikidata_url)
            time.sleep(1)  # throttle edit submissions
            out(' * edit note:', edit_note.replace('\n', ' '))
            mb.add_url(ENTITY_TYPE.replace('-', '_'), entity['gid'], str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE]), wikidata_url, edit_note, True)
            matched.add(entity['gid'])
        # Record (or refresh) the processed timestamp for this gid/lang.
        if entity['processed'] is None and entity['gid'] not in seen:
            db.execute("INSERT INTO bot_wp_wikidata_links (gid, lang) VALUES (%s, %s)", (entity['gid'], page.lang))
        else:
            db.execute("UPDATE bot_wp_wikidata_links SET processed = now() WHERE (gid, lang) = (%s, %s)", (entity['gid'], page.lang))
        seen.add(entity['gid'])
    stats['seen'][ENTITY_TYPE] = len(seen)
    stats['matched'][ENTITY_TYPE] = len(matched)
def submit_cover_art(release, url, types):
    """Submit the image at *url* as cover art for *release*, by URL.

    Unlike the file-upload variants of this bot, the URL itself is handed
    to mb.add_cover_art(); the image is only downloaded here to measure
    its dimensions for the edit note.  Already submitted URLs are skipped.
    """
    if already_processed(release, url):
        colored_out(bcolors.NONE, " * skipping already submitted image '%s'" % (url,))
    else:
        colored_out(bcolors.OKGREEN, " * Adding " + ",".join(types) + (" " if len(types) > 0 else "") + "cover art '%s'" % (url,))
        img_file = urllib2.urlopen(url)
        try:
            im = Image.open(StringIO(img_file.read()))
        finally:
            # BUG FIX: close the HTTP response instead of leaking the
            # socket; the original never closed it.
            img_file.close()
        edit_note = "'''Dimension''': %sx%s\n'''Source''': %s" % (im.size[0], im.size[1], url)
        time.sleep(5)
        mb.add_cover_art(release, url, types, None, u'', edit_note, False)
        save_processed(release, url)
def submit_cover_art(release, url, types):
    """Download the image at *url* and upload it as cover art for *release*.

    types -- list of cover art type names to attach to the edit.
    Already submitted URLs (tracked via already_processed/save_processed)
    are skipped.
    """
    if already_processed(release, url):
        colored_out(bcolors.NONE, " * skipping already submitted image '%s'" % (url,))
    else:
        colored_out(bcolors.OKGREEN, " * Adding " + ",".join(types) + (" " if len(types) > 0 else "") + "cover art '%s'" % (url,))
        content = urllib2.urlopen(url).read()
        # mb.add_cover_art() uploads from a file path, so spool the bytes
        # to a named temporary file first (delete=False: we remove it
        # ourselves after the upload).
        image_file = tempfile.NamedTemporaryFile(delete=False)
        image_file.write(content)
        image_file.close()
        # Open the image only to record its dimensions in the edit note.
        im = Image.open(image_file.name)
        edit_note = "'''Dimension''': %sx%s\n'''Source''': %s" % (im.size[0], im.size[1], url)
        time.sleep(5)  # throttle edit submissions
        mb.add_cover_art(release, image_file.name, types, None, u'', edit_note, False)
        os.remove(image_file.name)
        save_processed(release, url)
def submit_cover_art(release, url, types):
    """Upload the image at *url* as cover art for *release*.

    ``types`` lists the cover-art type names to attach.  Discogs URLs go
    through the OAuth client; everything else is a plain GET.  URLs that
    were already handled are skipped.
    """
    if already_processed(release, url):
        colored_out(bcolors.NONE, " * skipping already submitted image '%s'" % (url,))
        return
    type_list = ",".join(types)
    separator = " " if len(types) > 0 else ""
    colored_out(bcolors.OKGREEN, " * Adding " + type_list + separator + "cover art '%s'" % (url,))
    if 'discogs' in url:
        request_headers = {'user-agent': 'MusicBrainzBot/0.1 +https://github.com/murdos/musicbrainz-bot'}
        _resp, image_bytes = discogs_oauth_client.request(url, 'GET', headers=request_headers)
    else:
        image_bytes = urllib2.urlopen(url).read()
    # Spool to a named temp file: the uploader takes a file path.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.write(image_bytes)
    tmp.close()
    img = Image.open(tmp.name)
    note = "'''Dimension''': %sx%s\n'''Source''': %s" % (img.size[0], img.size[1], url)
    time.sleep(5)
    mb.add_cover_art(release, tmp.name, types, None, u'', note, False)
    os.remove(tmp.name)
    save_processed(release, url)
def submit_cover_art(release, url, types):
    """Download the image at *url* and submit it as cover art for *release*.

    ``types`` is the list of cover-art type names for the edit; URLs that
    were already submitted are skipped.
    """
    if already_processed(release, url):
        colored_out(bcolors.NONE, " * skipping already submitted image '%s'" % (url, ))
        return
    prefix = " * Adding " + ",".join(types) + (" " if len(types) > 0 else "")
    colored_out(bcolors.OKGREEN, prefix + "cover art '%s'" % (url, ))
    payload = urllib2.urlopen(url).read()
    # The uploader works from a file path, so write the bytes out first.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.write(payload)
    tmp.close()
    picture = Image.open(tmp.name)
    width, height = picture.size
    note = "'''Dimension''': %sx%s\n'''Source''': %s" % (width, height, url)
    time.sleep(5)
    mb.add_cover_art(release, tmp.name, types, None, u'', note, False)
    os.remove(tmp.name)
    save_processed(release, url)
def determine_country(page):
    """Aggregate country guesses for *page* from infobox, text and categories.

    Returns (country, reasons) when the sources agree on exactly one
    country and at least the category source contributed; otherwise
    (None, []).
    """
    found = set()
    reasons = []
    # The two simple sources return (countries, reason).
    for source in (determine_country_from_infobox, determine_country_from_text):
        candidates, why = source(page)
        if candidates:
            found.update(candidates)
            reasons.append(why)
    # The category source also returns a count, which is unused here.
    candidates, why, _count = determine_country_from_categories(page)
    backed_by_categories = bool(candidates)
    if candidates:
        found.update(candidates)
        reasons.append(why)
    # Category support is mandatory for a confident guess.
    if len(reasons) < 1 or not found or not backed_by_categories:
        colored_out(bcolors.WARNING, ' * not enough sources for countries', found, reasons)
        return None, []
    if len(found) > 1:
        colored_out(bcolors.FAIL, ' * conflicting countries', found, reasons)
        return None, []
    winner = list(found)[0]
    colored_out(bcolors.OKGREEN, ' * new country: ', winner)
    return winner, reasons
def determine_gender(page):
    """Aggregate gender guesses for *page* from first name, categories and text.

    Returns (gender, reasons) when the sources agree on exactly one
    gender, otherwise (None, []).
    """
    found = set()
    reasons = []
    sources = (
        determine_gender_from_firstname,
        determine_gender_from_categories,
        determine_gender_from_text,
    )
    for source in sources:
        candidates, why = source(page)
        if candidates:
            found.update(candidates)
            reasons.append(why)
    if not reasons:
        colored_out(bcolors.WARNING, ' * not enough sources for genders')
        return None, []
    if len(found) > 1:
        colored_out(bcolors.FAIL, ' * conflicting genders', found, reasons)
        return None, []
    winner = list(found)[0]
    colored_out(bcolors.OKGREEN, ' * new gender:', winner)
    return winner, reasons
def discogs_get_release_packaging(discogs_release):
    """Derive a MusicBrainz packaging name from a Discogs release.

    Scans the free-text annotation of each Discogs format entry and
    returns the first matching MB packaging name ("cardboard/paper
    sleeve", "digipak", "keep case", "slim jewel case", "jewel case"),
    or None when nothing recognizable is found.
    """
    for fmt in discogs_release.data['formats']:  # renamed: don't shadow builtin `format`
        if 'text' not in fmt:
            # print() form is valid in both Python 2 and 3.
            print('No text found for format %s' % fmt['name'])
            continue
        # Normalize so that e.g. "Digi-Pak" and "digipak" compare equal.
        freetext = fmt['text'].lower().replace('-', '').replace(' ', '')
        colored_out(bcolors.HEADER, ' * Discogs format text: %s' % freetext)
        if 'cardboard' in freetext or 'paper' in freetext:
            return "cardboard/paper sleeve"
        elif 'digipak' in freetext or 'digipack' in freetext:
            return "digipak"
        elif 'keepcase' in freetext:
            return "keep case"
        elif 'jewel' in freetext:
            if 'slim' in freetext:
                return "slim jewel case"
            else:
                return "jewel case"
    return None
def discogs_get_release_packaging(discogs_release):
    """Map the free-text format notes of a Discogs release to a
    MusicBrainz packaging name.

    Returns the first recognized packaging name, or None when no format
    entry carries a recognizable annotation.
    """
    #if len(discogs_release.data['formats']) > 1:
    #    return None
    # `fmt` instead of `format`: avoid shadowing the builtin.
    for fmt in discogs_release.data['formats']:
        if 'text' not in fmt:
            # print() works under both Python 2 and 3.
            print('No text found for format %s' % fmt['name'])
            continue
        # Lowercase and strip dashes/spaces before substring matching.
        freetext = fmt['text'].lower().replace('-', '').replace(' ', '')
        colored_out(bcolors.HEADER, ' * Discogs format text: %s' % freetext)
        # Trailing semicolons removed: they are noise in Python.
        if 'cardboard' in freetext or 'paper' in freetext:
            return "cardboard/paper sleeve"
        elif 'digipak' in freetext or 'digipack' in freetext:
            return "digipak"
        elif 'keepcase' in freetext:
            return "keep case"
        elif 'jewel' in freetext:
            if 'slim' in freetext:
                return "slim jewel case"
            else:
                return "jewel case"
    return None
def determine_type(page):
    """Determine the artist type (person/group/...) suggested by *page*.

    Returns (type, reasons) when exactly one type is found, otherwise
    (None, []).
    """
    all_types = set()
    all_reasons = []
    types_found, reason = determine_type_from_page(page)
    if types_found:
        all_types.update(types_found)
        all_reasons.append(reason)
    if not all_reasons:
        colored_out(bcolors.WARNING, ' * not enough sources for types')
        return None, []
    if len(all_types) > 1:
        colored_out(bcolors.FAIL, ' * conflicting types', all_types, all_reasons)
        return None, []
    # Renamed from `type` to avoid shadowing the builtin.
    artist_type = list(all_types)[0]
    colored_out(bcolors.OKGREEN, ' * new type:', artist_type)
    return artist_type, all_reasons
releases_wo_7inch AS ( SELECT r.id, u.url, m.format FROM release r JOIN medium m ON m.release = r.id JOIN l_release_url l ON l.entity0 = r.id AND l.link IN (SELECT id FROM link WHERE link_type = 78) JOIN url u ON u.id = l.entity1 WHERE u.url LIKE 'http://www.encyclopedisque.fr/images/%%' AND (m.format IS NULL OR m.format = 7) AND NOT EXISTS (SELECT 1 FROM l_release_url WHERE l_release_url.entity1 = u.id AND l_release_url.entity0 <> r.id) ) SELECT r.id, r.gid, r.name, ta.url, ta.format, ac.name FROM releases_wo_7inch ta JOIN s_release r ON ta.id = r.id JOIN s_artist_credit ac ON r.artist_credit=ac.id LEFT JOIN bot_encyclopedisque_medium_format b ON r.gid = b.gid WHERE b.gid IS NULL ORDER BY r.artist_credit, r.id LIMIT 100 """ for id, gid, name, url, format, ac_name in db.execute(query): colored_out(bcolors.OKBLUE, 'Looking up release "%s" by "%s" http://musicbrainz.org/release/%s' % (name, ac_name, gid)) edit_note = 'Setting format to 7" based on attached link to Encyclopedisque (%s)' % url out(' * edit note: %s' % (edit_note,)) mb.set_release_medium_format(gid, format, 29, edit_note) time.sleep(5) db.execute("INSERT INTO bot_encyclopedisque_medium_format (gid) VALUES (%s)", (gid,))
DISCOGS_MB_FORMATS_MAPPING = { "Vinyl": 7, '12"': 31, '10"': 30, '7"': 29, "CD": 1, "CDr": 33, "Cassette": 8, "DigitalMedia": 12, } for medium in db.execute(query): colored_out( bcolors.OKBLUE, 'Looking up medium #%s of release "%s" by "%s" http://musicbrainz.org/release/%s' % (medium["position"], medium["name"], medium["ac_name"], medium["gid"]), ) m = re.match(r"http://www.discogs.com/release/([0-9]+)", medium["discogs_url"]) if m: discogs_release = discogs.Release(int(m.group(1))) discogs_format = discogs_get_medium_format(discogs_release, medium["position"]) if discogs_format: colored_out(bcolors.HEADER, " * using %s, found format: %s" % (medium["discogs_url"], discogs_format)) edit_note = "Setting medium format from attached Discogs link (%s)" % medium["discogs_url"] out(" * edit note: %s" % (edit_note,)) mb.set_release_medium_format( medium["gid"], medium["position"],
"Italian": 195, "Japanese": 198, "Multiple languages": 284, "None": 486, "Norwegian": 309, "Polish": 338, "Portuguese": 340, "Russian": 353, "Spanish": 393, "Swedish": 403, "Turkish": 433, } for work in db.execute(query): colored_out( bcolors.OKBLUE, 'Looking up work "%s" http://musicbrainz.org/work/%s' % (work['name'], work['gid'])) m = re.match(r'http://www.secondhandsongs.com/work/([0-9]+)', work['shs_url']) if m: shs_work = shs.lookup_work(int(m.group(1))) else: continue if 'language' in shs_work: work = dict(work) shs_lang = shs_work['language'] if shs_lang not in SHS_MB_LANG_MAPPING: colored_out(bcolors.FAIL, ' * No mapping defined for language '
return '10"' return None DISCOGS_MB_FORMATS_MAPPING = { 'Vinyl': 7, '12"': 31, '10"': 30, '7"': 29, 'CD': 1, 'CDr': 33, 'Cassette': 8, 'DigitalMedia': 12 } for medium in db.execute(query): colored_out(bcolors.OKBLUE, 'Looking up medium #%s of release "%s" by "%s" http://musicbrainz.org/release/%s' % (medium['position'], medium['name'], medium['ac_name'], medium['gid'])) m = re.match(r'http://www.discogs.com/release/([0-9]+)', medium['discogs_url']) if m: discogs_release = discogs.release(int(m.group(1))) discogs_format = discogs_get_medium_format(discogs_release, medium['position']) if discogs_format: colored_out(bcolors.HEADER, ' * using %s, found format: %s' % (medium['discogs_url'], discogs_format)) edit_note = 'Setting medium format from attached Discogs link (%s)' % medium['discogs_url'] out(' * edit note: %s' % (edit_note,)) mb.set_release_medium_format(medium['gid'], medium['position'], medium['format'], DISCOGS_MB_FORMATS_MAPPING[discogs_format], edit_note, True) else: colored_out(bcolors.FAIL, ' * using %s, no matching format has been found' % (medium['discogs_url'],)) if medium['processed'] is None:
def main(verbose=False):
    """Link MusicBrainz releases to Amazon via barcode lookup.

    For each release without an ASIN, look the barcode up on the Amazon
    store matching the release country, validate the hit (catalog number
    for JP, name similarity otherwise), then submit the Amazon URL as a
    relationship with a detailed edit note.  Skip lists are persisted in
    the bot_asin_* tables.

    NOTE(review): this chunk was recovered from whitespace-mangled
    source; block nesting was reconstructed and should be verified
    against the original file.
    """
    edits_left = mb.edits_left()
    releases = [(r, gid, barcode, name, ac, country, year, month, day) for r, gid, barcode, name, ac, country, year, month, day in db.execute(query_releases_without_asin)]
    count = len(releases)
    for i, (r, gid, barcode, name, ac, country, year, month, day) in enumerate(releases):
        if edits_left <= 0:
            break
        # Skip anything already classified in a previous run.
        if gid in asin_missing or gid in asin_problematic or gid in asin_nocover or gid in asin_catmismatch:
            continue
        if not barcode_type(barcode):
            db.execute("INSERT INTO bot_asin_problematic (gid) VALUES (%s)", gid)
            continue
        # Need an Amazon store for the release country.
        if country not in store_map_rev:
            continue
        # Ambiguous barcode shared by several releases: defer.
        if barcode.lstrip('0') in barcodes_hist and barcodes_hist[barcode.lstrip('0')] > 1:
            if verbose:
                colored_out(bcolors.WARNING, ' two releases with same barcode, skip for now')
            db.execute("INSERT INTO bot_asin_problematic (gid) VALUES (%s)", gid)
            continue
        if verbose:
            colored_out(bcolors.OKBLUE, u'%d/%d - %.2f%% - %s https://musicbrainz.org/release/%s %s %s' % (i + 1, count, (i + 1) * 100.0 / count, name, gid, barcode, country))
        try:
            # Missing date parts default to 1 so a partial MB date still
            # yields a comparable datetime.
            mb_date = datetime.datetime(year if year else 1, month if month else 1, day if day else 1)
            item = amazon_get_asin(barcode, country, mb_date)
        except (urllib2.HTTPError, urllib2.URLError, socket.timeout) as e:
            out(e)
            continue
        if item is None:
            if verbose:
                out(' * not found, continue')
            db.execute("INSERT INTO bot_asin_missing (gid) VALUES (%s)", gid)
            continue
        url = amazon_url_cleanup(str(item.DetailPageURL), str(item.ASIN))
        if verbose:
            out(' * barcode matches %s' % url)
        if item.ASIN in asins:
            if verbose:
                out(' * skip, ASIN already in DB')
            db.execute("INSERT INTO bot_asin_problematic (gid) VALUES (%s)", gid)
            continue
        # Only link items that have cover images.
        if not 'LargeImage' in item.__dict__:
            if verbose:
                out(' * skip, has no image')
            db.execute("INSERT INTO bot_asin_nocover (gid) VALUES (%s)", gid)
            continue
        attrs = item.ItemAttributes
        # Imports are usually different editions; skip them.
        if 'Format' in attrs.__dict__ and 'Import' in [f for f in attrs.Format]:
            if verbose:
                out(' * skip, is marked as Import')
            db.execute("INSERT INTO bot_asin_problematic (gid) VALUES (%s)", gid)
            continue
        amazon_name = unicode(attrs.Title)
        # Prefer the Seikodo product code (JP), fall back to MPN.
        catnr = None
        if 'SeikodoProductCode' in attrs.__dict__:
            catnr = unicode(attrs.SeikodoProductCode)
        elif 'MPN' in attrs.__dict__:
            catnr = unicode(attrs.MPN)
        matched = False
        if catnr:
            for mb_catnr in release_catnrs(r):
                if cat_compare(mb_catnr, catnr, country):
                    matched = True
                    break
            # For JP a catalog-number mismatch is a hard stop.
            if not matched and country == 'JP':
                if verbose:
                    colored_out(bcolors.FAIL, u' * CAT NR MISMATCH, ARGH!')
                db.execute("INSERT INTO bot_asin_catmismatch (gid) VALUES (%s)", gid)
                continue
        if not matched:
            catnr = None
        # Without a catalog-number match, require name similarity.
        if not are_similar(name, amazon_name):
            if verbose:
                colored_out(bcolors.FAIL, u' * Similarity too small: %s <-> %s' % (name, amazon_name))
            db.execute("INSERT INTO bot_asin_problematic (gid) VALUES (%s)", gid)
            continue
        if (gid, url) in asin_set:
            if verbose:
                colored_out(bcolors.WARNING, u' * already linked earlier (probably got removed by some editor!)')
            continue
        # Build a human-readable edit note describing the evidence.
        text = u'%s lookup for “%s” (country: %s), ' % (barcode_type(barcode), barcode, country)
        if catnr:
            text += u'matching catalog numer “%s”, release name is “%s”' % (catnr, attrs.Title)
        else:
            text += u'has similar name “%s”' % attrs.Title
        if 'Artist' in attrs.__dict__:
            text += u' by “%s”' % attrs.Artist
        text += u'.\nAmazon.com: '
        if 'Binding' in attrs.__dict__:
            if 'NumberOfDiscs' in attrs.__dict__:
                text += u'%s × ' % attrs.NumberOfDiscs
            text += u'%s' % attrs.Binding
        if not catnr and 'Label' in attrs.__dict__:
            text += u', %s' % attrs.Label
        if 'ReleaseDate' in attrs.__dict__:
            text += u', %s' % attrs.ReleaseDate
        text += u'\nMusicBrainz: '
        text += u'%s' % release_format(r)
        if not catnr:
            labels = release_labels(r)
            if labels:
                text += u', %s' % u' / '.join(labels)
        if year:
            text += u', %s' % date_format(year, month, day)
        # Append search links so reviewers can double-check the match.
        if catnr and country == 'JP':
            text += u'\nhttp://amazon.jp/s?field-keywords=%s\nhttp://amazon.jp/s?field-keywords=%s' % (catnr, barcode)
        else:
            text += u'\nhttp://amazon.%s/s?field-keywords=%s' % (amazon_url_tld(url), barcode)
        # make "Import" bold so it is easier recognizable
        re_bold_import = re.compile(ur'\b(imports?)\b', re.IGNORECASE)
        text = re_bold_import.sub(ur"'''\1'''", text)
        try:
            colored_out(bcolors.OKGREEN, u' * https://musicbrainz.org/release/%s -> %s' % (gid, url))
            mb.add_url('release', gid, 77, url, text)
            db.execute("INSERT INTO bot_asin_set (gid,url) VALUES (%s,%s)", (gid, url))
            asins.add(url)
            edits_left -= 1
        except (urllib2.HTTPError, urllib2.URLError, socket.timeout) as e:
            out(e)
from cStringIO import StringIO import time from editing import MusicBrainzWebdriverClient import socket from utils import out, colored_out, bcolors, monkeypatch_mechanize import config as cfg try: import config_caa as cfg_caa except ImportError: cfg_caa = cfg try: import discogs_client except ImportError as err: colored_out( bcolors.FAIL, "Error: Cannot use Discogs: %s\n" % err + "Run 'pip install discogs-client' or get discogs_client.py from\n" "https://github.com/discogs/discogs_client") sys.exit(1) # Optional modules try: import amazonproduct from amazonproduct.contrib.retry import RetryAPI except ImportError as err: colored_out(bcolors.WARNING, "Warning: Cannot use Amazon: %s" % err) amazonproduct = None try: from mbbot.source.spotify import SpotifyWebService spotify = SpotifyWebService() except ImportError as err:
(mb_release['gid'])) else: db.execute( "UPDATE bot_isrc_spotify SET processed = now() WHERE release = %s", (mb_release['gid'])) sws = SpotifyWebService() musicbrainzngs.auth(cfg.MB_USERNAME, cfg.MB_PASSWORD) for release in db.execute(query_releases_wo_isrcs): mb_release = dict(release) colored_out( bcolors.OKBLUE, 'Looking up release "%s" https://musicbrainz.org/release/%s' % (mb_release['name'], mb_release['gid'])) sp_albums = sws.search_albums('upc:%s' % mb_release['barcode']) if len(sp_albums) != 1: if len(sp_albums) == 0: out(' * no spotify release found') if len(sp_albums) > 1: out(' * multiple spotify releases found') save_processing(mb_release) continue sp_uri = sp_albums[0]['href'] sp_release = sws.lookup(sp_uri, detail=2) for track in sp_release['tracks']: for extid in track['external-ids']:
def main(ENTITY_TYPE): entity_type_table = ENTITY_TYPE.replace("-", "_") url_relationship_table = "l_%s_url" % entity_type_table if ENTITY_TYPE != "work" else "l_url_%s" % entity_type_table main_entity_entity_point = "entity0" if ENTITY_TYPE != "work" else "entity1" url_entity_point = "entity1" if ENTITY_TYPE != "work" else "entity0" query = ( """ WITH entities_wo_wikidata AS ( SELECT DISTINCT e.id AS entity_id, e.gid AS entity_gid, u.url AS wp_url FROM """ + entity_type_table + """ e JOIN """ + url_relationship_table + """ l ON l.""" + main_entity_entity_point + """ = e.id AND l.link IN (SELECT id FROM link WHERE link_type = """ + str(WIKIPEDIA_RELATIONSHIP_TYPES[ENTITY_TYPE]) + """) JOIN url u ON u.id = l.""" + url_entity_point + """ AND u.url LIKE 'http://%%.wikipedia.org/wiki/%%' AND substring(u.url from 8 for 2) IN ('en', 'fr') WHERE /* No existing WikiData relationship for this entity */ NOT EXISTS (SELECT 1 FROM """ + url_relationship_table + """ ol WHERE ol.""" + main_entity_entity_point + """ = e.id AND ol.link IN (SELECT id FROM link WHERE link_type = """ + str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE]) + """)) /* WP link should only be linked to this entity */ AND NOT EXISTS (SELECT 1 FROM """ + url_relationship_table + """ ol WHERE ol.""" + url_entity_point + """ = u.id AND ol.""" + main_entity_entity_point + """ <> e.id) AND l.edits_pending = 0 ) SELECT e.id, e.gid, e.name, ewf.wp_url, b.processed FROM entities_wo_wikidata ewf JOIN s_""" + entity_type_table + """ e ON ewf.entity_id = e.id LEFT JOIN bot_wp_wikidata_links b ON e.gid = b.gid AND b.lang = substring(ewf.wp_url from 8 for 2) ORDER BY b.processed NULLS FIRST, e.id LIMIT 250 """ ) seen = set() matched = set() for entity in db.execute(query): if entity["gid"] in matched: continue colored_out( bcolors.OKBLUE, 'Looking up entity "%s" http://musicbrainz.org/%s/%s' % (entity["name"], ENTITY_TYPE, entity["gid"]), ) out(" * wiki:", entity["wp_url"]) page = WikiPage.fetch(entity["wp_url"], False) if 
page.wikidata_id: wikidata_url = "http://www.wikidata.org/wiki/%s" % page.wikidata_id.upper() edit_note = "From %s" % (entity["wp_url"],) colored_out(bcolors.OKGREEN, " * found WikiData identifier:", wikidata_url) time.sleep(3) out(" * edit note:", edit_note.replace("\n", " ")) mb.add_url( ENTITY_TYPE.replace("-", "_"), entity["gid"], str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE]), wikidata_url, edit_note, True, ) matched.add(entity["gid"]) if entity["processed"] is None and entity["gid"] not in seen: db.execute("INSERT INTO bot_wp_wikidata_links (gid, lang) VALUES (%s, %s)", (entity["gid"], page.lang)) else: db.execute( "UPDATE bot_wp_wikidata_links SET processed = now() WHERE (gid, lang) = (%s, %s)", (entity["gid"], page.lang), ) seen.add(entity["gid"])
def main():
    """Fill in missing artist data (area, type, gender, dates) from Wikipedia.

    For each artist with a Wikipedia URL, fetch the page, run the
    determine_* heuristics for every field that is still empty, and
    submit a single artist edit with an explanatory note.  Processing is
    tracked in bot_wp_artist_data.

    NOTE(review): block nesting near the end of the loop was
    reconstructed from whitespace-mangled source — verify against the
    original file.
    """
    seen = set()
    for artist in db.execute(query):
        # The query can yield an artist once per matching URL; handle once.
        if artist['id'] in seen:
            continue
        seen.add(artist['id'])
        colored_out(bcolors.OKBLUE, 'Looking up artist "%s" http://musicbrainz.org/artist/%s' % (artist['name'], artist['gid']))
        out(' * wiki:', artist['url'])
        artist = dict(artist)  # copy row so fields can be assigned below
        update = set()         # names of fields to include in the edit
        reasons = []           # (FIELD, [reason, ...]) pairs for the note
        page = WikiPage.fetch(artist['url'], False)
        if not artist['area']:
            country, country_reasons = determine_country(page)
            if country:
                country_id = country_ids[country]
                artist['area'] = country_id
                update.add('area')
                reasons.append(('COUNTRY', country_reasons))
        if not artist['type']:
            type, type_reasons = determine_type(page)
            if type:
                type_id = artist_type_ids[type]
                artist['type'] = type_id
                update.add('type')
                reasons.append(('TYPE', type_reasons))
        # Gender only applies to persons (type == 1).
        if not artist['gender'] and artist['type'] == 1:
            gender, gender_reasons = determine_gender(page)
            if gender:
                gender_id = gender_ids[gender]
                artist['gender'] = gender_id
                update.add('gender')
                reasons.append(('GENDER', gender_reasons))
        is_performance_name = False
        if artist['type'] == 1 and CHECK_PERFORMANCE_NAME:
            is_performance_name = db.execute(performance_name_query, artist['id']).scalar() > 0
            out(" * checking for performance name", is_performance_name)
        if not artist['begin_date_year']:
            begin_date, begin_date_reasons = determine_begin_date(artist, page, is_performance_name)
            if begin_date['year']:
                colored_out(bcolors.OKGREEN, " * new begin date:", begin_date)
                artist['begin_date_year'] = begin_date['year']
                artist['begin_date_month'] = begin_date['month']
                artist['begin_date_day'] = begin_date['day']
                update.add('begin_date')
                reasons.append(('BEGIN DATE', begin_date_reasons))
        if not artist['end_date_year']:
            end_date, end_date_reasons = determine_end_date(artist, page, is_performance_name)
            if end_date['year']:
                colored_out(bcolors.OKGREEN, " * new end date:", end_date)
                artist['end_date_year'] = end_date['year']
                artist['end_date_month'] = end_date['month']
                artist['end_date_day'] = end_date['day']
                update.add('end_date')
                reasons.append(('END DATE', end_date_reasons))
        if update:
            edit_note = 'From %s' % (artist['url'], )
            for field, reason in reasons:
                edit_note += '\n\n%s:\n%s' % (field, ' '.join(reason))
            out(' * edit note:', edit_note.replace('\n', ' '))
            time.sleep(10)  # throttle edit submissions
            mb.edit_artist(artist, update, edit_note)
        # Mark the artist as processed regardless of whether an edit was
        # made, so it is not requeued ahead of unseen artists.
        if artist['processed'] is None:
            db.execute("INSERT INTO bot_wp_artist_data (gid, lang) VALUES (%s, %s)", (artist['gid'], wp_lang))
        else:
            db.execute("UPDATE bot_wp_artist_data SET processed = now() WHERE (gid, lang) = (%s, %s)", (artist['gid'], wp_lang))
query_album_tracks = """ SELECT DISTINCT t.name FROM track t JOIN medium m ON t.medium=m.id JOIN release r ON m.release=r.id WHERE r.release_group = %s """ category_re = {} category_re['en'] = re.compile(r'\[\[Category:(.+?)(?:\|.*?)?\]\]') category_re['fr'] = re.compile(r'\[\[Cat\xe9gorie:(.+?)\]\]') for rg_id, rg_gid, rg_name, ac_name, rg_sec_types, processed in db.execute( query, query_params): colored_out( bcolors.OKBLUE, 'Looking up release group "%s" http://musicbrainz.org/release-group/%s' % (rg_name, rg_gid)) matches = wps.query(escape_query(rg_name), defType='dismax', qf='name', rows=100).results last_wp_request = time.time() for match in matches: title = match['name'] if mangle_name(re.sub(' \(.+\)$', '', title)) != mangle_name( rg_name) and mangle_name(title) != mangle_name(rg_name): continue delay = time.time() - last_wp_request if delay < 1.0: time.sleep(1.0 - delay) last_wp_request = time.time()
""" query_artist_albums = """ SELECT rg.name FROM s_release_group rg JOIN artist_credit_name acn ON rg.artist_credit = acn.artist_credit WHERE acn.artist = %s UNION SELECT r.name FROM s_release r JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit WHERE acn.artist = %s """ for artist in db.execute(query, query_params): colored_out(bcolors.OKBLUE, 'Looking up artist "%s" http://musicbrainz.org/artist/%s' % (artist['name'], artist['gid'])) matches = wps.query(escape_query(artist['name']), defType='dismax', qf='name', rows=50).results last_wp_request = time.time() for match in matches: title = match['name'] if title.endswith('album)') or title.endswith('song)'): continue if mangle_name(re.sub(' \(.+\)$', '', title)) != mangle_name(artist['name']) and mangle_name(title) != mangle_name(artist['name']): continue delay = time.time() - last_wp_request if delay < 1.0: time.sleep(1.0 - delay) last_wp_request = time.time() wikipage = WikiPage.fetch('http://%s.wikipedia.org/wiki/%s' % (wp_lang, title)) page_orig = wikipage.text if not page_orig:
def main():
    """Fill in missing artist data (country, type, gender, dates) from Wikipedia.

    Older-schema variant of the wp-artist-data bot: writes the `country`
    column rather than `area`.  For each artist with a Wikipedia URL,
    runs the determine_* heuristics for still-empty fields and submits
    one artist edit; each handled gid is recorded in bot_wp_artist_data.

    NOTE(review): block nesting near the end of the loop was
    reconstructed from whitespace-mangled source — verify against the
    original file.
    """
    seen = set()
    for artist in db.execute(query):
        # The query can yield an artist once per matching URL; handle once.
        if artist['id'] in seen:
            continue
        seen.add(artist['id'])
        colored_out(bcolors.OKBLUE, 'Looking up artist "%s" http://musicbrainz.org/artist/%s' % (artist['name'], artist['gid']))
        out(' * wiki:', artist['url'])
        artist = dict(artist)  # copy row so fields can be assigned below
        update = set()         # field names to include in the edit
        reasons = []           # (FIELD, [reason, ...]) pairs for the note
        page = WikiPage.fetch(artist['url'])
        if not artist['country']:
            country, country_reasons = determine_country(page)
            if country:
                country_id = country_ids[country]
                artist['country'] = country_id
                update.add('country')
                reasons.append(('COUNTRY', country_reasons))
        if not artist['type']:
            type, type_reasons = determine_type(page)
            if type:
                type_id = artist_type_ids[type]
                artist['type'] = type_id
                update.add('type')
                reasons.append(('TYPE', type_reasons))
        # Gender only applies to persons (type == 1).
        if not artist['gender'] and artist['type'] == 1:
            gender, gender_reasons = determine_gender(page)
            if gender:
                gender_id = gender_ids[gender]
                artist['gender'] = gender_id
                update.add('gender')
                reasons.append(('GENDER', gender_reasons))
        is_performance_name = False
        if artist['type'] == 1 and CHECK_PERFORMANCE_NAME:
            is_performance_name = db.execute(performance_name_query, artist['id']).scalar() > 0
            out(" * checking for performance name", is_performance_name)
        if not artist['begin_date_year']:
            begin_date, begin_date_reasons = determine_begin_date(artist, page, is_performance_name)
            if begin_date['year']:
                colored_out(bcolors.OKGREEN, " * new begin date:", begin_date)
                artist['begin_date_year'] = begin_date['year']
                artist['begin_date_month'] = begin_date['month']
                artist['begin_date_day'] = begin_date['day']
                update.add('begin_date')
                reasons.append(('BEGIN DATE', begin_date_reasons))
        if not artist['end_date_year']:
            end_date, end_date_reasons = determine_end_date(artist, page, is_performance_name)
            if end_date['year']:
                colored_out(bcolors.OKGREEN, " * new end date:", end_date)
                artist['end_date_year'] = end_date['year']
                artist['end_date_month'] = end_date['month']
                artist['end_date_day'] = end_date['day']
                update.add('end_date')
                reasons.append(('END DATE', end_date_reasons))
        if update:
            edit_note = 'From %s' % (artist['url'],)
            for field, reason in reasons:
                edit_note += '\n\n%s:\n%s' % (field, ' '.join(reason))
            out(' * edit note:', edit_note.replace('\n', ' '))
            time.sleep(10)  # throttle edit submissions
            mb.edit_artist(artist, update, edit_note)
        # Record the gid so this artist is not looked up again.
        db.execute("INSERT INTO bot_wp_artist_data (gid, lang) VALUES (%s, %s)", (artist['gid'], wp_lang))
        out()
) SELECT a.id, a.gid, a.name, aws.shs_url, aws.work_id, aws.work_gid, b.processed FROM artists_wo_shs aws JOIN artist a ON aws.artist_id = a.id LEFT JOIN bot_shs_link_artist b ON a.gid = b.artist ORDER BY b.processed NULLS FIRST, a.id LIMIT 1000 """ seen_artists = set() matched_artists = set() for artist in db.execute(query): if artist['gid'] in matched_artists: continue colored_out(bcolors.OKBLUE, 'Looking up artist "%s" https://musicbrainz.org/artist/%s' % (artist['name'], artist['gid'])) m = re.match(r'http://www.secondhandsongs.com/work/([0-9]+)', artist['shs_url']) if m: shs_work = shs.lookup_work(int(m.group(1))) else: continue artist_uri = None shs_artists = [] # credits of actual work if 'credits' in shs_work and len(shs_work['credits']) > 0: shs_artists.extend(shs_work['credits']) # credits of original work if 'originalCredits' in shs_work and len(shs_work['originalCredits']) > 0: shs_artists.extend(shs_work['originalCredits'])
JOIN url u ON u.id = l.entity1 WHERE u.url LIKE 'http://www.encyclopedisque.fr/images/%%' AND (m.format IS NULL OR m.format = 7) AND NOT EXISTS (SELECT 1 FROM l_release_url WHERE l_release_url.entity1 = u.id AND l_release_url.entity0 <> r.id) ) SELECT r.id, r.gid, r.name, ta.url, ta.format, ac.name FROM releases_wo_7inch ta JOIN s_release r ON ta.id = r.id JOIN s_artist_credit ac ON r.artist_credit=ac.id LEFT JOIN bot_encyclopedisque_medium_format b ON r.gid = b.gid WHERE b.gid IS NULL ORDER BY r.artist_credit, r.id LIMIT 100 """ for id, gid, name, url, format, ac_name in db.execute(query): colored_out( bcolors.OKBLUE, 'Looking up release "%s" by "%s" http://musicbrainz.org/release/%s' % (name, ac_name, gid)) edit_note = 'Setting format to 7" based on attached link to Encyclopedisque (%s)' % url out(' * edit note: %s' % (edit_note, )) mb.set_release_medium_format(gid, format, 29, edit_note) time.sleep(5) db.execute( "INSERT INTO bot_encyclopedisque_medium_format (gid) VALUES (%s)", (gid, ))
return None return consolidated_formats.pop() DISCOGS_MB_FORMATS_MAPPING = { 'Vinyl': 7, '12"': 31, '10"': 30, '7"': 29, 'CD': 1, 'CDr': 33, 'Cassette': 8, 'DigitalMedia': 12 } for medium in db.execute(query): colored_out(bcolors.OKBLUE, 'Looking up medium #%s of release "%s" by "%s" https://musicbrainz.org/release/%s' % (medium['medium_id'], medium['name'], medium['ac_name'], medium['gid'])) m = re.match(r'http://www.discogs.com/release/([0-9]+)', medium['discogs_url']) if m: discogs_release = discogs.release(int(m.group(1))) discogs_format = discogs_get_medium_format(discogs_release, medium['position']) if discogs_format is None: colored_out(bcolors.WARNING, ' * using %s, no matching format has been found' % medium['discogs_url']) elif discogs_format not in DISCOGS_MB_FORMATS_MAPPING: colored_out(bcolors.WARNING, ' * using %s, found unknown format %s' % (medium['discogs_url'], discogs_format)) elif DISCOGS_MB_FORMATS_MAPPING[discogs_format] == medium['format']: colored_out(bcolors.WARNING, ' * using %s, no better format found' % medium['discogs_url']) else: colored_out(bcolors.NONE, ' * using %s, found format: %s' % (medium['discogs_url'], discogs_format)) edit_note = 'Setting medium format from attached Discogs link (%s)' % medium['discogs_url']
JOIN l_recording_work lrw ON recording.id = lrw.entity0 JOIN l_artist_work law ON lrw.entity1 = law.entity1 WHERE acn.artist = %s UNION -- Select artists of recordings of works for this artist (i.e. performers of works this artist wrote) SELECT acn.artist AS artist FROM artist_credit_name acn JOIN recording ON acn.artist_credit = recording.artist_credit JOIN l_recording_work lrw ON recording.id = lrw.entity0 JOIN l_artist_work law ON lrw.entity1 = law.entity1 WHERE law.entity0 = %s ) """ for artist in db.execute(query, query_params): colored_out(bcolors.OKBLUE, 'Looking up artist "%s" http://musicbrainz.org/artist/%s' % (artist['name'], artist['gid'])) matches = wps.query(escape_query(artist['name']), defType='dismax', qf='name', rows=50).results last_wp_request = time.time() for match in matches: title = match['name'] if title.endswith('album)') or title.endswith('song)'): continue if mangle_name(re.sub(' \(.+\)$', '', title)) != mangle_name(artist['name']) and mangle_name(title) != mangle_name(artist['name']): continue delay = time.time() - last_wp_request if delay < 1.0: time.sleep(1.0 - delay) last_wp_request = time.time() wikipage = WikiPage.fetch('https://%s.wikipedia.org/wiki/%s' % (wp_lang, title)) page_orig = wikipage.text if not page_orig:
def getImages(self, type=None): if type is None: images = self.metadata['images'] else: images = [] for image in self.metadata['images']: for img_type in image['types']: if img_type == type: images.append(image) break return images for file in sys.argv[1:]: colored_out(bcolors.OKBLUE, "File '%s'" % os.path.basename(file)) if not os.path.exists(file): colored_out(bcolors.FAIL, " * File not found") continue m = FILE_RE.match(os.path.basename(file)) if m is None: colored_out(bcolors.FAIL, " * File doesn't match defined regular expression") continue mbid = m.group('mbid') type = m.group('type') caa_rel_info = CoverArtArchiveReleaseInfo(mbid) if caa_rel_info.hasType(type) and type not in ('medium', 'booklet') and False: colored_out(
import hashlib
import base64
# FIX: `bcolors` was missing from this import although it is used in the
# optional-module fallbacks below — any ImportError there raised NameError.
# (The sibling copy of this prelude imports it; see utils.)
from utils import structureToString, colored_out, bcolors
from datetime import datetime
from mbbot.guesscase import guess_artist_sort_name

# Optional modules: the bot degrades gracefully when Selenium or
# pyvirtualdisplay are not installed by setting the module handle to None.
try:
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
except ImportError as err:
    colored_out(bcolors.WARNING, "Warning: Cannot use Selenium Webdriver client: %s" % err)
    webdriver = None

try:
    from pyvirtualdisplay import Display
except ImportError as err:
    colored_out(
        bcolors.WARNING,
        "Warning: Cannot run Selenium Webdriver client in headless mode: %s" % err)
    Display = None


def format_time(secs):
    """Format a duration given in seconds as 'M:SS' (seconds zero-padded)."""
    return '%0d:%02d' % (secs // 60, secs % 60)
name2) return ratio >= threshold def discogs_get_tracklist(release_url): m = re.match(r'http://www.discogs.com/release/([0-9]+)', release_url) if m: release_id = int(m.group(1)) release = discogs.release(release_id) return [track for track in release.tracklist if track.position != ''] return None for release in db.execute(query): colored_out( bcolors.OKBLUE, 'Looking up release "%s" by "%s" http://musicbrainz.org/release/%s' % (release['name'], release['ac_name'], release['gid'])) discogs_tracks = discogs_get_tracklist(release['discogs_url']) if (len(discogs_tracks) != release['track_count']): colored_out( bcolors.FAIL, ' * number of tracks mismatches (Discogs: %s vs MB: %s)' % (len(discogs_tracks), release['track_count'])) else: changed = False new_mediums = [] position = 0 for mb_track in db.execute(query_release_tracks, (release['id'], )): new_track = {'id': mb_track['id']} if len(new_mediums) < mb_track['medium_position']:
JOIN l_artist_work law ON lrw.entity1 = law.entity1 WHERE acn.artist = %s UNION -- Select artists of recordings of works for this artist (i.e. performers of works this artist wrote) SELECT acn.artist AS artist FROM artist_credit_name acn JOIN recording ON acn.artist_credit = recording.artist_credit JOIN l_recording_work lrw ON recording.id = lrw.entity0 JOIN l_artist_work law ON lrw.entity1 = law.entity1 WHERE law.entity0 = %s ) """ for artist in db.execute(query, query_params): colored_out( bcolors.OKBLUE, 'Looking up artist "%s" http://musicbrainz.org/artist/%s' % (artist['name'], artist['gid'])) matches = wps.query(escape_query(artist['name']), defType='dismax', qf='name', rows=50).results last_wp_request = time.time() for match in matches: title = match['name'] if title.endswith('album)') or title.endswith('song)'): continue if mangle_name(re.sub(' \(.+\)$', '', title)) != mangle_name( artist['name']) and mangle_name(title) != mangle_name( artist['name']): continue delay = time.time() - last_wp_request
for recording in db.execute(query): m = re.match(date_re, recording["comment"]) if m is None: continue date = {"year": int(m.group(1))} if m.group(2) is not None: date["month"] = int(m.group(2)) if m.group(3) is not None: date["day"] = int(m.group(3)) colored_out( bcolors.OKBLUE, 'Setting performance relationships dates of http://musicbrainz.org/recording/%s "%s (%s)"' % (recording["gid"], recording["name"], recording["comment"]), ) attributes = {} edit_note = 'Setting relationship dates from recording comment: "%s"' % recording["comment"] colored_out(bcolors.NONE, " * new date:", date) time.sleep(15) mb.edit_relationship( recording["rel_id"], "recording", "work", recording["link_type"], recording["link_type"], attributes,
""" query_album_tracks = """ SELECT DISTINCT t.name FROM track t JOIN medium m ON t.medium=m.id JOIN release r ON m.release=r.id WHERE r.release_group = %s """ category_re = {} category_re['en'] = re.compile(r'\[\[Category:(.+?)(?:\|.*?)?\]\]') category_re['fr'] = re.compile(r'\[\[Cat\xe9gorie:(.+?)\]\]') for rg_id, rg_gid, rg_name, ac_name, rg_sec_types, processed in db.execute(query, query_params): colored_out(bcolors.OKBLUE, 'Looking up release group "%s" https://musicbrainz.org/release-group/%s' % (rg_name, rg_gid)) matches = wps.query(escape_query(rg_name), defType='dismax', qf='name', rows=100).results last_wp_request = time.time() for match in matches: title = match['name'] if mangle_name(re.sub(' \(.+\)$', '', title)) != mangle_name(rg_name) and mangle_name(title) != mangle_name(rg_name): continue delay = time.time() - last_wp_request if delay < 1.0: time.sleep(1.0 - delay) last_wp_request = time.time() wikipage = WikiPage.fetch('https://%s.wikipedia.org/wiki/%s' % (wp_lang, title)) page_orig = wikipage.text if not page_orig: continue page_title = title
return None DISCOGS_MB_PACKAGING_MAPPING = { 'jewel case': 1, 'slim jewel case': 2, 'digipak': 3, 'cardboard/paper sleeve': 4, 'other': 5, 'keep case': 6, 'none': 7, } for release in db.execute(query): colored_out( bcolors.OKBLUE, 'Looking up release "%s" by "%s" http://musicbrainz.org/release/%s' % (release['name'], release['ac_name'], release['gid'])) m = re.match(r'http://www.discogs.com/release/([0-9]+)', release['discogs_url']) if m: discogs_release = discogs.Release(int(m.group(1))) discogs_packaging = discogs_get_release_packaging(discogs_release) if discogs_packaging: colored_out( bcolors.OKGREEN, ' * using %s, found packaging: %s' % (release['discogs_url'], discogs_packaging)) edit_note = 'Setting release packaging from attached Discogs link (%s)' % release[ 'discogs_url'] out(' * edit note: %s' % (edit_note, ))
"German": 145, "Greek": 159, "Italian": 195, "Japanese": 198, "[Multiple languages]": 284, "Norwegian": 309, "Polish": 338, "Portuguese": 340, "Russian": 353, "Spanish": 393, "Swedish": 403, "Turkish": 433, } for work in db.execute(query): colored_out(bcolors.OKBLUE, 'Looking up work "%s" http://musicbrainz.org/work/%s' % (work['name'], work['gid'])) m = re.match(r'http://www.secondhandsongs.com/work/([0-9]+)', work['shs_url']) if m: shs_work = shs.lookup_work(int(m.group(1))) else: continue if 'language' in shs_work: work = dict(work) shs_lang = shs_work['language'] if shs_lang not in SHS_MB_LANG_MAPPING: colored_out(bcolors.FAIL, ' * No mapping defined for language ''%s' % shs_lang) else: work['iswcs'] = []
def main(ENTITY_TYPE):
    """Find entities of the given type that have a Wikipedia URL but no
    Wikidata URL, resolve the Wikidata identifier from the Wikipedia page,
    and submit the new URL relationship to MusicBrainz.

    ENTITY_TYPE is a MusicBrainz entity type name (may contain '-', e.g.
    'release-group'); progress is recorded in bot_wp_wikidata_links and
    per-type totals are written into the module-level `stats` dict.
    """
    entity_type_table = ENTITY_TYPE.replace('-', '_')
    # For 'work' the URL relationship table and entity0/entity1 roles are
    # reversed (l_url_work instead of l_<type>_url).
    url_relationship_table = 'l_%s_url' % entity_type_table if ENTITY_TYPE != 'work' else 'l_url_%s' % entity_type_table
    main_entity_entity_point = "entity0" if ENTITY_TYPE != 'work' else "entity1"
    url_entity_point = "entity1" if ENTITY_TYPE != 'work' else "entity0"
    # NOTE(review): query is assembled by string concatenation; all pieces are
    # derived from fixed tables/constants, not user input.
    query = """
WITH entities_wo_wikidata AS (
    SELECT DISTINCT e.id AS entity_id, e.gid AS entity_gid, u.url AS wp_url,
        substring(u.url from '//(([a-z]|-)+)\\.') as wp_lang
    FROM """ + entity_type_table + """ e
    JOIN """ + url_relationship_table + """ l ON l.""" + main_entity_entity_point + """ = e.id
        AND l.link IN (SELECT id FROM link WHERE link_type = """ + str(
        WIKIPEDIA_RELATIONSHIP_TYPES[ENTITY_TYPE]) + """)
    JOIN url u ON u.id = l.""" + url_entity_point + """ AND u.url ~ '^https?://[a-z-]+\.wikipedia\.org/wiki/'
    WHERE
        /* No existing Wikidata relationship for this entity */
        NOT EXISTS (SELECT 1 FROM """ + url_relationship_table + """ ol
            WHERE ol.""" + main_entity_entity_point + """ = e.id
            AND ol.link IN (SELECT id FROM link WHERE link_type = """ + str(
        WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE]) + """))
        /* WP link should only be linked to this entity */
        AND NOT EXISTS (SELECT 1 FROM """ + url_relationship_table + """ ol
            WHERE ol.""" + url_entity_point + """ = u.id
            AND ol.""" + main_entity_entity_point + """ <> e.id)
        AND l.edits_pending = 0
)
SELECT e.id, e.gid, e.name, ewf.wp_url, b.processed
FROM entities_wo_wikidata ewf
JOIN """ + entity_type_table + """ e ON ewf.entity_id = e.id
LEFT JOIN bot_wp_wikidata_links b ON e.gid = b.gid AND b.lang = ewf.wp_lang
ORDER BY b.processed NULLS FIRST, e.id
LIMIT 500
"""
    seen = set()     # gids examined this run (for INSERT-vs-UPDATE bookkeeping)
    matched = set()  # gids for which a Wikidata URL edit was submitted
    for entity in db.execute(query):
        # An entity may appear once per Wikipedia link; skip once matched.
        if entity['gid'] in matched:
            continue
        colored_out(
            bcolors.OKBLUE,
            'Looking up entity "%s" http://musicbrainz.org/%s/%s' %
            (entity['name'], ENTITY_TYPE, entity['gid']))
        out(' * wiki:', entity['wp_url'])
        page = WikiPage.fetch(entity['wp_url'], False)
        if page.wikidata_id:
            # Wikidata item IDs are case-insensitive in practice; normalize to
            # the canonical upper-case 'Q…' form for the URL.
            wikidata_url = 'https://www.wikidata.org/wiki/%s' % page.wikidata_id.upper(
            )
            edit_note = 'From %s' % (entity['wp_url'], )
            colored_out(bcolors.OKGREEN, ' * found Wikidata identifier:',
                        wikidata_url)
            time.sleep(1)  # throttle edit submissions
            out(' * edit note:', edit_note.replace('\n', ' '))
            mb.add_url(ENTITY_TYPE.replace('-', '_'), entity['gid'],
                       str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE]),
                       wikidata_url, edit_note, True)
            matched.add(entity['gid'])
        # Record progress regardless of match so the entity sorts later
        # (ORDER BY b.processed NULLS FIRST) on subsequent runs.
        if entity['processed'] is None and entity['gid'] not in seen:
            db.execute(
                "INSERT INTO bot_wp_wikidata_links (gid, lang) VALUES (%s, %s)",
                (entity['gid'], page.lang))
        else:
            db.execute(
                "UPDATE bot_wp_wikidata_links SET processed = now() WHERE (gid, lang) = (%s, %s)",
                (entity['gid'], page.lang))
        seen.add(entity['gid'])
    stats['seen'][ENTITY_TYPE] = len(seen)
    stats['matched'][ENTITY_TYPE] = len(matched)
AND NOT EXISTS (SELECT 1 FROM l_recording_work lrw2 WHERE lrw2.entity0 = r.id AND lrw2.entity1 <> lrw.entity1) ORDER BY r.artist_credit LIMIT 750 """ date_re = re.compile(r'live, (\d{4})(?:-(\d{2}))?(?:-(\d{2}))?:', re.I) for recording in db.execute(query): m = re.match(date_re, recording['comment']) if m is None: continue date = {'year': int(m.group(1))} if m.group(2) is not None: date['month'] = int(m.group(2)) if m.group(3) is not None: date['day'] = int(m.group(3)) colored_out(bcolors.OKBLUE, 'Setting performance relationships dates of http://musicbrainz.org/recording/%s "%s (%s)"' % (recording['r_gid'], recording['name'], recording['comment'])) attributes = {} edit_note = 'Setting relationship dates from recording comment: "%s"' % recording['comment'] colored_out(bcolors.NONE, " * new date:", date) entity0 = {'type': 'recording', 'gid': recording['r_gid']} entity1 = {'type': 'work', 'gid': recording['w_gid']} time.sleep(2) mb.edit_relationship(recording['rel_id'], entity0, entity1, recording['link_type'], attributes, date, date, edit_note, True)
AND lt.name = 'performance' AND r.edits_pending = 0 AND lrw.edits_pending = 0 /* Only one linked work */ AND NOT EXISTS (SELECT 1 FROM l_recording_work lrw2 WHERE lrw2.entity0 = r.id AND lrw2.entity1 <> lrw.entity1) ORDER BY b.processed NULLS FIRST, r.artist_credit, r.id LIMIT 750 """ date_re = re.compile(r'live, (\d{4})(?!-\d{4})(?:-(\d{2}))?(?:-(\d{2}))?:?', re.I) for recording in db.execute(query): m = re.match(date_re, recording['comment']) if m is None: colored_out( bcolors.WARNING, 'Skipping https://musicbrainz.org/recording/%s "%s (%s)": not matching regexp' % (recording['r_gid'], recording['name'], recording['comment'])) else: date = {'year': int(m.group(1))} if m.group(2) is not None: date['month'] = int(m.group(2)) if m.group(3) is not None: date['day'] = int(m.group(3)) colored_out( bcolors.OKBLUE, 'Setting performance relationships dates of https://musicbrainz.org/recording/%s "%s (%s)"' % (recording['r_gid'], recording['name'], recording['comment'])) attributes = {}
def main(verbose=False):
    """Match barcodes of MusicBrainz releases against Amazon and submit an
    ASIN URL relationship for each confident match.

    Releases that cannot be matched are remembered in the bot_asin_* tables
    (missing / problematic / nocover / catmismatch) so they are not retried.
    Stops when the account's remaining edit quota is exhausted.
    """
    edits_left = mb.edits_left()
    releases = [(r, gid, barcode, name, ac, country, year, month, day)
                for r, gid, barcode, name, ac, country, year, month, day
                in db.execute(query_releases_without_asin)]
    count = len(releases)
    for i, (r, gid, barcode, name, ac, country, year, month, day) in enumerate(releases):
        if edits_left <= 0:
            break
        # Skip anything a previous run already classified.
        if gid in asin_missing or gid in asin_problematic or gid in asin_nocover or gid in asin_catmismatch:
            continue
        if not barcode_type(barcode):
            # Not a recognizable UPC/EAN — permanently problematic.
            # NOTE(review): params here are a bare string, not a tuple —
            # relies on db.execute accepting a scalar; verify wrapper.
            db.execute("INSERT INTO bot_asin_problematic (gid) VALUES (%s)", gid)
            continue
        if country not in store_map_rev:
            # No Amazon store mapped for this release country.
            continue
        # Two releases sharing a (zero-stripped) barcode are ambiguous.
        if barcode.lstrip('0') in barcodes_hist and barcodes_hist[barcode.lstrip('0')] > 1:
            if verbose:
                colored_out(bcolors.WARNING, ' two releases with same barcode, skip for now')
            db.execute("INSERT INTO bot_asin_problematic (gid) VALUES (%s)", gid)
            continue
        if verbose:
            colored_out(bcolors.OKBLUE, u'%d/%d - %.2f%% - %s http://musicbrainz.org/release/%s %s %s' % (i+1, count, (i+1) * 100.0 / count, name, gid, barcode, country))
        try:
            # Missing date parts default to 1 to form a comparable datetime.
            mb_date = datetime.datetime(year if year else 1, month if month else 1, day if day else 1)
            item = amazon_get_asin(barcode, country, mb_date)
        except (urllib2.HTTPError, urllib2.URLError, socket.timeout) as e:
            out(e)
            continue
        if item is None:
            if verbose:
                out(' * not found, continue')
            db.execute("INSERT INTO bot_asin_missing (gid) VALUES (%s)", gid)
            continue
        url = amazon_url_cleanup(str(item.DetailPageURL), str(item.ASIN))
        if verbose:
            out(' * barcode matches %s' % url)
        if item.ASIN in asins:
            if verbose:
                out(' * skip, ASIN already in DB')
            db.execute("INSERT INTO bot_asin_problematic (gid) VALUES (%s)", gid)
            continue
        # Only link items that have cover art available.
        if not 'LargeImage' in item.__dict__:
            if verbose:
                out(' * skip, has no image')
            db.execute("INSERT INTO bot_asin_nocover (gid) VALUES (%s)", gid)
            continue
        attrs = item.ItemAttributes
        # Import editions often have a different barcode/label; skip them.
        if 'Format' in attrs.__dict__ and 'Import' in [f for f in attrs.Format]:
            if verbose:
                out(' * skip, is marked as Import')
            db.execute("INSERT INTO bot_asin_problematic (gid) VALUES (%s)", gid)
            continue
        amazon_name = unicode(attrs.Title)
        # Prefer the Seikodo product code (Japanese releases) over MPN as
        # the Amazon-side catalog number.
        catnr = None
        if 'SeikodoProductCode' in attrs.__dict__:
            catnr = unicode(attrs.SeikodoProductCode)
        elif 'MPN' in attrs.__dict__:
            catnr = unicode(attrs.MPN)
        matched = False
        if catnr:
            for mb_catnr in release_catnrs(r):
                if cat_compare(mb_catnr, catnr, country):
                    matched = True
                    break
            # For Japan a catalog-number mismatch is treated as fatal;
            # elsewhere the catnr is simply dropped from the edit note.
            if not matched and country == 'JP':
                if verbose:
                    colored_out(bcolors.FAIL, u' * CAT NR MISMATCH, ARGH!')
                db.execute("INSERT INTO bot_asin_catmismatch (gid) VALUES (%s)", gid)
                continue
            if not matched:
                catnr = None
        if not are_similar(name, amazon_name):
            if verbose:
                colored_out(bcolors.FAIL, u' * Similarity too small: %s <-> %s' % (name, amazon_name))
            db.execute("INSERT INTO bot_asin_problematic (gid) VALUES (%s)", gid)
            continue
        if (gid, url) in asin_set:
            if verbose:
                colored_out(bcolors.WARNING, u' * already linked earlier (probably got removed by some editor!)')
            continue
        # Build the human-readable edit note explaining the match evidence.
        text = u'%s lookup for “%s” (country: %s), ' % (barcode_type(barcode), barcode, country)
        if catnr:
            text += u'matching catalog numer “%s”, release name is “%s”' % (catnr, attrs.Title)
        else:
            text += u'has similar name “%s”' % attrs.Title
        if 'Artist' in attrs.__dict__:
            text += u' by “%s”' % attrs.Artist
        text += u'.\nAmazon.com: '
        if 'Binding' in attrs.__dict__:
            if 'NumberOfDiscs' in attrs.__dict__:
                text += u'%s × ' % attrs.NumberOfDiscs
            text += u'%s' % attrs.Binding
        if not catnr and 'Label' in attrs.__dict__:
            text += u', %s' % attrs.Label
        if 'ReleaseDate' in attrs.__dict__:
            text += u', %s' % attrs.ReleaseDate
        text += u'\nMusicBrainz: '
        text += u'%s' % release_format(r)
        if not catnr:
            labels = release_labels(r)
            if labels:
                text += u', %s' % u' / '.join(labels)
        if year:
            text += u', %s' % date_format(year, month, day)
        # Append search links so reviewers can verify the match themselves.
        if catnr and country == 'JP':
            text += u'\nhttp://amazon.jp/s?field-keywords=%s\nhttp://amazon.jp/s?field-keywords=%s' % (catnr, barcode)
        else:
            text += u'\nhttp://amazon.%s/s?field-keywords=%s' % (amazon_url_tld(url), barcode)
        # make "Import" bold so it is easier recognizable
        re_bold_import = re.compile(ur'\b(imports?)\b', re.IGNORECASE)
        text = re_bold_import.sub(ur"'''\1'''", text)
        try:
            colored_out(bcolors.OKGREEN, u' * http://musicbrainz.org/release/%s -> %s' % (gid, url))
            # 77 is the release↔ASIN URL relationship type id.
            mb.add_url('release', gid, 77, url, text)
            db.execute("INSERT INTO bot_asin_set (gid,url) VALUES (%s,%s)", (gid, url))
            asins.add(url)
            edits_left -= 1
        except (urllib2.HTTPError, urllib2.URLError, socket.timeout) as e:
            out(e)
from cStringIO import StringIO import time from editing import MusicBrainzWebdriverClient import socket from utils import out, colored_out, bcolors, monkeypatch_mechanize import config as cfg try: import config_caa as cfg_caa except ImportError: cfg_caa = cfg try: import discogs_client except ImportError as err: colored_out(bcolors.FAIL, "Error: Cannot use Discogs: %s\n" % err + "Run 'pip install discogs-client' or get discogs_client.py from\n" "https://github.com/discogs/discogs_client") sys.exit(1) # Optional modules try: import amazonproduct from amazonproduct.contrib.retry import RetryAPI except ImportError as err: colored_out(bcolors.WARNING, "Warning: Cannot use Amazon: %s" % err) amazonproduct = None try: from mbbot.source.spotify import SpotifyWebService spotify = SpotifyWebService() except ImportError as err:
def main():
    """Link artists that have a Wikipedia URL to their VIAF authority record.

    For each candidate artist, fetch the Wikipedia page, extract authority
    identifiers, validate the VIAF record still exists, then submit a VIAF
    URL relationship. Progress is tracked in bot_wp_artist_viaf.
    """
    seen = set()     # gids examined this run
    matched = set()  # gids for which a VIAF URL edit was submitted
    for artist in db.execute(query):
        # An artist may be returned once per Wikipedia link; handle it once.
        if artist['gid'] in matched:
            continue
        colored_out(bcolors.OKBLUE, 'Looking up artist "%s" http://musicbrainz.org/artist/%s' % (artist['name'], artist['gid']))
        out(' * wiki:', artist['wp_url'])
        page = WikiPage.fetch(artist['wp_url'], False)
        identifiers = determine_authority_identifiers(page)
        if 'VIAF' in identifiers:
            if not isinstance(identifiers['VIAF'], basestring):
                # A list/tuple means the page carries several VIAF ids —
                # ambiguous, so don't guess.
                colored_out(bcolors.FAIL, ' * multiple VIAF found: %s' % ', '.join(identifiers['VIAF']))
            elif identifiers['VIAF'] == '' or identifiers['VIAF'] is None:
                colored_out(bcolors.FAIL, ' * invalid empty VIAF found')
            else:
                viaf_url = 'http://viaf.org/viaf/%s' % identifiers['VIAF']
                edit_note = 'From %s' % (artist['wp_url'],)
                colored_out(bcolors.OKGREEN, ' * found VIAF:', viaf_url)
                # Check if this VIAF has not been deleted
                skip = False
                try:
                    resp, content = httplib2.Http().request(viaf_url)
                except socket.error:
                    colored_out(bcolors.FAIL, ' * timeout!')
                    skip = True
                deleted_message = 'abandonedViafRecord'
                # FIX: httplib2's Response.status is an int; the original
                # compared it to the string '404', so the HTTP-404 branch of
                # this deleted-record check could never trigger.
                if not skip and (resp.status == 404 or deleted_message in content):
                    colored_out(bcolors.FAIL, ' * deleted VIAF!')
                    skip = True
                if not skip:
                    time.sleep(3)  # throttle edit submissions
                    out(' * edit note:', edit_note.replace('\n', ' '))
                    mb.add_url('artist', artist['gid'], str(VIAF_RELATIONSHIP_TYPES['artist']), viaf_url, edit_note)
                    matched.add(artist['gid'])
        # Record progress so this artist sorts later on subsequent runs.
        if artist['processed'] is None and artist['gid'] not in seen:
            db.execute("INSERT INTO bot_wp_artist_viaf (gid, lang) VALUES (%s, %s)", (artist['gid'], page.lang))
        else:
            db.execute("UPDATE bot_wp_artist_viaf SET processed = now() WHERE (gid, lang) = (%s, %s)", (artist['gid'], page.lang))
        seen.add(artist['gid'])
return '10"' return None DISCOGS_MB_FORMATS_MAPPING = { 'Vinyl': 7, '12"': 31, '10"': 30, '7"' : 29, 'CD' : 1, 'CDr' : 33, 'Cassette' : 8, 'DigitalMedia': 12 } for medium in db.execute(query): colored_out(bcolors.OKBLUE, 'Looking up medium #%s of release "%s" by "%s" http://musicbrainz.org/release/%s' % (medium['position'], medium['name'], medium['ac_name'], medium['gid'])) m = re.match(r'http://www.discogs.com/release/([0-9]+)', medium['discogs_url']) if m: discogs_release = discogs.Release(int(m.group(1))) discogs_format = discogs_get_medium_format(discogs_release, medium['position']) if discogs_format: colored_out(bcolors.HEADER, ' * using %s, found format: %s' % (medium['discogs_url'], discogs_format)) edit_note = 'Setting medium format from attached Discogs link (%s)' % medium['discogs_url'] out(' * edit note: %s' % (edit_note,)) mb.set_release_medium_format(medium['gid'], medium['position'], medium['format'], DISCOGS_MB_FORMATS_MAPPING[discogs_format], edit_note, True) else: colored_out(bcolors.FAIL, ' * using %s, no matching format has been found' % (medium['discogs_url'],)) if medium['processed'] is None:
) SELECT a.id, a.gid, a.name, aws.shs_url, aws.work_id, aws.work_gid, b.processed FROM artists_wo_shs aws JOIN s_artist a ON aws.artist_id = a.id LEFT JOIN bot_shs_link_artist b ON a.gid = b.artist ORDER BY b.processed NULLS FIRST, a.id LIMIT 1000 """ seen_artists = set() matched_artists = set() for artist in db.execute(query): if artist['gid'] in matched_artists: continue colored_out(bcolors.OKBLUE, 'Looking up artist "%s" http://musicbrainz.org/artist/%s' % (artist['name'], artist['gid'])) m = re.match(r'http://www.secondhandsongs.com/work/([0-9]+)', artist['shs_url']) if m: shs_work = shs.lookup_work(int(m.group(1))) else: continue artist_uri = None shs_artists = [] # credits of actual work if 'credits' in shs_work and len(shs_work['credits']) > 0: shs_artists.extend(shs_work['credits']) # credits of original work if 'originalCredits' in shs_work and len(shs_work['originalCredits']) > 0: shs_artists.extend(shs_work['originalCredits'])
import hashlib import base64 from utils import structureToString, colored_out, bcolors from datetime import datetime from mbbot.guesscase import guess_artist_sort_name # Optional modules try: from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException except ImportError as err: colored_out(bcolors.WARNING, "Warning: Cannot use Selenium Webdriver client: %s" % err) webdriver = None try: from pyvirtualdisplay import Display except ImportError as err: colored_out(bcolors.WARNING, "Warning: Cannot run Selenium Webdriver client in headless mode: %s" % err) Display = None def format_time(secs): return '%0d:%02d' % (secs // 60, secs % 60) def album_to_form(album): form = {} form['artist_credit.names.0.artist.name'] = album['artist']
return "jewel case" return None DISCOGS_MB_PACKAGING_MAPPING = { 'jewel case': 1, 'slim jewel case': 2, 'digipak': 3, 'cardboard/paper sleeve': 4, 'other': 5, 'keep case': 6, 'none': 7, } for release in db.execute(query): colored_out(bcolors.OKBLUE, 'Looking up release "%s" by "%s" http://musicbrainz.org/release/%s' % (release['name'], release['ac_name'], release['gid'])) m = re.match(r'http://www.discogs.com/release/([0-9]+)', release['discogs_url']) if m: discogs_release = discogs.Release(int(m.group(1))) discogs_packaging = discogs_get_release_packaging(discogs_release) if discogs_packaging: colored_out(bcolors.OKGREEN, ' * using %s, found packaging: %s' % (release['discogs_url'], discogs_packaging)) edit_note = 'Setting release packaging from attached Discogs link (%s)' % release['discogs_url'] out(' * edit note: %s' % (edit_note,)) mb.set_release_packaging(release['gid'], release['packaging'], DISCOGS_MB_PACKAGING_MAPPING[discogs_packaging], edit_note, True) else: colored_out(bcolors.NONE, ' * using %s, no matching packaging has been found' % (release['discogs_url'],)) time.sleep(2)
return False def getImages(self, type=None): if type is None: images = self.metadata['images'] else: images = [] for image in self.metadata['images']: for img_type in image['types']: if img_type == type: images.append(image) break return images for file in sys.argv[1:]: colored_out(bcolors.OKBLUE, "File '%s'" % os.path.basename(file)) if not os.path.exists(file): colored_out(bcolors.FAIL, " * File not found") continue m = FILE_RE.match(os.path.basename(file)) if m is None: colored_out(bcolors.FAIL, " * File doesn't match defined regular expression") continue mbid = m.group('mbid') type = m.group('type') caa_rel_info = CoverArtArchiveReleaseInfo(mbid) if caa_rel_info.hasType(type) and type not in ('medium', 'booklet') and False: colored_out(bcolors.WARNING, " * Release already has an image of type '%s' => skipping" % type.lower()) continue
ratio = Levenshtein.jaro_winkler(name1, name2) # TODO: remove this debug print if ratio < 0.8: print " * ratio = %s => name1 = '%s' vs name2 = '%s'" % (ratio, name1, name2) return ratio >= 0.8 def discogs_get_tracklist(release_url): m = re.match(r'http://www.discogs.com/release/([0-9]+)', release_url) if m: release_id = int(m.group(1)) release = discogs.release(release_id) return [track for track in release.tracklist if track['position'] != ''] return None for release in db.execute(query): colored_out(bcolors.OKBLUE, 'Looking up release "%s" by "%s" http://musicbrainz.org/release/%s' % (release['name'], release['ac_name'], release['gid'])) discogs_tracks = discogs_get_tracklist(release['discogs_url']) if (len(discogs_tracks) != release['track_count']): colored_out(bcolors.HEADER, ' * number of tracks mismatches (Discogs: %s vs MB: %s)' % (len(discogs_tracks), release['track_count'])) else: changed = False new_mediums = [] position = 0 for mb_track in db.execute(query_release_tracks, (release['id'],)): new_track = {} if len(new_mediums) < mb_track['medium_position']: new_mediums.append({'tracklist': []}) new_mediums[-1]['tracklist'].append(new_track) discogs_track = discogs_tracks[position]
def save_processing(mb_release): if mb_release['processed'] is None: db.execute("INSERT INTO bot_isrc_spotify (release) VALUES (%s)", (mb_release['gid'])) else: db.execute("UPDATE bot_isrc_spotify SET processed = now() WHERE release = %s", (mb_release['gid'])) sws = SpotifyWebService() musicbrainzngs.auth(cfg.MB_USERNAME, cfg.MB_PASSWORD) for release in db.execute(query_releases_wo_isrcs): mb_release = dict(release) colored_out(bcolors.OKBLUE, 'Looking up release "%s" https://musicbrainz.org/release/%s' % (mb_release['name'], mb_release['gid'])) sp_albums = sws.search_albums('upc:%s' % mb_release['barcode']) if len(sp_albums) != 1: if len(sp_albums) == 0: out(' * no spotify release found') if len(sp_albums) > 1: out(' * multiple spotify releases found') save_processing(mb_release) continue sp_uri = sp_albums[0]['href'] sp_release = sws.lookup(sp_uri, detail=2) for track in sp_release['tracks']: for extid in track['external-ids']: if extid['type'] == 'isrc':