def main(ENTITY_TYPE):
    """Add Wikidata URL relationships to entities that only have a Wikipedia link.

    Selects up to 500 entities of ``ENTITY_TYPE`` that have a Wikipedia URL
    relationship with no pending edits, no Wikidata relationship, and whose
    Wikipedia URL is not linked to any other entity of that type. Each
    Wikipedia page is fetched and, when it exposes a Wikidata identifier, the
    corresponding Wikidata URL is submitted through ``mb.add_url``. Every
    visited entity is recorded in ``bot_wp_wikidata_links`` and the per-type
    counters in ``stats`` are updated at the end.
    """
    table = ENTITY_TYPE.replace('-', '_')
    # For works the URL is entity0 of the relationship table; for every other
    # type the entity itself is entity0.
    if ENTITY_TYPE != 'work':
        link_table = 'l_%s_url' % table
        entity_col, url_col = "entity0", "entity1"
    else:
        link_table = 'l_url_%s' % table
        entity_col, url_col = "entity1", "entity0"

    query = """
    WITH
        entities_wo_wikidata AS (
            SELECT DISTINCT e.id AS entity_id, e.gid AS entity_gid, u.url AS wp_url, substring(u.url from '//(([a-z]|-)+)\\.') as wp_lang
            FROM {table} e
                JOIN {link_table} l ON l.{entity_col} = e.id AND l.link IN (SELECT id FROM link WHERE link_type = {wikipedia_type})
                JOIN url u ON u.id = l.{url_col} AND u.url LIKE 'http://%%.wikipedia.org/wiki/%%'
            WHERE 
                /* No existing WikiData relationship for this entity */
                NOT EXISTS (SELECT 1 FROM {link_table} ol WHERE ol.{entity_col} = e.id AND ol.link IN (SELECT id FROM link WHERE link_type = {wikidata_type}))
                /* WP link should only be linked to this entity */
                AND NOT EXISTS (SELECT 1 FROM {link_table} ol WHERE ol.{url_col} = u.id AND ol.{entity_col} <> e.id)
                AND l.edits_pending = 0
        )
    SELECT e.id, e.gid, e.name, ewf.wp_url, b.processed
    FROM entities_wo_wikidata ewf
    JOIN {table} e ON ewf.entity_id = e.id
    LEFT JOIN bot_wp_wikidata_links b ON e.gid = b.gid AND b.lang = ewf.wp_lang
    ORDER BY b.processed NULLS FIRST, e.id
    LIMIT 500
    """.format(
        table=table,
        link_table=link_table,
        entity_col=entity_col,
        url_col=url_col,
        wikipedia_type=str(WIKIPEDIA_RELATIONSHIP_TYPES[ENTITY_TYPE]),
        wikidata_type=str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE]),
    )

    visited_gids = set()
    linked_gids = set()
    for row in db.execute(query):
        gid = row['gid']
        # An entity can appear once per Wikipedia URL; stop after the first
        # row that produced a Wikidata link.
        if gid in linked_gids:
            continue

        wp_url = row['wp_url']
        banner = 'Looking up entity "%s" http://musicbrainz.org/%s/%s' % (row['name'], ENTITY_TYPE, gid)
        colored_out(bcolors.OKBLUE, banner)
        out(' * wiki:', wp_url)

        page = WikiPage.fetch(wp_url, False)
        if page.wikidata_id:
            wikidata_url = 'http://www.wikidata.org/wiki/%s' % page.wikidata_id.upper()
            edit_note = 'From %s' % (wp_url,)
            colored_out(bcolors.OKGREEN, ' * found WikiData identifier:', wikidata_url)
            time.sleep(1)
            out(' * edit note:', edit_note.replace('\n', ' '))
            relationship_type = str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE])
            mb.add_url(table, gid, relationship_type, wikidata_url, edit_note, True)
            linked_gids.add(gid)

        # Insert a bookkeeping row the first time this entity is handled,
        # otherwise refresh the timestamp of the existing row.
        if row['processed'] is None and gid not in visited_gids:
            statement = "INSERT INTO bot_wp_wikidata_links (gid, lang) VALUES (%s, %s)"
        else:
            statement = "UPDATE bot_wp_wikidata_links SET processed = now() WHERE (gid, lang) = (%s, %s)"
        db.execute(statement, (gid, page.lang))
        visited_gids.add(gid)

    stats['seen'][ENTITY_TYPE] = len(visited_gids)
    stats['matched'][ENTITY_TYPE] = len(linked_gids)
예제 #2
0
def main():
    """Add VIAF URL relationships to artists using their Wikipedia page.

    Iterates over the rows returned by the module-level ``query`` (expected
    to provide ``gid``, ``name``, ``wp_url`` and ``processed``), fetches each
    Wikipedia page and extracts authority identifiers from it. When exactly
    one non-empty VIAF identifier is found and the VIAF record still exists,
    the VIAF URL is submitted through ``mb.add_url``. Every visited artist is
    recorded in ``bot_wp_artist_viaf``.
    """
    seen = set()
    matched = set()
    for artist in db.execute(query):
        gid = artist['gid']
        # An artist can appear once per Wikipedia URL; stop after a match.
        if gid in matched:
            continue

        colored_out(
            bcolors.OKBLUE,
            'Looking up artist "%s" http://musicbrainz.org/artist/%s' %
            (artist['name'], gid))
        out(' * wiki:', artist['wp_url'])

        page = WikiPage.fetch(artist['wp_url'], False)
        identifiers = determine_authority_identifiers(page)
        if 'VIAF' in identifiers:
            viaf = identifiers['VIAF']
            if viaf is None or viaf == '':
                # Tested before the "multiple values" case: None is not a
                # string, so the join below would otherwise raise TypeError.
                colored_out(bcolors.FAIL, ' * invalid empty VIAF found')
            elif not isinstance(viaf, basestring):
                colored_out(
                    bcolors.FAIL,
                    ' * multiple VIAF found: %s' % ', '.join(viaf))
            else:
                viaf_url = 'http://viaf.org/viaf/%s' % viaf
                edit_note = 'From %s' % (artist['wp_url'], )
                colored_out(bcolors.OKGREEN, ' * found VIAF:', viaf_url)
                # Check that this VIAF record has not been deleted.
                skip = False
                try:
                    resp, content = httplib2.Http().request(viaf_url)
                except socket.error:
                    colored_out(bcolors.FAIL, ' * timeout!')
                    skip = True
                else:
                    # httplib2 exposes the status code as an integer, so it
                    # must be compared with 404, not the string '404'.
                    if resp.status == 404 or 'abandonedViafRecord' in content:
                        colored_out(bcolors.FAIL, ' * deleted VIAF!')
                        skip = True
                if not skip:
                    time.sleep(3)
                    out(' * edit note:', edit_note.replace('\n', ' '))
                    mb.add_url('artist', gid,
                               str(VIAF_RELATIONSHIP_TYPES['artist']),
                               viaf_url, edit_note)
                    matched.add(gid)

        # Insert a bookkeeping row the first time this artist is handled,
        # otherwise refresh the timestamp of the existing row.
        if artist['processed'] is None and gid not in seen:
            db.execute(
                "INSERT INTO bot_wp_artist_viaf (gid, lang) VALUES (%s, %s)",
                (gid, page.lang))
        else:
            db.execute(
                "UPDATE bot_wp_artist_viaf SET processed = now() WHERE (gid, lang) = (%s, %s)",
                (gid, page.lang))
        seen.add(gid)
예제 #3
0
def main():
    """Add VIAF URL relationships to artists using their Wikipedia page.

    For each row of the module-level ``query`` (expected to provide ``gid``,
    ``name``, ``wp_url`` and ``processed``) the Wikipedia page is fetched and
    its authority identifiers are extracted. A single, non-empty VIAF
    identifier whose VIAF record still exists is submitted through
    ``mb.add_url``. Every visited artist is recorded in ``bot_wp_artist_viaf``.
    """
    seen = set()
    matched = set()
    for artist in db.execute(query):
        # An artist can appear once per Wikipedia URL; stop after a match.
        if artist['gid'] in matched:
            continue

        colored_out(bcolors.OKBLUE, 'Looking up artist "%s" http://musicbrainz.org/artist/%s' % (artist['name'], artist['gid']))
        out(' * wiki:', artist['wp_url'])

        page = WikiPage.fetch(artist['wp_url'], False)
        identifiers = determine_authority_identifiers(page)
        if 'VIAF' in identifiers:
            viaf = identifiers['VIAF']
            if viaf is None or viaf == '':
                # Must come before the isinstance test: None is not a string
                # and would otherwise be passed to ', '.join() and raise.
                colored_out(bcolors.FAIL, ' * invalid empty VIAF found')
            elif not isinstance(viaf, basestring):
                colored_out(bcolors.FAIL, ' * multiple VIAF found: %s' % ', '.join(viaf))
            else:
                viaf_url = 'http://viaf.org/viaf/%s' % viaf
                edit_note = 'From %s' % (artist['wp_url'],)
                colored_out(bcolors.OKGREEN, ' * found VIAF:', viaf_url)
                # Check that this VIAF record has not been deleted.
                usable = True
                try:
                    resp, content = httplib2.Http().request(viaf_url)
                except socket.error:
                    colored_out(bcolors.FAIL, ' * timeout!')
                    usable = False
                else:
                    # The status attribute is an integer; comparing it with
                    # the string '404' could never be true.
                    if resp.status == 404 or 'abandonedViafRecord' in content:
                        colored_out(bcolors.FAIL, ' * deleted VIAF!')
                        usable = False
                if usable:
                    time.sleep(3)
                    out(' * edit note:', edit_note.replace('\n', ' '))
                    mb.add_url('artist', artist['gid'], str(VIAF_RELATIONSHIP_TYPES['artist']), viaf_url, edit_note)
                    matched.add(artist['gid'])

        # Insert a bookkeeping row the first time this artist is handled,
        # otherwise refresh the timestamp of the existing row.
        if artist['processed'] is None and artist['gid'] not in seen:
            db.execute("INSERT INTO bot_wp_artist_viaf (gid, lang) VALUES (%s, %s)", (artist['gid'], page.lang))
        else:
            db.execute("UPDATE bot_wp_artist_viaf SET processed = now() WHERE (gid, lang) = (%s, %s)", (artist['gid'], page.lang))
        seen.add(artist['gid'])
예제 #4
0
def main(ENTITY_TYPE):
    """Add Wikidata URL relationships to entities that only have a Wikipedia link.

    Selects up to 500 entities of ``ENTITY_TYPE`` that have an http(s)
    Wikipedia URL relationship with no pending edits, no Wikidata
    relationship, and whose Wikipedia URL is not linked to any other entity
    of that type. Each Wikipedia page is fetched and, when it exposes a
    Wikidata identifier, the corresponding Wikidata URL is submitted through
    ``mb.add_url``. Every visited entity is recorded in
    ``bot_wp_wikidata_links`` and the per-type counters in ``stats`` are
    updated at the end.

    ``ENTITY_TYPE`` is the entity type name as used in URLs (dashes allowed);
    it must be a key of both relationship-type mappings.
    """
    entity_type_table = ENTITY_TYPE.replace('-', '_')
    # For works the URL is entity0 of the relationship table; for every other
    # type the entity itself is entity0.
    if ENTITY_TYPE != 'work':
        url_relationship_table = 'l_%s_url' % entity_type_table
        main_entity_entity_point, url_entity_point = "entity0", "entity1"
    else:
        url_relationship_table = 'l_url_%s' % entity_type_table
        main_entity_entity_point, url_entity_point = "entity1", "entity0"

    # Backslashes are doubled so that the SQL regexes receive a literal "\."
    # without relying on Python keeping an unknown escape sequence intact.
    query = """
    WITH
        entities_wo_wikidata AS (
            SELECT DISTINCT e.id AS entity_id, e.gid AS entity_gid, u.url AS wp_url, substring(u.url from '//(([a-z]|-)+)\\.') as wp_lang
            FROM {table} e
                JOIN {link_table} l ON l.{entity_col} = e.id AND l.link IN (SELECT id FROM link WHERE link_type = {wikipedia_type})
                JOIN url u ON u.id = l.{url_col} AND u.url ~ '^https?://[a-z-]+\\.wikipedia\\.org/wiki/'
            WHERE
                /* No existing Wikidata relationship for this entity */
                NOT EXISTS (SELECT 1 FROM {link_table} ol WHERE ol.{entity_col} = e.id AND ol.link IN (SELECT id FROM link WHERE link_type = {wikidata_type}))
                /* WP link should only be linked to this entity */
                AND NOT EXISTS (SELECT 1 FROM {link_table} ol WHERE ol.{url_col} = u.id AND ol.{entity_col} <> e.id)
                AND l.edits_pending = 0
        )
    SELECT e.id, e.gid, e.name, ewf.wp_url, b.processed
    FROM entities_wo_wikidata ewf
    JOIN {table} e ON ewf.entity_id = e.id
    LEFT JOIN bot_wp_wikidata_links b ON e.gid = b.gid AND b.lang = ewf.wp_lang
    ORDER BY b.processed NULLS FIRST, e.id
    LIMIT 500
    """.format(
        table=entity_type_table,
        link_table=url_relationship_table,
        entity_col=main_entity_entity_point,
        url_col=url_entity_point,
        wikipedia_type=str(WIKIPEDIA_RELATIONSHIP_TYPES[ENTITY_TYPE]),
        wikidata_type=str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE]),
    )

    seen = set()
    matched = set()
    for entity in db.execute(query):
        gid = entity['gid']
        # An entity can appear once per Wikipedia URL; stop after a match.
        if gid in matched:
            continue

        colored_out(
            bcolors.OKBLUE,
            'Looking up entity "%s" http://musicbrainz.org/%s/%s' %
            (entity['name'], ENTITY_TYPE, gid))
        out(' * wiki:', entity['wp_url'])

        page = WikiPage.fetch(entity['wp_url'], False)
        if page.wikidata_id:
            wikidata_url = 'https://www.wikidata.org/wiki/%s' % page.wikidata_id.upper()
            edit_note = 'From %s' % (entity['wp_url'], )
            colored_out(bcolors.OKGREEN, ' * found Wikidata identifier:',
                        wikidata_url)
            time.sleep(1)
            out(' * edit note:', edit_note.replace('\n', ' '))
            mb.add_url(entity_type_table, gid,
                       str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE]),
                       wikidata_url, edit_note, True)
            matched.add(gid)

        # Insert a bookkeeping row the first time this entity is handled,
        # otherwise refresh the timestamp of the existing row.
        if entity['processed'] is None and gid not in seen:
            db.execute(
                "INSERT INTO bot_wp_wikidata_links (gid, lang) VALUES (%s, %s)",
                (gid, page.lang))
        else:
            db.execute(
                "UPDATE bot_wp_wikidata_links SET processed = now() WHERE (gid, lang) = (%s, %s)",
                (gid, page.lang))
        seen.add(gid)
    stats['seen'][ENTITY_TYPE] = len(seen)
    stats['matched'][ENTITY_TYPE] = len(matched)
예제 #5
0
# Per-language patterns that capture the category name from a wiki-markup
# category link. The English pattern also accepts an optional "|..." part
# after the name, which is not captured.
category_re['en'] = re.compile(r'\[\[Category:(.+?)(?:\|.*?)?\]\]')
category_re['fr'] = re.compile(r'\[\[Cat\xe9gorie:(.+?)\]\]')

# For every release group row, look up candidate articles by name through
# `wps` (presumably a search index of Wikipedia titles -- confirm) and
# inspect each candidate whose title matches the release group name.
for rg_id, rg_gid, rg_name, ac_name, rg_sec_types, processed in db.execute(query, query_params):
    colored_out(bcolors.OKBLUE, 'Looking up release group "%s" https://musicbrainz.org/release-group/%s' % (rg_name, rg_gid))
    matches = wps.query(escape_query(rg_name), defType='dismax', qf='name', rows=100).results
    last_wp_request = time.time()
    for match in matches:
        title = match['name']
        # Keep only titles equal to the release group name after
        # mangle_name(), either as-is or with a trailing " (...)" removed.
        if mangle_name(re.sub(' \(.+\)$', '', title)) != mangle_name(rg_name) and mangle_name(title) != mangle_name(rg_name):
            continue
        # Throttle: leave at least one second between consecutive fetches.
        delay = time.time() - last_wp_request
        if delay < 1.0:
            time.sleep(1.0 - delay)
        last_wp_request = time.time()
        wikipage = WikiPage.fetch('https://%s.wikipedia.org/wiki/%s' % (wp_lang, title))
        page_orig = wikipage.text
        # Nothing to analyse when the fetched page has no text.
        if not page_orig:
            continue
        page_title = title
        colored_out(bcolors.HEADER, ' * trying article %s' % (title,))
        page = mangle_name(page_orig)

        # Skip pages that wp_is_canonical_page() rejects, printing its reason.
        is_canonical, reason = wp_is_canonical_page(title, page_orig)
        if (not is_canonical):
            out(' * %s, skipping' % reason)
            continue

        # Category names found in the raw page text for the current language.
        categories = category_re[wp_lang].findall(page_orig)
        is_album_page = False
        for category in categories:
예제 #6
0
                        qf='name',
                        rows=50).results
    # Timestamp used to space out the page fetches in the loop below.
    last_wp_request = time.time()
    for match in matches:
        title = match['name']
        # Titles ending in "album)" or "song)" are not artist articles.
        if title.endswith('album)') or title.endswith('song)'):
            continue
        # Keep only titles equal to the artist name after mangle_name(),
        # either as-is or with a trailing " (...)" removed.
        if mangle_name(re.sub(' \(.+\)$', '', title)) != mangle_name(
                artist['name']) and mangle_name(title) != mangle_name(
                    artist['name']):
            continue
        # Throttle: leave at least one second between consecutive fetches.
        delay = time.time() - last_wp_request
        if delay < 1.0:
            time.sleep(1.0 - delay)
        last_wp_request = time.time()
        wikipage = WikiPage.fetch('http://%s.wikipedia.org/wiki/%s' %
                                  (wp_lang, title))
        page_orig = wikipage.text
        # Nothing to analyse when the fetched page has no text.
        if not page_orig:
            continue
        out(' * trying article "%s"' % (title, ))
        page = mangle_name(page_orig)

        # Skip pages that wp_is_canonical_page() rejects, printing its reason.
        is_canonical, reason = wp_is_canonical_page(title, page_orig)
        if (not is_canonical):
            out(' * %s, skipping' % reason)
            continue
        # The mangled text of an album article contains 'infoboxalbum'.
        if 'infoboxalbum' in page:
            out(' * album page, skipping')
            continue
        page_title = title
예제 #7
0
def main(ENTITY_TYPE):
    """Add Wikidata URL relationships to entities that only have a Wikipedia link.

    Selects up to 250 entities of ``ENTITY_TYPE`` that have an English or
    French Wikipedia URL relationship with no pending edits, no Wikidata
    relationship, and whose Wikipedia URL is not linked to any other entity
    of that type. Each Wikipedia page is fetched and, when it exposes a
    Wikidata identifier, the corresponding Wikidata URL is submitted through
    ``mb.add_url``. Every visited entity is recorded in
    ``bot_wp_wikidata_links``.
    """
    table = ENTITY_TYPE.replace("-", "_")
    # For works the URL is entity0 of the relationship table; for every other
    # type the entity itself is entity0.
    is_work = not (ENTITY_TYPE != "work")
    fields = {
        "table": table,
        "link_table": ("l_url_%s" if is_work else "l_%s_url") % table,
        "entity_col": "entity1" if is_work else "entity0",
        "url_col": "entity0" if is_work else "entity1",
        "wikipedia_type": str(WIKIPEDIA_RELATIONSHIP_TYPES[ENTITY_TYPE]),
        "wikidata_type": str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE]),
    }

    query = """
    WITH
        entities_wo_wikidata AS (
            SELECT DISTINCT e.id AS entity_id, e.gid AS entity_gid, u.url AS wp_url
            FROM {table} e
                JOIN {link_table} l ON l.{entity_col} = e.id AND l.link IN (SELECT id FROM link WHERE link_type = {wikipedia_type})
                JOIN url u ON u.id = l.{url_col} AND u.url LIKE 'http://%%.wikipedia.org/wiki/%%' AND substring(u.url from 8 for 2) IN ('en', 'fr')
            WHERE 
                /* No existing WikiData relationship for this entity */
                NOT EXISTS (SELECT 1 FROM {link_table} ol WHERE ol.{entity_col} = e.id AND ol.link IN (SELECT id FROM link WHERE link_type = {wikidata_type}))
                /* WP link should only be linked to this entity */
                AND NOT EXISTS (SELECT 1 FROM {link_table} ol WHERE ol.{url_col} = u.id AND ol.{entity_col} <> e.id)
                AND l.edits_pending = 0
        )
    SELECT e.id, e.gid, e.name, ewf.wp_url, b.processed
    FROM entities_wo_wikidata ewf
    JOIN s_{table} e ON ewf.entity_id = e.id
    LEFT JOIN bot_wp_wikidata_links b ON e.gid = b.gid AND b.lang = substring(ewf.wp_url from 8 for 2)
    ORDER BY b.processed NULLS FIRST, e.id
    LIMIT 250
    """.format(**fields)

    visited_gids = set()
    linked_gids = set()
    for row in db.execute(query):
        gid = row["gid"]
        # An entity can appear once per Wikipedia URL; stop after a match.
        if gid in linked_gids:
            continue

        wp_url = row["wp_url"]
        banner = 'Looking up entity "%s" http://musicbrainz.org/%s/%s' % (row["name"], ENTITY_TYPE, gid)
        colored_out(bcolors.OKBLUE, banner)
        out(" * wiki:", wp_url)

        page = WikiPage.fetch(wp_url, False)
        if page.wikidata_id:
            wikidata_url = "http://www.wikidata.org/wiki/%s" % page.wikidata_id.upper()
            edit_note = "From %s" % (wp_url,)
            colored_out(bcolors.OKGREEN, " * found WikiData identifier:", wikidata_url)
            time.sleep(3)
            out(" * edit note:", edit_note.replace("\n", " "))
            relationship_type = str(WIKIDATA_RELATIONSHIP_TYPES[ENTITY_TYPE])
            mb.add_url(table, gid, relationship_type, wikidata_url, edit_note, True)
            linked_gids.add(gid)

        # Insert a bookkeeping row the first time this entity is handled,
        # otherwise refresh the timestamp of the existing row.
        if row["processed"] is None and gid not in visited_gids:
            statement = "INSERT INTO bot_wp_wikidata_links (gid, lang) VALUES (%s, %s)"
        else:
            statement = "UPDATE bot_wp_wikidata_links SET processed = now() WHERE (gid, lang) = (%s, %s)"
        db.execute(statement, (gid, page.lang))
        visited_gids.add(gid)
예제 #8
0
def main():
    """Fill in missing artist attributes from the artist's Wikipedia page.

    For each distinct artist returned by the module-level ``query`` the
    Wikipedia page is fetched and used to determine whichever of area, type,
    gender, begin date and end date the artist is still missing. When at
    least one value is found, the artist is edited through ``mb.edit_artist``
    with an edit note listing the reasons. The artist is then recorded in
    ``bot_wp_artist_data`` (inserted, or its timestamp refreshed).
    """
    # (field prefix, finder, progress message, edit-note heading) per date.
    date_steps = (
        ('begin_date', determine_begin_date, " * new begin date:", 'BEGIN DATE'),
        ('end_date', determine_end_date, " * new end date:", 'END DATE'),
    )

    visited = set()
    for artist in db.execute(query):
        if artist['id'] in visited:
            continue
        visited.add(artist['id'])

        banner = 'Looking up artist "%s" http://musicbrainz.org/artist/%s' % (artist['name'], artist['gid'])
        colored_out(bcolors.OKBLUE, banner)
        out(' * wiki:', artist['url'])

        # Work on a plain dict so that newly found values can be stored.
        artist = dict(artist)
        changed = set()
        evidence = []

        page = WikiPage.fetch(artist['url'], False)

        if not artist['area']:
            country, why = determine_country(page)
            if country:
                artist['area'] = country_ids[country]
                changed.add('area')
                evidence.append(('COUNTRY', why))

        if not artist['type']:
            kind, why = determine_type(page)
            if kind:
                artist['type'] = artist_type_ids[kind]
                changed.add('type')
                evidence.append(('TYPE', why))

        # Gender and performance-name checks only apply to artist type 1.
        is_type_one = artist['type'] == 1

        if is_type_one and not artist['gender']:
            gender, why = determine_gender(page)
            if gender:
                artist['gender'] = gender_ids[gender]
                changed.add('gender')
                evidence.append(('GENDER', why))

        is_performance_name = False
        if is_type_one and CHECK_PERFORMANCE_NAME:
            hits = db.execute(performance_name_query, artist['id']).scalar()
            is_performance_name = hits > 0
            out(" * checking for performance name", is_performance_name)

        for prefix, finder, message, heading in date_steps:
            if artist[prefix + '_year']:
                continue
            found, why = finder(artist, page, is_performance_name)
            if not found['year']:
                continue
            colored_out(bcolors.OKGREEN, message, found)
            for part in ('year', 'month', 'day'):
                artist['%s_%s' % (prefix, part)] = found[part]
            changed.add(prefix)
            evidence.append((heading, why))

        if changed:
            note_parts = ['From %s' % (artist['url'], )]
            for heading, why in evidence:
                note_parts.append('\n\n%s:\n%s' % (heading, ' '.join(why)))
            edit_note = ''.join(note_parts)
            out(' * edit note:', edit_note.replace('\n', ' '))
            time.sleep(10)
            mb.edit_artist(artist, changed, edit_note)

        # Insert a bookkeeping row for a never-processed artist, otherwise
        # refresh the timestamp of the existing row.
        if artist['processed'] is None:
            statement = "INSERT INTO bot_wp_artist_data (gid, lang) VALUES (%s, %s)"
        else:
            statement = "UPDATE bot_wp_artist_data SET processed = now() WHERE (gid, lang) = (%s, %s)"
        db.execute(statement, (artist['gid'], wp_lang))
예제 #9
0
def main():
    """Fill in missing artist attributes from the artist's Wikipedia page.

    For each distinct artist returned by the module-level ``query`` the
    Wikipedia page is fetched and used to determine whichever of country,
    type, gender, begin date and end date the artist is still missing. When
    at least one value is found, the artist is edited through
    ``mb.edit_artist`` with an edit note listing the reasons. A row is then
    inserted into ``bot_wp_artist_data`` for the artist.
    """
    # (field prefix, finder, progress message, edit-note heading) per date.
    date_steps = (
        ('begin_date', determine_begin_date, " * new begin date:", 'BEGIN DATE'),
        ('end_date', determine_end_date, " * new end date:", 'END DATE'),
    )

    visited = set()
    for artist in db.execute(query):
        if artist['id'] in visited:
            continue
        visited.add(artist['id'])

        banner = 'Looking up artist "%s" http://musicbrainz.org/artist/%s' % (artist['name'], artist['gid'])
        colored_out(bcolors.OKBLUE, banner)
        out(' * wiki:', artist['url'])

        # Work on a plain dict so that newly found values can be stored.
        artist = dict(artist)
        changed = set()
        evidence = []

        page = WikiPage.fetch(artist['url'])

        if not artist['country']:
            country, why = determine_country(page)
            if country:
                artist['country'] = country_ids[country]
                changed.add('country')
                evidence.append(('COUNTRY', why))

        if not artist['type']:
            kind, why = determine_type(page)
            if kind:
                artist['type'] = artist_type_ids[kind]
                changed.add('type')
                evidence.append(('TYPE', why))

        # Gender and performance-name checks only apply to artist type 1.
        is_type_one = artist['type'] == 1

        if is_type_one and not artist['gender']:
            gender, why = determine_gender(page)
            if gender:
                artist['gender'] = gender_ids[gender]
                changed.add('gender')
                evidence.append(('GENDER', why))

        is_performance_name = False
        if is_type_one and CHECK_PERFORMANCE_NAME:
            hits = db.execute(performance_name_query, artist['id']).scalar()
            is_performance_name = hits > 0
            out(" * checking for performance name", is_performance_name)

        for prefix, finder, message, heading in date_steps:
            if artist[prefix + '_year']:
                continue
            found, why = finder(artist, page, is_performance_name)
            if not found['year']:
                continue
            colored_out(bcolors.OKGREEN, message, found)
            for part in ('year', 'month', 'day'):
                artist['%s_%s' % (prefix, part)] = found[part]
            changed.add(prefix)
            evidence.append((heading, why))

        if changed:
            note_parts = ['From %s' % (artist['url'],)]
            for heading, why in evidence:
                note_parts.append('\n\n%s:\n%s' % (heading, ' '.join(why)))
            edit_note = ''.join(note_parts)
            out(' * edit note:', edit_note.replace('\n', ' '))
            time.sleep(10)
            mb.edit_artist(artist, changed, edit_note)

        bookkeeping_row = (artist['gid'], wp_lang)
        db.execute("INSERT INTO bot_wp_artist_data (gid, lang) VALUES (%s, %s)", bookkeeping_row)
        out()