Example #1
def _gather_wd_data(catalog, entity, works, people):
    """Fill the given ``works`` and ``people`` dictionaries with
    identifiers gathered from Wikidata items relevant to the catalog."""
    # Works IDs
    data_gathering.gather_target_ids(
        target_database.get_work_type(catalog, entity),
        catalog,
        target_database.get_work_pid(catalog),
        works,
    )

    # People IDs
    data_gathering.gather_target_ids(
        entity,
        catalog,
        target_database.get_person_pid(catalog),
        people,
    )
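
A minimal usage sketch (hypothetical values; as in the public functions below, the dictionaries are filled in place by data_gathering.gather_target_ids):

# Hypothetical call: both dicts are populated in place
works, people = {}, {}
_gather_wd_data('discogs', 'musician', works, people)
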
Example #2
def bio(catalog: str,
        entity: str,
        wd_cache=None) -> Tuple[DefaultDict, Iterator, Dict]:
    """Validate identifiers against available biographical data.

    Look for:

    - birth and death dates
    - birth and death places
    - gender

    Also generate statements based on additional data
    found in the given catalog.
    They can be used to enrich Wikidata items.

    **How it works:**

    1. gather data from the given catalog
    2. gather data from relevant Wikidata items
    3. look for shared data between pairs of Wikidata and catalog items:

      - when the pair does not share any data,
        the catalog identifier should be marked with a deprecated rank
      - when the catalog item has more data than the Wikidata one,
        the extra data should be added to the latter

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param wd_cache: (optional) a ``dict`` of biographical data
      gathered from Wikidata in a previous run
    :return: 3 objects

      1. ``dict`` of identifiers that should be deprecated
      2. ``generator`` of statements that should be added
      3. ``dict`` of biographical data gathered from Wikidata

      or ``(None, None, None)`` if the target catalog
      has no biographical data.

    """
    # Target catalog side first:
    # enable early return in case of no target data
    target_bio = data_gathering.gather_target_biodata(entity, catalog)
    if target_bio is None:
        return None, None, None

    to_be_deprecated, to_be_added = defaultdict(set), defaultdict(set)

    # Wikidata side
    if wd_cache is None:
        wd_bio = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_bio,
        )
        data_gathering.gather_wikidata_biodata(wd_bio)
    else:
        wd_bio = wd_cache

    # Validation
    _validate(keys.BIODATA, wd_bio, target_bio, to_be_deprecated, to_be_added)

    return to_be_deprecated, _bio_to_be_added_generator(to_be_added), wd_bio
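
A short usage sketch with hypothetical values from the documented sets; the third return value can be fed back as wd_cache to skip the Wikidata gathering step:

deprecate, statements, wd_cache = bio('discogs', 'musician')
if deprecate is not None:
    for statement in statements:  # statements that could enrich Wikidata
        print(statement)
# A later run reuses the cached Wikidata biographical data
deprecate, statements, _ = bio('discogs', 'musician', wd_cache=wd_cache)
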
Example #3
def links(catalog: str,
          entity: str,
          wd_cache=None) -> Tuple[DefaultDict, List, List, Dict]:
    """Validate identifiers against available links.

    Also generate statements based on additional links
    found in the given catalog.
    They can be used to enrich Wikidata items.

    **How it works:**

    1. gather links from the given catalog
    2. gather links from relevant Wikidata items
    3. look for shared links between pairs of Wikidata and catalog items:

      - when the pair does not share any link,
        the catalog identifier should be marked with a deprecated rank
      - when the catalog item has more links than the Wikidata one,
        they should be added to the latter

    4. try to extract third-party identifiers from extra links

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata
      in a previous run
    :return: 4 objects

      1. ``dict`` of identifiers that should be deprecated
      2. ``list`` of third-party identifiers that should be added
      3. ``list`` of URLs that should be added
      4. ``dict`` of links gathered from Wikidata

      or ``(None, None, None, None)`` if the target catalog has no links.

    """
    # Target catalog side first:
    # enable early return in case of no target links
    target_links = data_gathering.gather_target_links(entity, catalog)
    if target_links is None:
        return None, None, None, None

    to_be_deprecated, to_be_added = defaultdict(set), defaultdict(set)

    # Wikidata side
    url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids()
    if wd_cache is None:
        wd_links = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_links,
        )
        data_gathering.gather_wikidata_links(wd_links, url_pids,
                                             ext_id_pids_to_urls)
    else:
        wd_links = wd_cache

    # Validation
    _validate(keys.LINKS, wd_links, target_links, to_be_deprecated,
              to_be_added)

    # Separate external IDs from URLs
    ext_ids_to_be_added, urls_to_be_added = data_gathering.extract_ids_from_urls(
        to_be_added, ext_id_pids_to_urls)

    LOGGER.info(
        'Validation completed. Target: %s %s. '
        'IDs to be deprecated: %d. '
        'Third-party IDs to be added: %d. '
        'URL statements to be added: %d',
        catalog,
        entity,
        len(to_be_deprecated),
        len(ext_ids_to_be_added),
        len(urls_to_be_added),
    )

    return to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, wd_links
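
Usage sketch mirroring the one above (hypothetical values); the last return value is the reusable Wikidata link cache:

deprecate, ext_ids, urls, wd_cache = links('imdb', 'director')
if deprecate is not None:
    print(len(deprecate), 'IDs to deprecate')
    print(len(ext_ids), 'third-party IDs to add')
    print(len(urls), 'URL statements to add')
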
Example #4
def dead_ids(catalog: str,
             entity: str,
             wd_cache=None) -> Tuple[DefaultDict, Dict]:
    """Look for dead identifiers in Wikidata.
    An identifier is dead if it does not exist in the given catalog
    when this function is executed.

    Dead identifiers should be marked with a deprecated rank in Wikidata.

    **How it works:**

    1. gather identifiers of the given catalog from relevant Wikidata items
    2. look them up in the given catalog
    3. if an identifier is not in the given catalog anymore,
       it should be deprecated

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param wd_cache: (optional) a ``dict`` of identifiers gathered from Wikidata
      in a previous run
    :return: a pair of ``dict`` objects: dead identifiers,
      as a ``{target_ID: {QIDs}}`` mapping,
      and identifiers gathered from Wikidata
    """
    dead = defaultdict(set)
    db_entity = target_database.get_main_entity(catalog, entity)

    # Wikidata side
    if wd_cache is None:
        wd_ids = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_ids,
        )
    else:
        wd_ids = wd_cache

    # Target catalog side
    session = DBManager.connect_to_db()

    try:
        for qid in wd_ids:
            for tid in wd_ids[qid][keys.TID]:
                existing = (
                    session.query(db_entity.catalog_id)
                    .filter_by(catalog_id=tid)
                    .count()
                )
                if existing == 0:
                    LOGGER.debug('%s %s identifier %s is dead', qid, catalog,
                                 tid)
                    dead[tid].add(qid)
        session.commit()
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed query of target catalog identifiers due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.validator DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
    finally:
        session.close()

    LOGGER.info(
        'Check completed. Target: %s %s. Total dead identifiers: %d',
        catalog,
        entity,
        len(dead),
    )
    return dead, wd_ids
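
Usage sketch (hypothetical values); dead maps each dead target identifier to the set of QIDs that still hold it:

dead, wd_cache = dead_ids('musicbrainz', 'band')
for tid, qids in dead.items():
    print(tid, '->', sorted(qids))
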
Example #5
def build_wikidata(goal: str, catalog: str, entity: str,
                   dir_io: str) -> JsonReader:
    """Build a Wikidata dataset for training or classification purposes:
    workflow step 1.

    Data is gathered from the
    `SPARQL endpoint <https://query.wikidata.org/>`_ and the
    `Web API <https://www.wikidata.org/w/api.php>`_.

    **How it works:**

    1. gather relevant Wikidata items that *hold* (for *training*)
       or *lack* (for *classification*) identifiers of the given catalog
    2. gather relevant item data
    3. dump the dataset to a gzipped `JSON Lines <http://jsonlines.org/>`_ file
    4. read the dataset into a generator of :class:`pandas.DataFrame` chunks
       for memory-efficient processing

    :param goal: ``{'training', 'classification'}``.
      Whether to build a dataset for training or classification
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param dir_io: input/output directory where working files
      will be read/written
    :return: the generator yielding :class:`pandas.DataFrame` chunks
    """
    qids_and_tids, wd_io_path = _handle_goal(goal, catalog, entity, dir_io)
    catalog_pid = target_database.get_catalog_pid(catalog, entity)

    if not os.path.isfile(wd_io_path):
        LOGGER.info(
            "Building Wikidata %s set for %s %s, output file '%s' ...",
            goal,
            catalog,
            entity,
            wd_io_path,
        )

        # Make working folders
        os.makedirs(os.path.dirname(wd_io_path), exist_ok=True)

        # 1. Gather Wikidata items
        if goal == 'training':
            # WITH target IDs
            data_gathering.gather_target_ids(entity, catalog, catalog_pid,
                                             qids_and_tids)
            qids = qids_and_tids.keys()

        elif goal == 'classification':
            # WITHOUT target IDs
            qids = data_gathering.gather_qids(entity, catalog, catalog_pid)

        # 2. Collect relevant data, and 3. dump to gzipped JSON Lines
        url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids()

        with gzip.open(wd_io_path, 'wt') as wd_io:
            api_requests.get_data_for_linker(
                catalog,
                entity,
                qids,
                url_pids,
                ext_id_pids_to_urls,
                qids_and_tids,
                wd_io,
            )

    # Cached dataset, for development purposes
    else:
        LOGGER.info("Will reuse existing Wikidata %s set: '%s'", goal,
                    wd_io_path)
        if goal == 'training':
            _reconstruct_qids_and_tids(wd_io_path, qids_and_tids)

    LOGGER.info('Wikidata %s set built', goal)

    return pd.read_json(wd_io_path, lines=True, chunksize=1000)
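
Usage sketch (hypothetical I/O directory); the returned generator yields pandas.DataFrame chunks of up to 1,000 lines each, so the dataset never has to fit in memory at once:

for chunk in build_wikidata('training', 'discogs', 'musician', '/tmp/soweego'):
    print(chunk.shape)
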
Example #6
def links(
    catalog: str,
    entity: str,
    url_blacklist=False,
    wd_cache=None
) -> Optional[Tuple[defaultdict, list, list, list, list, list, dict]]:
    """Validate identifiers against available links.

    Also generate statements based on additional links
    found in the target catalog.
    They can be used to enrich Wikidata items.

    **How it works:**

    1. gather links from the target catalog
    2. gather links from relevant Wikidata items
    3. look for shared links between pairs of Wikidata and catalog items:

      - when the pair does not share any link,
        the catalog identifier should be marked with a deprecated rank
      - when the catalog item has more links than the Wikidata one,
        they should be added to the latter

    4. try to extract third-party identifiers from extra links

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param url_blacklist: (optional) whether to apply a blacklist
      of URL domains. Default: ``False``
    :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata
      in a previous run. Default: ``None``
    :return: 7 objects

      1. ``dict`` of identifiers that should be deprecated
      2. ``list`` of third-party identifiers that should be added
      3. ``list`` of URLs that should be added
      4. ``list`` of third-party identifiers that should be referenced
      5. ``list`` of URLs that should be referenced
      6. ``list`` of URLs found in Wikidata but not in the target catalog
      7. ``dict`` of links gathered from Wikidata

      or ``None`` if the target catalog has no links.

    """
    # Target catalog side first:
    # enable early return in case of no target links
    target_links = data_gathering.gather_target_links(entity, catalog)
    if target_links is None:
        return None

    deprecate, add = defaultdict(set), defaultdict(set)
    reference, wd_only = defaultdict(set), defaultdict(set)

    # Wikidata side
    url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids()
    if wd_cache is None:
        wd_links = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_links,
        )
        data_gathering.gather_wikidata_links(wd_links, url_pids,
                                             ext_id_pids_to_urls)
    else:
        wd_links = wd_cache

    # Validation
    _validate(keys.LINKS, wd_links, target_links, deprecate, add, reference,
              wd_only)

    # URLs to be added:
    # 1. Separate external IDs from URLs
    add_ext_ids, add_urls = data_gathering.extract_ids_from_urls(
        add, ext_id_pids_to_urls)
    # 2. Apply URL blacklist
    if url_blacklist:
        add_urls = _apply_url_blacklist(add_urls)

    # URLs to be referenced: separate external IDs from URLs
    ref_ext_ids, ref_urls = data_gathering.extract_ids_from_urls(
        reference, ext_id_pids_to_urls)

    # Wikidata-only URLs: convert into a list of statements
    # with complete Wikidata item URLs
    wd_only_urls = []
    for (qid, tid), urls in wd_only.items():
        for url in urls:
            wd_only_urls.append((tid, url, QID_PREFIX + qid))

    LOGGER.info(
        'Validation completed. Target: %s %s. '
        'IDs to be deprecated: %d. '
        'Third-party IDs to be added: %d. '
        'URL statements to be added: %d. '
        'Third-party IDs to be referenced: %d. '
        'URL statements to be referenced: %d. '
        'URLs in Wikidata but not in the target: %d',
        catalog,
        entity,
        len(deprecate),
        len(add_ext_ids),
        len(add_urls),
        len(ref_ext_ids),
        len(ref_urls),
        len(wd_only_urls),
    )

    return (
        deprecate,
        add_ext_ids,
        add_urls,
        ref_ext_ids,
        ref_urls,
        wd_only_urls,
        wd_links,
    )
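
Usage sketch (hypothetical values) unpacking the 7-tuple; None signals a catalog with no links at all:

result = links('discogs', 'band', url_blacklist=True)
if result is not None:
    (deprecate, add_ext_ids, add_urls,
     ref_ext_ids, ref_urls, wd_only_urls, wd_cache) = result
    print(len(wd_only_urls), 'URL statements found only in Wikidata')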