Example #1
File: checks.py  Project: fossabot/soweego
def check_links(entity, catalog, wikidata_cache=None):
    catalog_terms = _get_vocabulary(catalog)

    # Target links
    target = gather_target_links(entity, catalog)
    # Early stop in case of no target links
    if target is None:
        return None, None, None, None

    to_deprecate = defaultdict(set)
    to_add = defaultdict(set)

    # PIDs of interest are needed later on regardless of the cache,
    # so gather them before branching
    url_pids, ext_id_pids_to_urls = gather_relevant_pids()

    if wikidata_cache is None:
        wikidata = {}

        # Wikidata links
        gather_identifiers(entity, catalog, catalog_terms['pid'], wikidata)
        gather_wikidata_links(wikidata, url_pids, ext_id_pids_to_urls)
    else:
        wikidata = wikidata_cache

    # Check
    _assess('links', wikidata, target, to_deprecate, to_add)

    # Separate external IDs from URLs
    ext_ids_to_add, urls_to_add = extract_ids_from_urls(
        to_add, ext_id_pids_to_urls)

    LOGGER.info(
        'Validation completed. %d %s IDs to be deprecated, %d external IDs to be added, %d URL statements to be added',
        len(to_deprecate), catalog, len(ext_ids_to_add), len(urls_to_add))

    return to_deprecate, ext_ids_to_add, urls_to_add, wikidata
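
A minimal usage sketch for check_links, assuming the function and its helpers are imported from the checks.py module above (the exact import path is not shown in the listing); the 'musician' and 'discogs' values are taken from the docstrings of the later examples.

# Hypothetical usage; check_links is assumed to be imported from checks.py.
# First run gathers links from Wikidata and returns them as a reusable cache.
to_deprecate, ext_ids, urls, wd_cache = check_links('musician', 'discogs')

# A later run can skip the Wikidata gathering step by passing the cache back in.
to_deprecate, ext_ids, urls, _ = check_links(
    'musician', 'discogs', wikidata_cache=wd_cache)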
Example #2
def links(catalog: str,
          entity: str,
          wd_cache=None) -> Tuple[DefaultDict, List, List, Dict]:
    """Validate identifiers against available links.

    Also generate statements based on additional links
    found in the given catalog.
    They can be used to enrich Wikidata items.

    **How it works:**

    1. gather links from the given catalog
    2. gather links from relevant Wikidata items
    3. look for shared links between pairs of Wikidata and catalog items:

      - when the pair does not share any link,
        the catalog identifier should be marked with a deprecated rank
      - when the catalog item has more links than the Wikidata one,
        they should be added to the latter

    4. try to extract third-party identifiers from extra links

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata
      in a previous run
    :return: 4 objects

      1. ``dict`` of identifiers that should be deprecated
      2. ``list`` of third-party identifiers that should be added
      3. ``list`` of URLs that should be added
      4. ``dict`` of links gathered from Wikidata

    """
    # Target catalog side first:
    # enable early return in case of no target links
    target_links = data_gathering.gather_target_links(entity, catalog)
    if target_links is None:
        return None, None, None, None

    to_be_deprecated, to_be_added = defaultdict(set), defaultdict(set)

    # Wikidata side
    url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids()
    if wd_cache is None:
        wd_links = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_links,
        )
        data_gathering.gather_wikidata_links(wd_links, url_pids,
                                             ext_id_pids_to_urls)
    else:
        wd_links = wd_cache

    # Validation
    _validate(keys.LINKS, wd_links, target_links, to_be_deprecated,
              to_be_added)

    # Separate external IDs from URLs
    ext_ids_to_be_added, urls_to_be_added = data_gathering.extract_ids_from_urls(
        to_be_added, ext_id_pids_to_urls)

    LOGGER.info(
        'Validation completed. Target: %s %s. '
        'IDs to be deprecated: %d. '
        'Third-party IDs to be added: %d. '
        'URL statements to be added: %d',
        catalog,
        entity,
        len(to_be_deprecated),
        len(ext_ids_to_be_added),
        len(urls_to_be_added),
    )

    return to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, wd_links
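
A short, hedged usage sketch of links() as documented above; the catalog and entity literals come from the docstring, the import is assumed, and the guard mirrors the early return that fires when the target catalog exposes no links.

# Hypothetical usage; links is assumed to be imported from its soweego module.
to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, wd_links = links(
    'discogs', 'musician')

if to_be_deprecated is None:
    # The target catalog has no links for this entity: nothing to validate
    pass
else:
    # wd_links can be fed back as wd_cache in a later run
    # to skip the Wikidata gathering step
    links('discogs', 'musician', wd_cache=wd_links)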
Example #3
def build_wikidata(goal: str, catalog: str, entity: str,
                   dir_io: str) -> JsonReader:
    """Build a Wikidata dataset for training or classification purposes:
    workflow step 1.

    Data is gathered from the
    `SPARQL endpoint <https://query.wikidata.org/>`_ and the
    `Web API <https://www.wikidata.org/w/api.php>`_.

    **How it works:**

    1. gather relevant Wikidata items that *hold* (for *training*)
       or *lack* (for *classification*) identifiers of the given catalog
    2. gather relevant item data
    3. dump the dataset to a gzipped `JSON Lines <http://jsonlines.org/>`_ file
    4. read the dataset into a generator of :class:`pandas.DataFrame` chunks
       for memory-efficient processing

    :param goal: ``{'training', 'classification'}``.
      Whether to build a dataset for training or classification
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param dir_io: input/output directory where working files
      will be read/written
    :return: the generator yielding :class:`pandas.DataFrame` chunks
    """
    qids_and_tids, wd_io_path = _handle_goal(goal, catalog, entity, dir_io)
    catalog_pid = target_database.get_catalog_pid(catalog, entity)

    if not os.path.isfile(wd_io_path):
        LOGGER.info(
            "Building Wikidata %s set for %s %s, output file '%s' ...",
            goal,
            catalog,
            entity,
            wd_io_path,
        )

        # Make working folders
        os.makedirs(os.path.dirname(wd_io_path), exist_ok=True)

        # 1. Gather Wikidata items
        if goal == 'training':
            # WITH target IDs
            data_gathering.gather_target_ids(entity, catalog, catalog_pid,
                                             qids_and_tids)
            qids = qids_and_tids.keys()

        elif goal == 'classification':
            # WITHOUT target IDs
            qids = data_gathering.gather_qids(entity, catalog, catalog_pid)

        # 2. Collect relevant data, and 3. dump to gzipped JSON Lines
        url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids()

        with gzip.open(wd_io_path, 'wt') as wd_io:
            api_requests.get_data_for_linker(
                catalog,
                entity,
                qids,
                url_pids,
                ext_id_pids_to_urls,
                qids_and_tids,
                wd_io,
            )

    # Cached dataset, for development purposes
    else:
        LOGGER.info("Will reuse existing Wikidata %s set: '%s'", goal,
                    wd_io_path)
        if goal == 'training':
            _reconstruct_qids_and_tids(wd_io_path, qids_and_tids)

    LOGGER.info('Wikidata %s set built', goal)

    return pd.read_json(wd_io_path, lines=True, chunksize=1000)
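
A hedged sketch of how the returned chunk generator might be consumed; the 'training', 'discogs', and 'musician' values come from the docstring, while the 'resources' I/O directory is a placeholder, not taken from the listing.

# Hypothetical usage; build_wikidata is assumed to be imported from its module.
wd_chunks = build_wikidata('training', 'discogs', 'musician', 'resources')

# The result is a generator of pandas DataFrames, 1,000 rows each,
# so the gzipped JSON Lines dump never has to fit in memory at once.
for chunk in wd_chunks:
    print(chunk.shape)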
Example #4
File: checks.py  Project: Wikidata/soweego
def links(
    catalog: str,
    entity: str,
    url_blacklist=False,
    wd_cache=None
) -> Optional[Tuple[defaultdict, list, list, list, list, list, dict]]:
    """Validate identifiers against available links.

    Also generate statements based on additional links
    found in the target catalog.
    They can be used to enrich Wikidata items.

    **How it works:**

    1. gather links from the target catalog
    2. gather links from relevant Wikidata items
    3. look for shared links between pairs of Wikidata and catalog items:

      - when the pair does not share any link,
        the catalog identifier should be marked with a deprecated rank
      - when the catalog item has more links than the Wikidata one,
        they should be added to the latter

    4. try to extract third-party identifiers from extra links

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param url_blacklist: (optional) whether to apply a blacklist
      of URL domains. Default: ``False``
    :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata
      in a previous run. Default: ``None``
    :return: 7 objects

      1. ``dict`` of identifiers that should be deprecated
      2. ``list`` of third-party identifiers that should be added
      3. ``list`` of URLs that should be added
      4. ``list`` of third-party identifiers that should be referenced
      5. ``list`` of URLs that should be referenced
      6. ``list`` of URLs found in Wikidata but not in the target catalog
      7. ``dict`` of links gathered from Wikidata

      or ``None`` if the target catalog has no links.

    """
    # Target catalog side first:
    # enable early return in case of no target links
    target_links = data_gathering.gather_target_links(entity, catalog)
    if target_links is None:
        return None

    deprecate, add = defaultdict(set), defaultdict(set)
    reference, wd_only = defaultdict(set), defaultdict(set)

    # Wikidata side
    url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids()
    if wd_cache is None:
        wd_links = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_links,
        )
        data_gathering.gather_wikidata_links(wd_links, url_pids,
                                             ext_id_pids_to_urls)
    else:
        wd_links = wd_cache

    # Validation
    _validate(keys.LINKS, wd_links, target_links, deprecate, add, reference,
              wd_only)

    # URLs to be added:
    # 1. Separate external IDs from URLs
    add_ext_ids, add_urls = data_gathering.extract_ids_from_urls(
        add, ext_id_pids_to_urls)
    # 2. Apply URL blacklist
    if url_blacklist:
        add_urls = _apply_url_blacklist(add_urls)

    # URLs to be referenced: separate external IDs from URLs
    ref_ext_ids, ref_urls = data_gathering.extract_ids_from_urls(
        reference, ext_id_pids_to_urls)

    # Wikidata-only URLs: convert into a list of statements
    # with complete Wikidata item URLs
    wd_only_urls = []
    for (qid, tid), urls in wd_only.items():
        for url in urls:
            wd_only_urls.append((tid, url, QID_PREFIX + qid))

    LOGGER.info(
        'Validation completed. Target: %s %s. '
        'IDs to be deprecated: %d. '
        'Third-party IDs to be added: %d. '
        'URL statements to be added: %d. '
        'Third-party IDs to be referenced: %d. '
        'URL statements to be referenced: %d. '
        'URLs in Wikidata but not in the target: %d',
        catalog,
        entity,
        len(deprecate),
        len(add_ext_ids),
        len(add_urls),
        len(ref_ext_ids),
        len(ref_urls),
        len(wd_only_urls),
    )

    return (
        deprecate,
        add_ext_ids,
        add_urls,
        ref_ext_ids,
        ref_urls,
        wd_only_urls,
        wd_links,
    )
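
A hedged usage sketch of this extended links() variant; it relies only on the signature and docstring above, the 'musicbrainz' and 'band' values come from the documented parameter choices, and the unpacking order mirrors the seven documented return objects.

# Hypothetical usage; links is assumed to be imported from checks.py.
result = links('musicbrainz', 'band', url_blacklist=True)

if result is None:
    # The target catalog has no links: nothing to deprecate, add, or reference
    pass
else:
    (deprecate, add_ext_ids, add_urls,
     ref_ext_ids, ref_urls, wd_only_urls, wd_links) = result
    # Each wd_only_urls entry is a (target ID, URL, full Wikidata item URL) triple;
    # wd_links can be passed back as wd_cache in a later run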