Example #1
def baseline(source, target, target_type, strategy, output_dir):
    """Rule-based matching strategies.

    SOURCE must be a {string: identifier} JSON file.

    NOTICE: not all entity types are available for all targets.

    Available strategies are:
    'perfect' = perfect strings;
    'links' = similar links;
    'names' = similar names.

    Run all of them by default.
    """

    # TODO source should be a stream from wikidata
    source_dataset = json.load(source)
    LOGGER.info("Loaded source dataset '%s'", source.name)
    target_entity = target_database.get_entity(target, target_type)
    target_link_entity = target_database.get_link_entity(target, target_type)
    if strategy == 'perfect':
        _perfect_name_wrapper(source_dataset, target_entity, output_dir)
    elif strategy == 'links':
        _similar_links_wrapper(source_dataset, target_link_entity, output_dir)
    elif strategy == 'names':
        _similar_names_wrapper(source_dataset, target_entity, output_dir)
    elif strategy == 'edit_distance':
        # TODO create a command only for this matching technique to expose the edit distance function too
        edit_distance_match(source_dataset, target_entity, 'jw', 0)
    elif strategy == 'all':
        LOGGER.info('Will run all the baseline strategies')
        _perfect_name_wrapper(source_dataset, target_entity, output_dir)
        _similar_names_wrapper(source_dataset, target_entity, output_dir)
        _similar_links_wrapper(source_dataset, target_link_entity, output_dir)
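Example #1 expects SOURCE to be a {string: identifier} JSON file. As a rough illustration of that shape (the names and identifiers below are placeholders, not real Wikidata items), loading such a file yields a plain dict that the matching strategies iterate over:

import io
import json

# Placeholder SOURCE content: a JSON object mapping strings to identifiers,
# as required by the baseline docstring. io.StringIO stands in for an open file.
source = io.StringIO('{"john doe": "Q0000001", "jane roe": "Q0000002"}')

source_dataset = json.load(source)
for name, identifier in source_dataset.items():
    print(name, '->', identifier)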
Example #2
def extract_cli(catalog, entity, upload, sandbox, dir_io):
    """Extract Wikidata links from a target catalog dump."""
    db_entity = target_database.get_link_entity(catalog, entity)

    if db_entity is None:
        LOGGER.info(
            'No links available for %s %s. Stopping extraction here',
            catalog,
            entity,
        )
        sys.exit(1)

    result_path = os.path.join(
        dir_io, constants.EXTRACTED_LINKS.format(catalog, entity))
    os.makedirs(os.path.dirname(result_path), exist_ok=True)

    LOGGER.info(
        'Starting extraction of Wikidata links available in %s %s ...',
        catalog,
        entity,
    )

    _handle_result(
        _extract_existing_links(
            db_entity, target_database.get_catalog_pid(catalog, entity)),
        'Wikidata links',
        catalog,
        result_path,
        upload,
        sandbox,
    )
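The early exit in Example #2 relies on target_database.get_link_entity returning None when a catalog/entity pair has no links table. A minimal sketch of that guard, using a hypothetical stand-in for the real database lookup:

import sys

def get_link_entity(catalog, entity):
    # Hypothetical stand-in for target_database.get_link_entity:
    # the real call returns the ORM class mapping the links table,
    # or None when the catalog/entity pair has no such table.
    return None

db_entity = get_link_entity('imdb', 'musical_work')
if db_entity is None:
    print('No links available for imdb musical_work. Stopping extraction here')
    sys.exit(1)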
Example #3
def build_target(
    goal: str, catalog: str, entity: str, identifiers: Set[str]
) -> Iterator[pd.DataFrame]:
    """Build a target catalog dataset for training or classification purposes:
    workflow step 1.

    Data is gathered by querying the ``s51434__mixnmatch_large_catalogs_p``
    database. This is where the :mod:`importer` inserts processed catalog dumps.

    The database is located in
    `ToolsDB <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#User_databases>`_
    under the Wikimedia
    `Toolforge <https://wikitech.wikimedia.org/wiki/Portal:Toolforge>`_ infrastructure.
    See `how to connect <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#Connecting_to_the_database_replicas>`_.

    :param goal: ``{'training', 'classification'}``.
      Whether to build a dataset for training or classification
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param identifiers: a set of catalog IDs to gather data for
    :return: the generator yielding :class:`pandas.DataFrame` chunks
    """
    utils.check_goal_value(goal)

    LOGGER.info('Building target %s set for %s %s ...', goal, catalog, entity)

    # Target catalog ORM entities/DB tables
    base, link, nlp = (
        target_database.get_main_entity(catalog, entity),
        target_database.get_link_entity(catalog, entity),
        target_database.get_nlp_entity(catalog, entity),
    )
    tables = [table for table in (base, link, nlp) if table]

    # Initial query with all non-null tables
    query = Query(tables)
    # Remove `base` to avoid outer join with itself
    tables.remove(base)
    # Outer joins
    for table in tables:
        query = query.outerjoin(table, base.catalog_id == table.catalog_id)
    # Condition
    query = query.filter(base.catalog_id.in_(identifiers)).enable_eagerloads(
        False
    )

    sql = query.statement
    LOGGER.debug('SQL query to be fired: %s', sql)

    # Avoid loading query result in memory
    db_engine = DBManager().get_engine().execution_options(stream_results=True)

    return read_sql(sql, db_engine, chunksize=1000)
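read_sql with chunksize returns an iterator of DataFrames, so build_target never loads the whole result set into memory. A self-contained sketch of consuming such a chunked result, with an in-memory SQLite database standing in for ToolsDB:

import pandas as pd
from sqlalchemy import create_engine

# Toy table standing in for a target catalog dump.
engine = create_engine('sqlite://')
pd.DataFrame(
    {'catalog_id': ['1', '2', '3', '4'], 'name': ['a', 'b', 'c', 'd']}
).to_sql('person', engine, index=False)

# chunksize makes read_sql return an iterator of DataFrames ...
chunks = pd.read_sql('SELECT * FROM person', engine, chunksize=2)
# ... which can be processed chunk by chunk, or concatenated when small enough.
dataset = pd.concat(chunks, ignore_index=True)
print(dataset.shape)  # (4, 2)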
Example #4
def check_links_cli(catalog: str):
    """
    Check for rotten URLs of an imported catalog.

    :param catalog: one of the keys of constants.TARGET_CATALOGS
    """
    for entity_type in target_database.supported_entities_for_target(catalog):

        LOGGER.info("Validating %s %s links...", catalog, entity_type)
        entity = target_database.get_link_entity(catalog, entity_type)
        if not entity:
            LOGGER.info(
                "%s %s does not have a links table. Skipping...",
                catalog,
                entity_type,
            )
            continue

        session = DBManager.connect_to_db()
        total = session.query(entity).count()
        removed = 0

        with Pool() as pool:
            # Validate each link
            for resolved, res_entity in tqdm(
                    pool.imap_unordered(_resolve_url, session.query(entity)),
                    total=total,
            ):
                if not resolved:
                    session_delete = DBManager.connect_to_db()
                    # Drop the entity whose URL did not resolve
                    session_delete.delete(res_entity)
                    try:
                        session_delete.commit()
                        removed += 1
                    except:
                        session_delete.rollback()
                        raise
                    finally:
                        session_delete.close()

        session.close()
        LOGGER.info("Removed %s/%s from %s %s", removed, total, catalog,
                    entity_type)
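Examples #4 and #6 share the same parallel validation pattern: Pool.imap_unordered streams results back as soon as any worker finishes, and tqdm needs an explicit total because the resulting iterator has no length. A generic, runnable sketch of that pattern, with _check standing in for _resolve_url:

from multiprocessing import Pool

from tqdm import tqdm

def _check(item):
    # Hypothetical stand-in for _resolve_url: returns (is_valid, item)
    return item % 2 == 0, item

if __name__ == '__main__':
    items = list(range(100))
    broken = []
    with Pool() as pool:
        for ok, item in tqdm(pool.imap_unordered(_check, items), total=len(items)):
            if not ok:
                broken.append(item)
    print(len(broken))  # 50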
Example #5
def gather_target_links(entity, catalog):
    """Yield (catalog ID, URL) pairs from the given catalog's links table."""
    LOGGER.info('Gathering %s %s links ...', catalog, entity)
    link_entity = target_database.get_link_entity(catalog, entity)

    # Early return when the links table doesn't exist
    if link_entity is None:
        LOGGER.warning(
            'No links table available in the database for %s %s. '
            'Stopping validation here',
            catalog,
            entity,
        )
        return None

    session = DBManager.connect_to_db()
    result = None
    try:
        query = session.query(link_entity.catalog_id, link_entity.url)
        count = query.count()
        # Early return when no links
        if count == 0:
            LOGGER.warning(
                'No links available for %s %s. Stopping validation here',
                catalog,
                entity,
            )
            return None
        LOGGER.info('Got %d links from %s %s', count, catalog, entity)
        result = query.all()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if result is None:
        return None
    for row in result:
        yield row.catalog_id, row.url
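Note that gather_target_links contains yield, so it is a generator function: the early `return None` statements do not hand None back to the caller, they simply produce an empty generator. A minimal sketch of that Python behaviour:

def numbers(available):
    # Like gather_target_links, this is a generator function because it
    # contains `yield`; an early `return None` just ends iteration.
    if not available:
        return None
    for n in (1, 2, 3):
        yield n

print(list(numbers(False)))  # [] -- an empty generator, not None
print(list(numbers(True)))   # [1, 2, 3]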
Example #6
def check_urls_cli(catalog, drop, dir_io):
    """Check for rotten URLs of an imported catalog.

    For every catalog entity, dump rotten URLs to a file.
    CSV format: URL,catalog_ID

    Use '-d' to drop rotten URLs from the DB on the fly.
    """
    for entity in target_database.supported_entities_for_target(catalog):
        out_path = os.path.join(
            dir_io, ROTTEN_URLS_FNAME.format(catalog=catalog, entity=entity)
        )

        LOGGER.info('Starting check of %s %s URLs ...', catalog, entity)
        link_entity = target_database.get_link_entity(catalog, entity)
        if not link_entity:
            LOGGER.info(
                '%s %s does not have a links table. Skipping ...',
                catalog,
                entity,
            )
            continue

        query_session = DBManager.connect_to_db()
        total = query_session.query(link_entity).count()

        rotten = 0
        if drop:
            removed = 0

        # Parallel operation
        with Pool() as pool, open(out_path, 'w', buffering=1) as fout:
            writer = csv.writer(fout)
            try:
                # Resolve every URL
                for resolved, result in tqdm(
                    pool.imap_unordered(_resolve, query_session.query(link_entity)),
                    total=total,
                ):
                    if not resolved:
                        # Dump
                        writer.writerow((result.url, result.catalog_id))
                        rotten += 1

                        # Drop from DB
                        if drop:
                            delete_session = DBManager.connect_to_db()
                            delete_session.delete(result)
                            try:
                                delete_session.commit()
                                removed += 1
                            except SQLAlchemyError as error:
                                LOGGER.error(
                                    'Failed deletion of %s: %s',
                                    result,
                                    error.__class__.__name__,
                                )
                                LOGGER.debug(error)
                                delete_session.rollback()
                            finally:
                                delete_session.close()
            except SQLAlchemyError as error:
                LOGGER.error(
                    '%s while querying %s %s URLs',
                    error.__class__.__name__,
                    catalog,
                    entity,
                )
                LOGGER.debug(error)
                query_session.rollback()
            finally:
                query_session.close()

        LOGGER.debug('Cache information: %s', url_utils.resolve.cache_info())
        LOGGER.info(
            "Total %s %s rotten URLs dumped to '%s': %d / %d",
            catalog,
            entity,
            out_path,
            rotten,
            total,
        )

        if drop:
            LOGGER.info(
                'Total %s %s rotten URLs dropped from the DB: %d / %d',
                catalog,
                entity,
                removed,
                rotten,
            )
Пример #7
0
def _run(catalog, entity, rule, check_dates, upload, sandbox, dir_io):
    wd_io_path = os.path.join(
        dir_io, constants.WD_CLASSIFICATION_SET.format(catalog, entity))
    base_entity = target_database.get_main_entity(catalog, entity)
    link_entity = target_database.get_link_entity(catalog, entity)

    if rule == 'links' and link_entity is None:
        LOGGER.warning(
            "No links available for %s %s. Stopping baseline here ...",
            catalog,
            entity,
        )
        return

    pid = target_database.get_catalog_pid(catalog, entity)

    with gzip.open(wd_io_path, 'rt') as wd_io:
        if rule in ('perfect', 'all'):
            wd_io.seek(0)

            LOGGER.info('Starting perfect names linker ...')

            result = _perfect_names_linker(wd_io, base_entity, pid,
                                           check_dates)

            perfect_path = os.path.join(
                dir_io, constants.BASELINE_PERFECT.format(catalog, entity))
            os.makedirs(os.path.dirname(perfect_path), exist_ok=True)
            _handle_result(result, rule, catalog, perfect_path, upload,
                           sandbox)

        if rule == 'all' and link_entity is None:
            LOGGER.warning(
                "No links available for %s %s. Won't run the 'links' rule ...",
                catalog,
                entity,
            )

        if rule in ('links', 'all') and link_entity is not None:
            wd_io.seek(0)

            LOGGER.info('Starting similar link tokens linker ...')

            result = _similar_tokens_linker(
                wd_io,
                link_entity,
                (keys.URL, keys.URL_TOKENS),
                pid,
                False,
                url_utils.tokenize,
            )

            links_path = os.path.join(
                dir_io, constants.BASELINE_LINKS.format(catalog, entity))
            os.makedirs(os.path.dirname(links_path), exist_ok=True)
            _handle_result(result, rule, catalog, links_path, upload, sandbox)

        if rule in ('names', 'all'):
            wd_io.seek(0)

            LOGGER.info('Starting similar name tokens linker ...')

            result = _similar_tokens_linker(
                wd_io,
                base_entity,
                (keys.NAME, keys.NAME_TOKENS),
                pid,
                check_dates,
                text_utils.tokenize,
            )

            names_path = os.path.join(
                dir_io, constants.BASELINE_NAMES.format(catalog, entity))
            os.makedirs(os.path.dirname(names_path), exist_ok=True)
            _handle_result(result, rule, catalog, names_path, upload, sandbox)
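Example #7 opens the gzipped Wikidata set once in text mode and rewinds it with seek(0) before each rule, so every linker reads the full dump. A minimal sketch of that input handling, with a placeholder file name:

import gzip

with gzip.open('wd_classification_set.jsonl.gz', 'rt') as wd_io:
    first_pass = sum(1 for _ in wd_io)   # e.g. the 'perfect' rule
    wd_io.seek(0)                        # rewind before the next rule
    second_pass = sum(1 for _ in wd_io)  # e.g. the 'links' rule
    assert first_pass == second_pass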