def baseline(source, target, target_type, strategy, output_dir):
    """Rule-based matching strategies.

    SOURCE must be a {string: identifier} JSON file.

    NOTICE: not all entity types are available for all targets.

    Available strategies are:
    'perfect' = perfect strings;
    'links' = similar links;
    'names' = similar names.

    Run all of them by default.
    """
    # TODO source should be a stream from wikidata
    source_dataset = json.load(source)
    LOGGER.info("Loaded source dataset '%s'", source.name)
    target_entity = target_database.get_entity(target, target_type)
    target_link_entity = target_database.get_link_entity(target, target_type)

    if strategy == 'perfect':
        _perfect_name_wrapper(source_dataset, target_entity, output_dir)
    elif strategy == 'links':
        _similar_links_wrapper(source_dataset, target_link_entity, output_dir)
    elif strategy == 'names':
        _similar_names_wrapper(source_dataset, target_entity, output_dir)
    elif strategy == 'edit_distance':
        # TODO create a command only for this matching technique
        #  to expose the edit distance function too
        edit_distance_match(source_dataset, target_entity, 'jw', 0)
    elif strategy == 'all':
        LOGGER.info('Will run all the baseline strategies')
        _perfect_name_wrapper(source_dataset, target_entity, output_dir)
        _similar_names_wrapper(source_dataset, target_entity, output_dir)
        _similar_links_wrapper(source_dataset, target_link_entity, output_dir)
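# Hedged illustration, not part of soweego: the 'edit_distance' branch above passes
# the metric code 'jw' (Jaro-Winkler) and a 0 threshold to edit_distance_match.
# A minimal, self-contained sketch of the same idea (string-similarity matching
# above a threshold), using the standard library's difflib as a stand-in for the
# actual Jaro-Winkler implementation:
from difflib import SequenceMatcher


def similarity_match_sketch(source_names, target_names, threshold=0.9):
    """Yield (source, target, score) pairs whose similarity beats the threshold."""
    for source_name in source_names:
        for target_name in target_names:
            score = SequenceMatcher(
                None, source_name.lower(), target_name.lower()
            ).ratio()
            if score >= threshold:
                yield source_name, target_name, score


# Example: list(similarity_match_sketch(['The Beatles'], ['Beatles, The', 'Beach Boys']))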
def extract_cli(catalog, entity, upload, sandbox, dir_io):
    """Extract Wikidata links from a target catalog dump."""
    db_entity = target_database.get_link_entity(catalog, entity)
    if db_entity is None:
        LOGGER.info(
            'No links available for %s %s. Stopping extraction here',
            catalog,
            entity,
        )
        sys.exit(1)

    result_path = os.path.join(
        dir_io, constants.EXTRACTED_LINKS.format(catalog, entity)
    )
    os.makedirs(os.path.dirname(result_path), exist_ok=True)

    LOGGER.info(
        'Starting extraction of Wikidata links available in %s %s ...',
        catalog,
        entity,
    )

    _handle_result(
        _extract_existing_links(
            db_entity, target_database.get_catalog_pid(catalog, entity)
        ),
        'Wikidata links',
        catalog,
        result_path,
        upload,
        sandbox,
    )
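# Hedged sketch, not the actual _extract_existing_links: extracting Wikidata links
# already present in a catalog's URL table boils down to filtering URLs that point
# to wikidata.org and pulling out the QID. Names below are illustrative only; the
# rows are assumed to expose `url` and `catalog_id` attributes like the ORM
# entities used above.
import re

QID_PATTERN = re.compile(r'wikidata\.org/(?:wiki|entity)/(Q\d+)')


def extract_wikidata_links_sketch(rows):
    """Yield (QID, catalog_id) pairs for rows whose URL points to Wikidata."""
    for row in rows:
        match = QID_PATTERN.search(row.url)
        if match:
            yield match.group(1), row.catalog_id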
def build_target(
    goal: str, catalog: str, entity: str, identifiers: Set[str]
) -> Iterator[pd.DataFrame]:
    """Build a target catalog dataset for training or classification purposes:
    workflow step 1.

    Data is gathered by querying the ``s51434__mixnmatch_large_catalogs_p``
    database. This is where the :mod:`importer` inserts processed catalog dumps.

    The database is located in
    `ToolsDB <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#User_databases>`_
    under the Wikimedia
    `Toolforge <https://wikitech.wikimedia.org/wiki/Portal:Toolforge>`_ infrastructure.
    See `how to connect <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#Connecting_to_the_database_replicas>`_.

    :param goal: ``{'training', 'classification'}``.
      Whether to build a dataset for training or classification
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param identifiers: a set of catalog IDs to gather data for
    :return: the generator yielding :class:`pandas.DataFrame` chunks
    """
    utils.check_goal_value(goal)

    LOGGER.info('Building target %s set for %s %s ...', goal, catalog, entity)

    # Target catalog ORM entities/DB tables
    base, link, nlp = (
        target_database.get_main_entity(catalog, entity),
        target_database.get_link_entity(catalog, entity),
        target_database.get_nlp_entity(catalog, entity),
    )
    tables = [table for table in (base, link, nlp) if table]

    # Initial query with all non-null tables
    query = Query(tables)
    # Remove `base` to avoid outer join with itself
    tables.remove(base)
    # Outer joins
    for table in tables:
        query = query.outerjoin(table, base.catalog_id == table.catalog_id)
    # Condition
    query = query.filter(base.catalog_id.in_(identifiers)).enable_eagerloads(
        False
    )

    sql = query.statement
    LOGGER.debug('SQL query to be fired: %s', sql)

    # Avoid loading query result in memory
    db_engine = DBManager().get_engine().execution_options(stream_results=True)

    return read_sql(sql, db_engine, chunksize=1000)
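# Hedged usage sketch: `build_target` returns an iterator of pandas.DataFrame
# chunks, so callers can process arbitrarily large catalogs with bounded memory,
# e.g. `for chunk in build_target('training', 'discogs', 'musician', ids): ...`,
# where each chunk holds at most 1000 joined rows. The same streaming pattern
# with plain pandas + SQLAlchemy (the DB URI and SQL string are placeholders):
import pandas as pd
from sqlalchemy import create_engine


def stream_query(sql: str, db_uri: str, chunk_size: int = 1000):
    """Yield DataFrame chunks without materializing the full result set."""
    # stream_results asks the driver for a server-side cursor, as done above
    engine = create_engine(db_uri).execution_options(stream_results=True)
    yield from pd.read_sql(sql, engine, chunksize=chunk_size)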
def check_links_cli(catalog: str):
    """
    Check for rotten URLs of an imported catalog.

    :param catalog: one of the keys of constants.TARGET_CATALOGS
    """
    for entity_type in target_database.supported_entities_for_target(catalog):
        LOGGER.info("Validating %s %s links...", catalog, entity_type)
        entity = target_database.get_link_entity(catalog, entity_type)
        if not entity:
            LOGGER.info(
                "%s %s does not have a links table. Skipping...",
                catalog,
                entity_type,
            )
            continue

        session = DBManager.connect_to_db()
        total = session.query(entity).count()
        removed = 0

        with Pool() as pool:
            # Validate each link
            for resolved, res_entity in tqdm(
                pool.imap_unordered(_resolve_url, session.query(entity)),
                total=total,
            ):
                if not resolved:
                    # If not valid, delete
                    session_delete = DBManager.connect_to_db()
                    session_delete.delete(res_entity)
                    try:
                        session_delete.commit()
                        removed += 1
                    except:
                        session_delete.rollback()
                        raise
                    finally:
                        session_delete.close()

        session.close()
        LOGGER.info(
            "Removed %s/%s from %s %s", removed, total, catalog, entity_type
        )
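# Hedged sketch, not the actual soweego implementation: a `_resolve_url`-style
# worker must be a top-level function so multiprocessing can pickle it, must take
# a single link row, and must return a (resolved, row) tuple as consumed by the
# loop above. A minimal version built on the third-party `requests` library:
import requests


def resolve_url_sketch(link_row, timeout=10):
    """Return (True, link_row) if the row's URL answers, (False, link_row) otherwise."""
    try:
        response = requests.head(
            link_row.url, allow_redirects=True, timeout=timeout
        )
        return response.status_code < 400, link_row
    except requests.RequestException:
        return False, link_row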
def gather_target_links(entity, catalog):
    LOGGER.info('Gathering %s %s links ...', catalog, entity)
    link_entity = target_database.get_link_entity(catalog, entity)

    # Early return when the links table doesn't exist.
    # NOTE: this function is a generator, so an early `return`
    # simply hands the caller an empty iterator.
    if link_entity is None:
        LOGGER.warning(
            'No links table available in the database for %s %s. '
            'Stopping validation here',
            catalog,
            entity,
        )
        return None

    session = DBManager.connect_to_db()
    result = None
    try:
        query = session.query(link_entity.catalog_id, link_entity.url)
        count = query.count()
        # Early return when no links
        if count == 0:
            LOGGER.warning(
                'No links available for %s %s. Stopping validation here',
                catalog,
                entity,
            )
            return None

        LOGGER.info('Got %d links from %s %s', count, catalog, entity)
        result = query.all()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if result is None:
        return None

    for row in result:
        yield row.catalog_id, row.url
def check_urls_cli(catalog, drop, dir_io):
    """Check for rotten URLs of an imported catalog.

    For every catalog entity, dump rotten URLs to a file.

    CSV format: URL,catalog_ID

    Use '-d' to drop rotten URLs from the DB on the fly.
    """
    for entity in target_database.supported_entities_for_target(catalog):
        out_path = os.path.join(
            dir_io, ROTTEN_URLS_FNAME.format(catalog=catalog, entity=entity)
        )

        LOGGER.info('Starting check of %s %s URLs ...', catalog, entity)

        link_entity = target_database.get_link_entity(catalog, entity)
        if not link_entity:
            LOGGER.info(
                '%s %s does not have a links table. Skipping ...',
                catalog,
                entity,
            )
            continue

        query_session = DBManager.connect_to_db()
        total = query_session.query(link_entity).count()

        rotten = 0
        if drop:
            removed = 0

        # Parallel operation
        with Pool() as pool, open(out_path, 'w', buffering=1) as fout:
            writer = csv.writer(fout)
            try:
                # Resolve every URL
                for resolved, result in tqdm(
                    pool.imap_unordered(
                        _resolve, query_session.query(link_entity)
                    ),
                    total=total,
                ):
                    if not resolved:
                        # Dump
                        writer.writerow((result.url, result.catalog_id))
                        rotten += 1

                        # Drop from DB
                        if drop:
                            delete_session = DBManager.connect_to_db()
                            delete_session.delete(result)
                            try:
                                delete_session.commit()
                                removed += 1
                            except SQLAlchemyError as error:
                                LOGGER.error(
                                    'Failed deletion of %s: %s',
                                    result,
                                    error.__class__.__name__,
                                )
                                LOGGER.debug(error)
                                delete_session.rollback()
                            finally:
                                delete_session.close()
            except SQLAlchemyError as error:
                LOGGER.error(
                    '%s while querying %s %s URLs',
                    error.__class__.__name__,
                    catalog,
                    entity,
                )
                LOGGER.debug(error)
                query_session.rollback()
            finally:
                query_session.close()

        LOGGER.debug('Cache information: %s', url_utils.resolve.cache_info())
        LOGGER.info(
            "Total %s %s rotten URLs dumped to '%s': %d / %d",
            catalog,
            entity,
            out_path,
            rotten,
            total,
        )
        if drop:
            LOGGER.info(
                'Total %s %s rotten URLs dropped from the DB: %d / %d',
                catalog,
                entity,
                removed,
                rotten,
            )
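# Hedged sketch: the `url_utils.resolve.cache_info()` call above implies the
# resolver is memoized, so duplicate URLs across entities are only resolved once.
# A minimal stand-in using only the standard library (names and behavior are
# assumptions, not soweego's actual code):
from functools import lru_cache
from urllib.error import URLError
from urllib.request import Request, urlopen


@lru_cache(maxsize=None)
def cached_resolve_sketch(url: str, timeout: int = 10) -> bool:
    """Return True if the URL answers with a non-error status, False otherwise."""
    try:
        with urlopen(Request(url, method='HEAD'), timeout=timeout) as response:
            return response.status < 400
    except (URLError, ValueError):
        return False


# cached_resolve_sketch.cache_info() then reports hits/misses, as logged above.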
def _run(catalog, entity, rule, check_dates, upload, sandbox, dir_io):
    wd_io_path = os.path.join(
        dir_io, constants.WD_CLASSIFICATION_SET.format(catalog, entity)
    )
    base_entity = target_database.get_main_entity(catalog, entity)
    link_entity = target_database.get_link_entity(catalog, entity)

    if rule == 'links' and link_entity is None:
        LOGGER.warning(
            "No links available for %s %s. Stopping baseline here ...",
            catalog,
            entity,
        )
        return

    pid = target_database.get_catalog_pid(catalog, entity)

    with gzip.open(wd_io_path, 'rt') as wd_io:
        if rule in ('perfect', 'all'):
            wd_io.seek(0)

            LOGGER.info('Starting perfect names linker ...')
            result = _perfect_names_linker(wd_io, base_entity, pid, check_dates)

            perfect_path = os.path.join(
                dir_io, constants.BASELINE_PERFECT.format(catalog, entity)
            )
            os.makedirs(os.path.dirname(perfect_path), exist_ok=True)
            _handle_result(result, rule, catalog, perfect_path, upload, sandbox)

        if rule == 'all' and link_entity is None:
            LOGGER.warning(
                "No links available for %s %s. Won't run the 'links' rule ...",
                catalog,
                entity,
            )

        if rule in ('links', 'all') and link_entity is not None:
            wd_io.seek(0)

            LOGGER.info('Starting similar link tokens linker ...')
            result = _similar_tokens_linker(
                wd_io,
                link_entity,
                (keys.URL, keys.URL_TOKENS),
                pid,
                False,
                url_utils.tokenize,
            )

            links_path = os.path.join(
                dir_io, constants.BASELINE_LINKS.format(catalog, entity)
            )
            os.makedirs(os.path.dirname(links_path), exist_ok=True)
            _handle_result(result, rule, catalog, links_path, upload, sandbox)

        if rule in ('names', 'all'):
            wd_io.seek(0)

            LOGGER.info('Starting similar name tokens linker ...')
            result = _similar_tokens_linker(
                wd_io,
                base_entity,
                (keys.NAME, keys.NAME_TOKENS),
                pid,
                check_dates,
                text_utils.tokenize,
            )

            names_path = os.path.join(
                dir_io, constants.BASELINE_NAMES.format(catalog, entity)
            )
            os.makedirs(os.path.dirname(names_path), exist_ok=True)
            _handle_result(result, rule, catalog, names_path, upload, sandbox)
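# Hedged illustration, not soweego's _similar_tokens_linker: the 'names' and
# 'links' rules above compare tokenized names/URLs between Wikidata items and
# catalog entities. The core idea with plain Python sets (threshold and overlap
# measure are assumptions for illustration only):
def similar_tokens_sketch(wd_tokens: set, catalog_tokens: set, threshold: float = 0.5) -> bool:
    """True when shared tokens cover at least `threshold` of the smaller token set."""
    if not wd_tokens or not catalog_tokens:
        return False
    shared = wd_tokens & catalog_tokens
    return len(shared) / min(len(wd_tokens), len(catalog_tokens)) >= threshold


# Example: similar_tokens_sketch({'john', 'lennon'}, {'lennon', 'john', 'winston'}) -> True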