def gather_target_metadata(entity_type, catalog):
    catalog_constants = _get_catalog_constants(catalog)
    catalog_entity = _get_catalog_entity(entity_type, catalog_constants)

    LOGGER.info(
        'Gathering %s birth/death dates/places and gender metadata ...',
        catalog)

    entity = catalog_entity['entity']
    # Base metadata
    query_fields = _build_metadata_query_fields(entity, entity_type, catalog)

    session = DBManager.connect_to_db()
    result = None
    try:
        result = _run_metadata_query(
            session, query_fields, entity, catalog, entity_type)
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if not result:
        return None

    return _parse_target_metadata_query_result(result)
def gather_target_links(entity_type, catalog):
    catalog_constants = _get_catalog_constants(catalog)
    catalog_entity = _get_catalog_entity(entity_type, catalog_constants)

    LOGGER.info('Gathering %s %s links ...', catalog, entity_type)

    link_entity = catalog_entity['link_entity']

    session = DBManager.connect_to_db()
    result = None
    try:
        query = session.query(link_entity.catalog_id, link_entity.url)
        count = query.count()
        if count == 0:
            LOGGER.warning(
                "No links available for %s %s. Stopping validation here",
                catalog, entity_type)
            return None
        LOGGER.info('Got %d links from %s %s', count, catalog, entity_type)
        result = query.all()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if result is None:
        return None

    for row in result:
        yield row.catalog_id, row.url
def check_links_cli(catalog: str):
    """
    Check for rotten URLs of an imported catalog.

    :param catalog: one of the keys of constants.TARGET_CATALOGS
    """
    for entity_type in target_database.supported_entities_for_target(catalog):
        LOGGER.info("Validating %s %s links...", catalog, entity_type)
        entity = target_database.get_link_entity(catalog, entity_type)
        if not entity:
            LOGGER.info(
                "%s %s does not have a links table. Skipping...",
                catalog,
                entity_type,
            )
            continue

        session = DBManager.connect_to_db()
        total = session.query(entity).count()
        removed = 0

        with Pool() as pool:
            # Validate each link
            for resolved, res_entity in tqdm(
                pool.imap_unordered(_resolve_url, session.query(entity)),
                total=total,
            ):
                if not resolved:
                    # Not valid: delete the link row with a dedicated session
                    session_delete = DBManager.connect_to_db()
                    session_delete.delete(res_entity)
                    try:
                        session_delete.commit()
                        removed += 1
                    except:
                        session_delete.rollback()
                        raise
                    finally:
                        session_delete.close()

        session.close()
        LOGGER.info(
            "Removed %s/%s from %s %s", removed, total, catalog, entity_type)
def perfect_name_search(target_entity: T, to_search: str) -> Iterable[T]:
    session = DBManager.connect_to_db()
    try:
        for r in session.query(target_entity).filter(
                target_entity.name == to_search).all():
            yield r
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
def perfect_name_search(
    target_entity: constants.DB_ENTITY, to_search: str
) -> Iterable[constants.DB_ENTITY]:
    session = DBManager.connect_to_db()
    try:
        for r in (session.query(target_entity).filter(
                target_entity.name == to_search).all()):
            yield r
    except:
        session.rollback()
        raise
    finally:
        session.close()
def name_fulltext_search(target_entity: T, query: str) -> Iterable[T]:
    ft_search = target_entity.name.match(query)

    session = DBManager.connect_to_db()
    try:
        for r in session.query(target_entity).filter(ft_search).all():
            yield r
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
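# A minimal usage sketch for the two name search helpers above. The entity
# class lookup via target_database.get_main_entity and the 'discogs' /
# 'musician' arguments are illustrative assumptions; any mapped entity class
# exposing `name` and `catalog_id` columns should work the same way.
def _example_name_searches():
    db_entity = target_database.get_main_entity('discogs', 'musician')

    # Exact match on the name column
    for record in perfect_name_search(db_entity, 'John Coltrane'):
        print(record.catalog_id, record.name)

    # Full-text (natural language) match on the same column
    for record in name_fulltext_search(db_entity, 'coltrane'):
        print(record.catalog_id, record.name)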
def tokens_fulltext_search(
    target_entity: constants.DB_ENTITY,
    boolean_mode: bool,
    tokens: Iterable[str],
    where_clause=None,
    limit: int = 10,
) -> Iterable[constants.DB_ENTITY]:
    if issubclass(target_entity, models.base_entity.BaseEntity):
        column = target_entity.name_tokens
    elif issubclass(target_entity, models.base_link_entity.BaseLinkEntity):
        column = target_entity.url_tokens
    elif issubclass(target_entity, models.base_nlp_entity.BaseNlpEntity):
        column = target_entity.description_tokens
    else:
        LOGGER.critical('Bad target entity class: %s', target_entity)
        raise ValueError('Bad target entity class: %s' % target_entity)

    tokens = filter(None, tokens)
    terms = (' '.join(map('+{0}'.format, tokens))
             if boolean_mode else ' '.join(tokens))
    ft_search = column.match(terms)

    session = DBManager.connect_to_db()
    try:
        if where_clause is None:
            query = session.query(target_entity).filter(ft_search).limit(limit)
        else:
            query = (session.query(target_entity).filter(ft_search).filter(
                where_clause).limit(limit))

        count = query.count()
        if count == 0:
            LOGGER.debug(
                "No result from full-text index query to %s. Terms: '%s'",
                target_entity.__name__,
                terms,
            )
            session.commit()
        else:
            for row in query:
                yield row
            session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
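# A minimal sketch of how the boolean-mode terms above are built and how the
# generator could be consumed. The 'discogs' / 'musician' arguments and the
# sample tokens are illustrative assumptions only.
def _example_tokens_search():
    db_entity = target_database.get_main_entity('discogs', 'musician')

    tokens = ['john', 'coltrane']
    # In boolean mode each token is prefixed with '+', so the MATCH terms
    # become '+john +coltrane' (every token required); otherwise the tokens
    # are joined with spaces for a natural-language match.
    for row in tokens_fulltext_search(db_entity, True, tokens, limit=5):
        print(row.catalog_id, row.name_tokens)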
def check_existence(class_or_occupation_query, class_qid, catalog_pid,
                    entity: BaseEntity):
    query_type = 'identifier', class_or_occupation_query

    session = DBManager.connect_to_db()
    invalid = defaultdict(set)
    count = 0

    for result in sparql_queries.run_identifier_or_links_query(
            query_type, class_qid, catalog_pid, 0):
        for qid, target_id in result.items():
            results = session.query(entity).filter(
                entity.catalog_id == target_id).all()
            if not results:
                LOGGER.warning('%s identifier %s is invalid', qid, target_id)
                invalid[target_id].add(qid)
                count += 1

    session.close()

    LOGGER.info('Total invalid identifiers = %d', count)
    # Sets are not serializable to JSON, so cast them to lists
    return {target_id: list(qids) for target_id, qids in invalid.items()}
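# The function above returns a plain dict precisely so it can be serialized:
# each target identifier maps to the list of Wikidata QIDs that reference it.
# A minimal sketch of dumping that result, assuming a hypothetical output path.
import json

def _dump_invalid_identifiers(invalid: dict, out_path='invalid_ids.json'):
    # e.g. invalid == {'123456': ['Q11563', 'Q42']}
    with open(out_path, 'w') as fout:
        json.dump(invalid, fout, indent=2)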
def gather_target_links(entity, catalog):
    LOGGER.info('Gathering %s %s links ...', catalog, entity)
    link_entity = target_database.get_link_entity(catalog, entity)

    # Early return when the links table doesn't exist
    if link_entity is None:
        LOGGER.warning(
            'No links table available in the database for %s %s. '
            'Stopping validation here',
            catalog,
            entity,
        )
        return None

    session = DBManager.connect_to_db()
    result = None
    try:
        query = session.query(link_entity.catalog_id, link_entity.url)
        count = query.count()
        # Early return when no links
        if count == 0:
            LOGGER.warning(
                'No links available for %s %s. Stopping validation here',
                catalog,
                entity,
            )
            return None
        LOGGER.info('Got %d links from %s %s', count, catalog, entity)
        result = query.all()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if result is None:
        return None

    for row in result:
        yield row.catalog_id, row.url
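# A minimal sketch of consuming the generator above. The 'musician' /
# 'discogs' arguments are illustrative assumptions. Note that, because the
# function contains `yield`, calling it returns a generator object: the early
# `return None` branches simply end iteration without producing any pair.
def _example_consume_links():
    for catalog_id, url in gather_target_links('musician', 'discogs'):
        print(catalog_id, url)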
def tokens_fulltext_search(target_entity: T, boolean_mode: bool,
                           tokens: Iterable[str]) -> Iterable[T]:
    query = None
    if boolean_mode:
        query = ' '.join(map('+{0}'.format, tokens))
    else:
        query = ' '.join(tokens)

    ft_search = target_entity.tokens.match(query)

    session = DBManager.connect_to_db()
    result = []
    try:
        result = session.query(target_entity).filter(ft_search).all()
        session.commit()
    except:
        session.rollback()
        raise

    if not result:
        return []
    return result
def gather_target_biodata(entity, catalog):
    LOGGER.info(
        'Gathering %s birth/death dates/places and gender metadata ...',
        catalog)
    db_entity = target_database.get_main_entity(catalog, entity)
    # Base biodata
    query_fields = _build_biodata_query_fields(db_entity, entity, catalog)

    session = DBManager.connect_to_db()
    query = session.query(*query_fields).filter(
        or_(db_entity.born.isnot(None), db_entity.died.isnot(None)))
    result = None
    try:
        raw_result = _run_query(query, catalog, entity)
        if raw_result is None:
            return None
        result = _parse_target_biodata_query_result(raw_result)
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    return result
def check_urls_cli(catalog, drop, dir_io):
    """Check for rotten URLs of an imported catalog.

    For every catalog entity, dump rotten URLs to a file.
    CSV format: URL,catalog_ID

    Use '-d' to drop rotten URLs from the DB on the fly.
    """
    for entity in target_database.supported_entities_for_target(catalog):
        out_path = os.path.join(
            dir_io, ROTTEN_URLS_FNAME.format(catalog=catalog, entity=entity)
        )

        LOGGER.info('Starting check of %s %s URLs ...', catalog, entity)
        link_entity = target_database.get_link_entity(catalog, entity)
        if not link_entity:
            LOGGER.info(
                '%s %s does not have a links table. Skipping ...',
                catalog,
                entity,
            )
            continue

        query_session = DBManager.connect_to_db()
        total = query_session.query(link_entity).count()

        rotten = 0
        if drop:
            removed = 0

        # Parallel operation
        with Pool() as pool, open(out_path, 'w', buffering=1) as fout:
            writer = csv.writer(fout)
            try:
                # Resolve every URL
                for resolved, result in tqdm(
                    pool.imap_unordered(
                        _resolve, query_session.query(link_entity)),
                    total=total,
                ):
                    if not resolved:
                        # Dump
                        writer.writerow((result.url, result.catalog_id))
                        rotten += 1

                        # Drop from DB
                        if drop:
                            delete_session = DBManager.connect_to_db()
                            delete_session.delete(result)
                            try:
                                delete_session.commit()
                                removed += 1
                            except SQLAlchemyError as error:
                                LOGGER.error(
                                    'Failed deletion of %s: %s',
                                    result,
                                    error.__class__.__name__,
                                )
                                LOGGER.debug(error)
                                delete_session.rollback()
                            finally:
                                delete_session.close()
            except SQLAlchemyError as error:
                LOGGER.error(
                    '%s while querying %s %s URLs',
                    error.__class__.__name__,
                    catalog,
                    entity,
                )
                LOGGER.debug(error)
                query_session.rollback()
            finally:
                query_session.close()

        LOGGER.debug('Cache information: %s', url_utils.resolve.cache_info())
        LOGGER.info(
            "Total %s %s rotten URLs dumped to '%s': %d / %d",
            catalog,
            entity,
            out_path,
            rotten,
            total,
        )
        if drop:
            LOGGER.info(
                'Total %s %s rotten URLs dropped from the DB: %d / %d',
                catalog,
                entity,
                removed,
                rotten,
            )
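# A minimal sketch of reading back the rotten URLs dump produced by the
# command above. ROTTEN_URLS_FNAME, csv, and os come from the same module;
# the 'discogs' / 'musician' values and the dir_io default are illustrative
# assumptions. Each CSV row is (URL, catalog_ID), as written by the command.
def _example_read_rotten_urls(dir_io='.'):
    path = os.path.join(
        dir_io, ROTTEN_URLS_FNAME.format(catalog='discogs', entity='musician')
    )
    with open(path) as fin:
        for url, catalog_id in csv.reader(fin):
            print(catalog_id, url)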
def dead_ids(catalog: str, entity: str,
             wd_cache=None) -> Tuple[DefaultDict, Dict]:
    """Look for dead identifiers in Wikidata.
    An identifier is dead if it does not exist in the given catalog
    when this function is executed.

    Dead identifiers should be marked with a deprecated rank in Wikidata.

    **How it works:**

    1. gather identifiers of the given catalog from relevant Wikidata items
    2. look them up in the given catalog
    3. if an identifier is not in the given catalog anymore,
       it should be deprecated

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param wd_cache: (optional) a ``dict`` of identifiers gathered from
      Wikidata in a previous run
    :return: the ``dict`` pair of dead identifiers
      and identifiers gathered from Wikidata
    """
    dead = defaultdict(set)
    db_entity = target_database.get_main_entity(catalog, entity)

    # Wikidata side
    if wd_cache is None:
        wd_ids = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_ids,
        )
    else:
        wd_ids = wd_cache

    # Target catalog side
    session = DBManager.connect_to_db()
    try:
        for qid in wd_ids:
            for tid in wd_ids[qid][keys.TID]:
                existing = (session.query(db_entity.catalog_id)
                            .filter_by(catalog_id=tid)
                            .count())
                if existing == 0:
                    LOGGER.debug(
                        '%s %s identifier %s is dead', qid, catalog, tid)
                    dead[tid].add(qid)
        session.commit()
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed query of target catalog identifiers due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.validator DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
    finally:
        session.close()

    LOGGER.info(
        'Check completed. Target: %s %s. Total dead identifiers: %d',
        catalog,
        entity,
        len(dead),
    )
    return dead, wd_ids
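# A minimal usage sketch of dead_ids, showing how the Wikidata cache returned
# by a first call can be reused to skip the gathering step in a later one.
# The 'discogs' / 'musician' arguments come from the docstring above.
def _example_dead_ids():
    dead, wd_cache = dead_ids('discogs', 'musician')
    print('Dead identifiers:', len(dead))

    # Reuse the cached Wikidata identifiers on a subsequent run
    dead_again, _ = dead_ids('discogs', 'musician', wd_cache=wd_cache)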