def _upload(catalog, entity, to_deprecate, to_add, sandbox):
    catalog_qid = target_database.get_catalog_qid(catalog)

    LOGGER.info('Starting deprecation of %s IDs ...', catalog)
    wikidata_bot.delete_or_deprecate_identifiers(
        'deprecate', catalog, entity, to_deprecate, sandbox
    )

    LOGGER.info('Starting addition of statements to Wikidata ...')
    wikidata_bot.add_people_statements(to_add, catalog_qid, sandbox)

    return catalog_qid


def _handle_result(
    result: Iterable[Tuple[str, str, str]],
    origin: str,
    catalog: str,
    path_out: str,
    upload: bool,
    sandbox: bool,
):
    if upload:
        to_upload = set()  # In-memory copy of the result generator

    with open(path_out, 'w', 1) as fout:
        writer = csv.writer(fout)
        for statement in result:
            writer.writerow(statement)
            if upload:
                to_upload.add(statement)

    if upload:
        wikidata_bot.add_people_statements(to_upload, catalog, sandbox)

    LOGGER.info('%s %s dumped to %s', catalog, origin, path_out)


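# Illustrative usage of _handle_result (the data, origin label, and output
# path below are hypothetical, not from this module): the result generator
# is consumed in a single pass, streaming each (QID, PID, value) triple to
# CSV while optionally buffering it in memory for the bot upload.
#
#   statements = iter(
#       [
#           ('Q42', 'P569', '1952-03-11'),
#           ('Q42', 'P106', 'Q36180'),
#       ]
#   )
#   _handle_result(
#       statements, 'works', 'discogs', 'discogs_works.csv',
#       upload=False, sandbox=True,
#   )

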
def _upload_result(
    catalog, entity, to_deprecate, urls_to_add, ext_ids_to_add, sandbox
):
    # _upload returns the catalog QID so it can be reused
    # for the external IDs upload below
    catalog_qid = _upload(catalog, entity, to_deprecate, urls_to_add, sandbox)

    LOGGER.info('Starting addition of external IDs to Wikidata ...')
    wikidata_bot.add_people_statements(ext_ids_to_add, catalog_qid, sandbox)


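# The dump helpers used by the CLI commands below (_dump_deprecated and
# _dump_csv_output) are defined elsewhere in this module. A minimal sketch
# consistent with the call sites and the documented output formats — an
# assumption, not the actual implementation — could be (assuming json is
# imported at module level):
#
#   def _dump_deprecated(data, outpath):
#       # {catalog_ID: [list of QIDs]}, as documented in the docstrings
#       with open(outpath, 'w') as fout:
#           json.dump({k: list(v) for k, v in data.items()}, fout, indent=2)
#
#   def _dump_csv_output(data, outpath, label):
#       with open(outpath, 'w') as fout:
#           csv.writer(fout).writerows(data)
#       LOGGER.info('%s dumped to %s', label, outpath)

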
def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
    """Validate identifiers against biographical data.

    Look for birth/death dates, birth/death places, gender.

    Dump 4 output files:

    1. catalog IDs to be deprecated. JSON format:
       {catalog_ID: [list of QIDs]}
    2. statements to be added. CSV format:
       QID,PID,value,catalog_ID
    3. shared statements to be referenced. Same format as file #2
    4. statements found in Wikidata but not in the target catalog.
       CSV format: catalog_ID,PID_URL,value,QID_URL

    You can pass the '-u' flag to upload the output to Wikidata.
    """
    criterion = 'bio'

    # Output paths
    deprecate_path = os.path.join(
        dir_io,
        IDS_TO_BE_DEPRECATED_FNAME.format(
            catalog=catalog, entity=entity, criterion=criterion
        ),
    )
    add_path = os.path.join(
        dir_io,
        BIO_STATEMENTS_TO_BE_ADDED_FNAME.format(catalog=catalog, entity=entity),
    )
    ref_path = os.path.join(
        dir_io,
        SHARED_STATEMENTS_FNAME.format(
            catalog=catalog, entity=entity, criterion=criterion
        ),
    )
    wd_stmts_path = os.path.join(
        dir_io,
        WD_STATEMENTS_FNAME.format(
            criterion=criterion, catalog=catalog, entity=entity
        ),
    )
    wd_cache_path = os.path.join(
        dir_io,
        WD_CACHE_FNAME.format(
            catalog=catalog, entity=entity, criterion=criterion
        ),
    )

    # Wikidata cache
    wd_cache = None
    if os.path.isfile(wd_cache_path):
        with open(wd_cache_path, 'rb') as cin:
            wd_cache = pickle.load(cin)
            LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)

    # Run validation
    result = bio(catalog, entity, wd_cache=wd_cache)

    # Nothing to do: the catalog doesn't contain biographical data
    if result is None:
        return

    # Unpack the result tuple
    deprecate, add, reference, wd_stmts, wd_cache = result

    # Dump output files
    _dump_deprecated(deprecate, deprecate_path)
    _dump_csv_output(add, add_path, 'statements to be added')
    _dump_csv_output(reference, ref_path, 'shared statements to be referenced')
    _dump_csv_output(
        wd_stmts,
        wd_stmts_path,
        f'statements in Wikidata but not in {catalog} {entity}',
    )

    # Dump Wikidata cache
    if dump_wikidata:
        try:
            with open(wd_cache_path, 'wb') as cout:
                # Using the highest protocol available for the current Python
                # version should be the most efficient solution
                pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL)
            LOGGER.info(
                'Biographical data gathered from Wikidata dumped to %s',
                wd_cache_path,
            )
        except MemoryError:
            LOGGER.warning('Could not pickle the Wikidata cache: memory error')

    # Upload the output to Wikidata:
    # deprecate, add, reference
    if upload:
        if sandbox:
            LOGGER.info(
                'Running on the Wikidata sandbox item %s ...',
                vocabulary.SANDBOX_2,
            )

        LOGGER.info('Starting deprecation of %s IDs ...', catalog)
        wikidata_bot.delete_or_deprecate_identifiers(
            'deprecate', catalog, entity, deprecate, sandbox
        )

        LOGGER.info('Starting addition of extra statements to Wikidata ...')
        wikidata_bot.add_people_statements(catalog, add, criterion, sandbox)

        LOGGER.info(
            'Starting referencing of shared statements in Wikidata ...')
        wikidata_bot.add_people_statements(catalog, reference, criterion, sandbox)


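# Concrete shape of the bio outputs described in the docstring above. The
# values are illustrative only (the catalog ID '139984' is hypothetical);
# the column layouts follow the documented formats:
#
#   1. IDs to be deprecated (JSON):      {"139984": ["Q42"]}
#   2. statements to be added (CSV):     Q42,P569,1952-03-11,139984
#   3. shared statements (CSV):          Q42,P21,Q6581097,139984
#   4. Wikidata-only statements (CSV):   139984,<PID URL>,1952-03-11,<QID URL>

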
def links_cli(
    catalog, entity, blacklist, upload, sandbox, dump_wikidata, dir_io
):
    """Validate identifiers against links.

    Dump 6 output files:

    1. catalog IDs to be deprecated. JSON format:
       {catalog_ID: [list of QIDs]}
    2. third-party IDs to be added. CSV format:
       QID,third-party_PID,third-party_ID,catalog_ID
    3. URLs to be added. CSV format: QID,P2888,URL,catalog_ID
    4. third-party IDs to be referenced. Same format as file #2
    5. URLs to be referenced. Same format as file #3
    6. URLs found in Wikidata but not in the target catalog.
       CSV format: catalog_ID,URL,QID_URL

    You can pass the '-u' flag to upload the output to Wikidata.

    The '-b' flag applies a URL blacklist of low-quality Web domains to
    file #3.
    """
    criterion = 'links'

    # Output paths
    deprecate_path = os.path.join(
        dir_io,
        IDS_TO_BE_DEPRECATED_FNAME.format(
            catalog=catalog, entity=entity, criterion=criterion
        ),
    )
    add_ext_ids_path = os.path.join(
        dir_io,
        EXT_IDS_FNAME.format(catalog=catalog, entity=entity, task='added'),
    )
    add_urls_path = os.path.join(
        dir_io,
        URLS_FNAME.format(catalog=catalog, entity=entity, task='added'),
    )
    ref_ext_ids_path = os.path.join(
        dir_io,
        EXT_IDS_FNAME.format(catalog=catalog, entity=entity, task='referenced'),
    )
    ref_urls_path = os.path.join(
        dir_io,
        URLS_FNAME.format(catalog=catalog, entity=entity, task='referenced'),
    )
    wd_urls_path = os.path.join(
        dir_io,
        WD_STATEMENTS_FNAME.format(
            criterion=criterion, catalog=catalog, entity=entity
        ),
    )
    wd_cache_path = os.path.join(
        dir_io,
        WD_CACHE_FNAME.format(
            catalog=catalog, entity=entity, criterion=criterion
        ),
    )

    # Wikidata cache
    wd_cache = None
    if os.path.isfile(wd_cache_path):
        with open(wd_cache_path, 'rb') as cin:
            wd_cache = pickle.load(cin)
            LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)

    # Run validation
    result = links(catalog, entity, url_blacklist=blacklist, wd_cache=wd_cache)

    # Nothing to do: the catalog doesn't contain links
    if result is None:
        return

    # Unpack the result tuple
    (
        deprecate,
        add_ext_ids,
        add_urls,
        ref_ext_ids,
        ref_urls,
        wd_urls,
        wd_cache,
    ) = result

    # Dump output files
    _dump_deprecated(deprecate, deprecate_path)
    _dump_csv_output(
        add_ext_ids, add_ext_ids_path, 'third-party IDs to be added'
    )
    _dump_csv_output(add_urls, add_urls_path, 'URLs to be added')
    _dump_csv_output(
        ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced'
    )
    _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced')
    _dump_csv_output(
        wd_urls, wd_urls_path, f'Wikidata URLs not in {catalog} {entity}'
    )

    # Dump Wikidata cache
    if dump_wikidata:
        try:
            with open(wd_cache_path, 'wb') as cout:
                # Using the highest protocol available for the current Python
                # version should be the most efficient solution
                pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL)
            LOGGER.info(
                'URLs gathered from Wikidata dumped to %s', wd_cache_path
            )
        except MemoryError:
            LOGGER.warning('Could not pickle the Wikidata cache: memory error')

    # Upload the output to Wikidata
    if upload:
        if sandbox:
            LOGGER.info(
                'Running on the Wikidata sandbox item %s ...',
                vocabulary.SANDBOX_2,
            )

        LOGGER.info('Starting deprecation of %s IDs ...', catalog)
        wikidata_bot.delete_or_deprecate_identifiers(
            'deprecate', catalog, entity, deprecate, sandbox
        )

        LOGGER.info('Starting addition of external IDs to Wikidata ...')
        wikidata_bot.add_people_statements(
            catalog, add_ext_ids, criterion, sandbox
        )

        LOGGER.info('Starting addition of URLs to Wikidata ...')
        wikidata_bot.add_people_statements(
            catalog, add_urls, criterion, sandbox
        )

        LOGGER.info(
            'Starting referencing of shared external IDs in Wikidata ...')
        # Reference the *shared* statements (ref_ext_ids / ref_urls),
        # not the ones just added above
        wikidata_bot.add_people_statements(
            catalog, ref_ext_ids, criterion, sandbox
        )

        LOGGER.info('Starting referencing of shared URLs in Wikidata ...')
        wikidata_bot.add_people_statements(
            catalog, ref_urls, criterion, sandbox
        )
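

# Concrete shape of the links outputs described in the links_cli docstring.
# The values are illustrative only: '139984' is a hypothetical catalog ID,
# P434 (MusicBrainz artist ID) stands in for any third-party property, and
# the URLs are placeholders; P2888 (exact match) is the property named in
# the docstring:
#
#   third-party IDs to be added (CSV):  Q42,P434,<MusicBrainz ID>,139984
#   URLs to be added (CSV):             Q42,P2888,https://example.org/artist,139984
#   Wikidata-only URLs (CSV):           139984,https://example.org/artist,https://www.wikidata.org/wiki/Q42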