def run_pipeline(
    scope_name: str,
    crawl: bool = True,
    export: bool = True,
    threads: int = settings.THREADS,
) -> None:
    """Crawl all sources in the given scope, then export its datasets."""
    scope = Dataset.require(scope_name)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures: List[Future] = []
        if crawl:
            # Run one crawler per source dataset in the scope, in parallel.
            for source in scope.sources:
                ctx = Context(source)
                futures.append(executor.submit(ctx.crawl))
            _compute_futures(futures)

        if export:
            # Apply all canonicalisation decisions before building the export views.
            resolver = get_resolver()
            with engine_tx() as conn:
                resolve_all_canonical(conn, resolver)
            database = Database(scope, resolver, cached=True)
            # Pre-build the scope-wide view so the cached database is populated
            # before the per-dataset exports run.
            database.view(scope)
            futures = []
            for dataset_ in scope.datasets:
                futures.append(executor.submit(export_dataset, dataset_, database))
            futures.append(executor.submit(export_metadata))
            _compute_futures(futures)
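# `_compute_futures` is used above but not shown in this section. A minimal
# sketch of what such a helper could look like, assuming its only job is to
# block on the submitted tasks and re-raise any exception from a worker
# thread (the body below is an illustrative assumption, not project code):
from concurrent.futures import Future
from typing import List


def _compute_futures(futures: List[Future]) -> None:
    for future in futures:
        # result() blocks until the task completes and re-raises worker errors,
        # so a failed crawl or export stops the pipeline instead of being lost.
        future.result()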
def xref_datasets(base: Dataset, candidates: Dataset, limit: int = 15):
    resolver = get_resolver()
    resolver.prune()
    if candidates not in base.provided_datasets():
        raise RuntimeError("%r is not contained in %r" % (candidates, base))
    db = Database(base, resolver, cached=True)
    entities = db.view(candidates)
    loader = db.view(base)
    index = get_index(base, loader)
    xref(index, resolver, entities, limit=limit)
    resolver.save()
def export_dataset(dataset: Dataset, database: Database):
    """Dump the contents of the dataset to the output directory."""
    context = Context(dataset)
    context.bind()
    loader = database.view(dataset, export_assembler)
    exporters = [Exporter(context, loader) for Exporter in EXPORTERS]
    for entity in loader:
        for exporter in exporters:
            exporter.feed(entity)

    for exporter in exporters:
        exporter.finish()

    # Make sure the exported resources are visible in the database.
    # (`db` is assumed to be the module-level database session handle.)
    db.session.commit()

    # Export list of data issues from crawl stage
    issues_path = context.get_resource_path("issues.json")
    context.log.info("Writing dataset issues list", path=issues_path)
    with open(issues_path, "w", encoding=settings.ENCODING) as fh:
        data = {"issues": Issue.query(dataset=dataset).all()}
        write_json(data, fh)

    # Export full metadata
    index_path = context.get_resource_path("index.json")
    context.log.info("Writing dataset index", path=index_path)
    with open(index_path, "w", encoding=settings.ENCODING) as fh:
        meta = dataset.to_index()
        write_json(meta, fh)

    context.close()
def run_enrich(scope_name: str, external_name: str, threshold: float):
    scope = Dataset.require(scope_name)
    external = Dataset.require(external_name)
    ctx = Context(external)
    resolver = get_resolver()
    database = Database(scope, resolver, cached=False)
    loader = database.view(scope)
    ctx.enrich(resolver, loader, threshold=threshold)
    resolver.save()
def index(dataset):
    resolver = get_resolver()
    # Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    loader = database.view(dataset)
    path = get_index_path(dataset)
    path.unlink(missing_ok=True)
    get_index(dataset, loader)
def build_analytics(dataset: Dataset):
    resolver = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, resolver)
    db = Database(dataset, resolver)
    loader = db.view(dataset)
    with engine_tx() as conn:
        # Rebuild the denormalised analytics tables from scratch.
        conn.execute(delete(analytics_dataset_table))
        conn.execute(delete(analytics_country_table))
        conn.execute(delete(analytics_entity_table))

        entities: List[Dict[str, Any]] = []
        members: List[Dict[str, str]] = []
        countries: List[Dict[str, str]] = []
        for idx, entity in enumerate(loader):
            if idx > 0 and idx % 10000 == 0:
                log.info("Denormalised %d entities..." % idx)

            # Record which datasets the entity belongs to.
            for member_dataset in Dataset.all():
                if len(entity.datasets.intersection(member_dataset.scope_names)) > 0:
                    members.append({"entity_id": entity.id, "dataset": member_dataset.name})
                    if len(members) >= BATCH_SIZE:
                        stmt = insert(analytics_dataset_table).values(members)
                        conn.execute(stmt)
                        members = []

            # Record the countries associated with the entity.
            for country in entity.get_type_values(registry.country):
                countries.append({"entity_id": entity.id, "country": country})
                if len(countries) >= BATCH_SIZE:
                    stmt = insert(analytics_country_table).values(countries)
                    conn.execute(stmt)
                    countries = []

            ent = {
                "id": entity.id,
                "schema": entity.schema.name,
                "caption": entity.caption,
                "target": entity.target,
                "first_seen": entity.first_seen,
                "last_seen": entity.last_seen,
                "properties": entity.properties,
            }
            entities.append(ent)
            if len(entities) >= BATCH_SIZE:
                stmt = insert(analytics_entity_table).values(entities)
                conn.execute(stmt)
                entities = []

        # Flush any remaining rows from the final, partially filled batches.
        if len(members):
            conn.execute(insert(analytics_dataset_table).values(members))
        if len(countries):
            conn.execute(insert(analytics_country_table).values(countries))
        if len(entities):
            conn.execute(insert(analytics_entity_table).values(entities))
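# The batched inserts above all follow the same pattern: accumulate rows,
# flush once the buffer reaches BATCH_SIZE, then flush the remainder after
# the loop. A small, hypothetical helper sketching that pattern in isolation
# (`flush_rows` is an illustrative name, not part of the project):
from typing import Any, Dict, List

from sqlalchemy import Table, insert
from sqlalchemy.engine import Connection


def flush_rows(conn: Connection, table: Table, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Bulk-insert the buffered rows, if any, and return an empty buffer."""
    if len(rows):
        conn.execute(insert(table).values(rows))
    return []

# Usage inside the loop would look roughly like:
#   if len(countries) >= BATCH_SIZE:
#       countries = flush_rows(conn, analytics_country_table, countries)
# with one more call per buffer after the loop.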
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver)
    DedupeApp.run(
        title="OpenSanction De-duplication",
        # log="textual.log",
        loader=db.view(dataset),
        resolver=resolver,
    )
def blocking_xref(dataset: Dataset, limit: int = 5000):
    resolver = get_resolver()
    resolver.prune()
    db = Database(dataset, resolver, cached=True, external=True)
    loader = db.view(dataset)
    # Candidate pairs scoring above auto_threshold are resolved automatically
    # under the AUTO_USER account; lower-scoring pairs are left for review.
    xref(
        loader,
        resolver,
        limit=limit,
        scored=True,
        auto_threshold=0.990,
        user=AUTO_USER,
    )
    resolver.save()
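# A hypothetical invocation sketch: run the automatic, blocking-based xref
# first, then open the interactive dedupe UI to review the lower-confidence
# candidates it left behind (the dataset name "sanctions" is an example only):
if __name__ == "__main__":
    scope = Dataset.require("sanctions")
    blocking_xref(scope, limit=10000)
    dedupe("sanctions")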
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver, external=True)
    loader = db.view(dataset)

    async def run_app() -> None:
        app = DedupeApp(
            loader=loader,
            resolver=resolver,
            url_base="https://opensanctions.org/entities/%s/",
            title="OpenSanction De-duplication",
            log="textual.log",
        )  # type: ignore
        await app.process_messages()

    asyncio.run(run_app())
def xref_internal(dataset: Dataset):
    resolver = get_resolver()
    resolver.prune()
    db = Database(dataset, resolver)
    loader = db.view(dataset)
    index = get_index(dataset, loader)
    suggested = 0
    for pair, score in index.pairs():
        left = loader.get_entity(str(pair[0]))
        right = loader.get_entity(str(pair[1]))
        if left is None or right is None:
            continue
        if (
            left.schema not in right.schema.matchable_schemata
            and right.schema not in left.schema.matchable_schemata
        ):
            continue
        if not resolver.check_candidate(left.id, right.id):
            continue
        resolver.suggest(left.id, right.id, score)
        if suggested > 5000:
            break
        suggested += 1
    resolver.save()
def export_dataset(dataset: Dataset, database: Database):
    """Dump the contents of the dataset to the output directory."""
    context = Context(dataset)
    try:
        loader = database.view(dataset, assemble)
        if dataset.type != External.TYPE:
            export_data(context, loader)

        # Export list of data issues from crawl stage
        issues_path = context.get_resource_path("issues.json")
        context.log.info("Writing dataset issues list", path=issues_path)
        with open(issues_path, "wb") as fh:
            with engine.begin() as conn:
                data = {"issues": list(all_issues(conn, dataset))}
                write_json(data, fh)

        # Export full metadata
        index_path = context.get_resource_path("index.json")
        context.log.info("Writing dataset index", path=index_path)
        with open(index_path, "wb") as fh:
            meta = dataset_to_index(dataset)
            write_json(meta, fh)
    finally:
        # Always release the context, even if the export fails part-way.
        context.close()