def run_pipeline(
    scope_name: str,
    crawl: bool = True,
    export: bool = True,
    threads: int = settings.THREADS,
) -> None:
    scope = Dataset.require(scope_name)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures: List[Future] = []
        if crawl is True:
            for source in scope.sources:
                ctx = Context(source)
                futures.append(executor.submit(ctx.crawl))
            _compute_futures(futures)

        if export is True:
            resolver = get_resolver()
            with engine_tx() as conn:
                resolve_all_canonical(conn, resolver)
            database = Database(scope, resolver, cached=True)
            database.view(scope)
            futures = []
            for dataset_ in scope.datasets:
                futures.append(executor.submit(export_dataset, dataset_, database))
            futures.append(executor.submit(export_metadata))
            _compute_futures(futures)
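# Hypothetical usage sketch, not part of the original module: invoking the full
# crawl-and-export pipeline for one scope. The scope name "default" and the
# thread count are illustrative assumptions, not values taken from the source.
if __name__ == "__main__":
    run_pipeline("default", crawl=True, export=True, threads=4)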
def index(dataset):
    resolver = get_resolver()
    # Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    loader = database.view(dataset)
    path = get_index_path(dataset)
    path.unlink(missing_ok=True)
    get_index(dataset, loader)
def run_enrich(scope_name: str, external_name: str, threshold: float):
    scope = Dataset.require(scope_name)
    external = Dataset.require(external_name)
    ctx = Context(external)
    resolver = get_resolver()
    database = Database(scope, resolver, cached=False)
    loader = database.view(scope)
    ctx.enrich(resolver, loader, threshold=threshold)
    resolver.save()
def build_analytics(dataset: Dataset):
    resolver = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, resolver)
    db = Database(dataset, resolver)
    loader = db.view(dataset)
    with engine_tx() as conn:
        conn.execute(delete(analytics_dataset_table))
        conn.execute(delete(analytics_country_table))
        conn.execute(delete(analytics_entity_table))

        entities: List[Dict[str, Any]] = []
        members: List[Dict[str, str]] = []
        countries: List[Dict[str, str]] = []
        for idx, entity in enumerate(loader):
            if idx > 0 and idx % 10000 == 0:
                log.info("Denormalised %d entities..." % idx)

            # Record dataset membership for each entity:
            for dataset_ in Dataset.all():
                if len(entity.datasets.intersection(dataset_.scope_names)) > 0:
                    members.append({"entity_id": entity.id, "dataset": dataset_.name})

            if len(members) >= BATCH_SIZE:
                stmt = insert(analytics_dataset_table).values(members)
                conn.execute(stmt)
                members = []

            for country in entity.get_type_values(registry.country):
                countries.append({"entity_id": entity.id, "country": country})

            if len(countries) >= BATCH_SIZE:
                stmt = insert(analytics_country_table).values(countries)
                conn.execute(stmt)
                countries = []

            ent = {
                "id": entity.id,
                "schema": entity.schema.name,
                "caption": entity.caption,
                "target": entity.target,
                "first_seen": entity.first_seen,
                "last_seen": entity.last_seen,
                "properties": entity.properties,
            }
            entities.append(ent)
            if len(entities) >= BATCH_SIZE:
                stmt = insert(analytics_entity_table).values(entities)
                conn.execute(stmt)
                entities = []

        # Flush any remaining partial batches:
        if len(members):
            conn.execute(insert(analytics_dataset_table).values(members))
        if len(countries):
            conn.execute(insert(analytics_country_table).values(countries))
        if len(entities):
            conn.execute(insert(analytics_entity_table).values(entities))
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver)
    DedupeApp.run(
        title="OpenSanction De-duplication",
        # log="textual.log",
        loader=db.view(dataset),
        resolver=resolver,
    )
def xref_datasets(base: Dataset, candidates: Dataset, limit: int = 15):
    resolver = get_resolver()
    resolver.prune()
    if candidates not in base.provided_datasets():
        raise RuntimeError("%r is not contained in %r" % (candidates, base))
    db = Database(base, resolver, cached=True)
    entities = db.view(candidates)
    loader = db.view(base)
    index = get_index(base, loader)
    xref(index, resolver, entities, limit=limit)
    resolver.save()
def blocking_xref(dataset: Dataset, limit: int = 5000):
    resolver = get_resolver()
    resolver.prune()
    db = Database(dataset, resolver, cached=True, external=True)
    loader = db.view(dataset)
    xref(
        loader,
        resolver,
        limit=limit,
        scored=True,
        auto_threshold=0.990,
        user=AUTO_USER,
    )
    resolver.save()
def export_dataset(dataset: Dataset, database: Database):
    """Dump the contents of the dataset to the output directory."""
    context = Context(dataset)
    context.bind()
    loader = database.view(dataset, export_assembler)
    exporters = [Exporter(context, loader) for Exporter in EXPORTERS]
    for entity in loader:
        for exporter in exporters:
            exporter.feed(entity)

    for exporter in exporters:
        exporter.finish()

    # Make sure the exported resources are visible in the database
    db.session.commit()

    # Export list of data issues from crawl stage
    issues_path = context.get_resource_path("issues.json")
    context.log.info("Writing dataset issues list", path=issues_path)
    with open(issues_path, "w", encoding=settings.ENCODING) as fh:
        data = {"issues": Issue.query(dataset=dataset).all()}
        write_json(data, fh)

    # Export full metadata
    index_path = context.get_resource_path("index.json")
    context.log.info("Writing dataset index", path=index_path)
    with open(index_path, "w", encoding=settings.ENCODING) as fh:
        meta = dataset.to_index()
        write_json(meta, fh)

    context.close()
def export(dataset):
    resolver = get_resolver()
    Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver, external=True)
    loader = db.view(dataset)

    async def run_app() -> None:
        app = DedupeApp(
            loader=loader,
            resolver=resolver,
            url_base="https://opensanctions.org/entities/%s/",
            title="OpenSanction De-duplication",
            log="textual.log",
        )  # type: ignore
        await app.process_messages()

    asyncio.run(run_app())
def run(dataset):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    for source in dataset.sources:
        Context(source).crawl()
    Statement.resolve_all(resolver)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()
def xref_internal(dataset: Dataset):
    resolver = get_resolver()
    resolver.prune()
    db = Database(dataset, resolver)
    loader = db.view(dataset)
    index = get_index(dataset, loader)
    suggested = 0
    for pair, score in index.pairs():
        left = loader.get_entity(str(pair[0]))
        right = loader.get_entity(str(pair[1]))
        if left is None or right is None:
            continue
        if left.schema not in right.schema.matchable_schemata:
            if right.schema not in left.schema.matchable_schemata:
                continue
        if not resolver.check_candidate(left.id, right.id):
            continue
        resolver.suggest(left.id, right.id, score)
        if suggested > 5000:
            break
        suggested += 1
    resolver.save()
def export_pairs(dataset: Dataset):
    resolver = get_resolver()
    db = Database(dataset, resolver, cached=True)
    datasets: Dict[str, Set[Dataset]] = defaultdict(set)
    for entity_id, ds in Statement.entities_datasets(dataset):
        dsa = Dataset.get(ds)
        if dsa is not None:
            datasets[entity_id].add(dsa)

    def get_parts(id):
        canonical_id = resolver.get_canonical(id)
        for ref in resolver.get_referents(canonical_id):
            for ds in datasets.get(ref, []):
                yield ref, ds

    pairs: Dict[Tuple[Tuple[str, Dataset], Tuple[str, Dataset]], Judgement] = {}
    for canonical_id in resolver.canonicals():
        parts = list(get_parts(canonical_id))
        for left, right in combinations(parts, 2):
            left, right = max(left, right), min(left, right)
            pairs[(left, right)] = Judgement.POSITIVE
        for edge in resolver.nodes[canonical_id]:
            if edge.judgement == Judgement.NEGATIVE:
                source_canonical = resolver.get_canonical(edge.source)
                other = edge.target if source_canonical == canonical_id else edge.source
                for other_part in get_parts(other):
                    for part in parts:
                        part, other_part = max(part, other_part), min(part, other_part)
                        pairs[(part, other_part)] = Judgement.NEGATIVE

    def get_partial(spec):
        id, ds = spec
        loader = db.view(ds)
        canonical = resolver.get_canonical(id)
        entity = loader.get_entity(canonical)
        if entity is not None:
            return entity.to_nested_dict(loader)

    for (left, right), judgement in pairs.items():
        # yield [left[0], right[0], judgement]
        left_entity = get_partial(left)
        right_entity = get_partial(right)
        if left_entity is not None and right_entity is not None:
            yield {
                "left": left_entity,
                "right": right_entity,
                "judgement": judgement,
            }
def export_dataset(dataset: Dataset, database: Database):
    """Dump the contents of the dataset to the output directory."""
    context = Context(dataset)
    try:
        loader = database.view(dataset, assemble)
        if dataset.type != External.TYPE:
            export_data(context, loader)

        # Export list of data issues from crawl stage
        issues_path = context.get_resource_path("issues.json")
        context.log.info("Writing dataset issues list", path=issues_path)
        with open(issues_path, "wb") as fh:
            with engine.begin() as conn:
                data = {"issues": list(all_issues(conn, dataset))}
                write_json(data, fh)

        # Export full metadata
        index_path = context.get_resource_path("index.json")
        context.log.info("Writing dataset index", path=index_path)
        with open(index_path, "wb") as fh:
            meta = dataset_to_index(dataset)
            write_json(meta, fh)
    finally:
        context.close()
def get_database() -> Database:
    return Database(get_scope(), resolver, cached=settings.CACHED)
def export_pairs(dataset: Dataset):
    resolver = get_resolver()
    db = Database(dataset, resolver, cached=True, external=True)
    datasets: Dict[str, Set[Dataset]] = defaultdict(set)
    with engine_read() as conn:
        for entity_id, ds in entities_datasets(conn, dataset):
            dsa = Dataset.get(ds)
            if dsa is not None:
                datasets[entity_id].add(dsa)

    def get_parts(id):
        canonical_id = resolver.get_canonical(id)
        for ref in resolver.get_referents(canonical_id):
            if ref.startswith(Identifier.PREFIX):
                continue
            for ds in datasets.get(ref, []):
                yield ref, ds

    pairs: Dict[Tuple[Tuple[str, Dataset], Tuple[str, Dataset]], Judgement] = {}
    for canonical_id in resolver.canonicals():
        parts = list(get_parts(canonical_id))
        for left, right in combinations(parts, 2):
            left, right = max(left, right), min(left, right)
            pairs[(left, right)] = Judgement.POSITIVE
        for edge in resolver.nodes[canonical_id]:
            if edge.judgement in (Judgement.NEGATIVE, Judgement.UNSURE):
                source_canonical = resolver.get_canonical(edge.source)
                other = edge.target if source_canonical == canonical_id else edge.source
                for other_part in get_parts(other):
                    for part in parts:
                        part, other_part = max(part, other_part), min(part, other_part)
                        # pairs[(part, other_part)] = edge.judgement
                        # Export unsure as negative:
                        pairs[(part, other_part)] = Judgement.NEGATIVE

    def get_partial(spec: Tuple[str, Dataset]) -> Optional[Dict[str, Any]]:
        id, ds = spec
        # HACK: EP is messing up phone and email-based matching
        if ds.name in (
            "everypolitician",
            "wd_curated",
            "wd_peppercat_leaders",
            "wd_peppercat_legislators",
            "us_cia_world_leaders",
            "eu_sanctions_map",
            "ca_dfatd_sema_sanctions",
            "ru_navalny35",
            "wd_oligarchs",
        ):
            return None
        loader = db.view(ds)
        canonical = resolver.get_canonical(id)
        entity = loader.get_entity(canonical)
        if entity is None:
            return None
        entity.id = id
        return entity.to_dict()

    for (left, right), judgement in pairs.items():
        # yield [left[0], right[0], judgement]
        left_entity = get_partial(left)
        right_entity = get_partial(right)
        if left_entity is not None and right_entity is not None:
            yield {
                "left": left_entity,
                "right": right_entity,
                "judgement": judgement,
            }
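# Hypothetical usage sketch, not taken from the original code: export_pairs is a
# generator, so its judgement pairs can be streamed to a JSON-lines file (e.g. as
# matcher training data). The helper name, dataset name "default", and output
# path are illustrative assumptions.
import json


def write_pairs_file(dataset_name: str, out_path: str) -> None:
    dataset = Dataset.require(dataset_name)
    with open(out_path, "w", encoding="utf-8") as fh:
        for pair in export_pairs(dataset):
            # default=str handles the Judgement enum in the "judgement" field.
            fh.write(json.dumps(pair, default=str) + "\n")


# write_pairs_file("default", "pairs.jsonl")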
def dump_dataset(dataset, outfile):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    loader = Database(dataset, resolver).view(dataset)
    for entity in loader:
        write_object(outfile, entity)