def run_pipeline(
    scope_name: str,
    crawl: bool = True,
    export: bool = True,
    threads: int = settings.THREADS,
) -> None:
    scope = Dataset.require(scope_name)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures: List[Future] = []
        if crawl:
            # Crawl each source dataset concurrently:
            for source in scope.sources:
                ctx = Context(source)
                futures.append(executor.submit(ctx.crawl))
            _compute_futures(futures)
        if export:
            resolver = get_resolver()
            with engine_tx() as conn:
                resolve_all_canonical(conn, resolver)
            database = Database(scope, resolver, cached=True)
            # Prime the cached database before the export workers use it:
            database.view(scope)
            futures = []
            for dataset_ in scope.datasets:
                futures.append(executor.submit(export_dataset, dataset_, database))
            futures.append(executor.submit(export_metadata))
            _compute_futures(futures)
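# NOTE: _compute_futures is used above but not defined in this snippet.
# A minimal sketch, assuming the helper only needs to block on the
# submitted tasks and surface worker exceptions; the real helper may
# also handle logging or cancellation:
from concurrent.futures import Future
from typing import List

def _compute_futures(futures: List[Future]) -> None:
    for future in futures:
        future.result()  # re-raises any exception from the worker thread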
def __init__(self, dataset):
    self.dataset = dataset
    self.path = settings.DATASET_PATH.joinpath(dataset.name)
    self.http = get_session()
    self.resolver = get_resolver()
    self.log = structlog.get_logger(dataset.name)
    self._statements = {}
def export(dataset):
    resolver = get_resolver()
    Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()
def explode(canonical_id):
    resolver = get_resolver()
    resolved_id = resolver.get_canonical(canonical_id)
    with engine_tx() as conn:
        for entity_id in resolver.explode(resolved_id):
            log.info("Restore separate entity", entity=entity_id)
            resolve_canonical(conn, resolver, entity_id)
    resolver.save()
def audit_resolver():
    wd = get_wikidata_enricher()
    resolver = get_resolver()
    log.info("Loading all entity IDs...")
    with engine_read() as conn:
        entities = list(entities_datasets(conn))
    entity_ids = {e for e, _ in entities}
    log.info("Loaded %d entity IDs..." % len(entity_ids))
    canonicals = list(resolver.canonicals())
    for idx, canonical in enumerate(canonicals):
        if idx > 0 and idx % 10000 == 0:
            log.info("Processed: %d..." % idx)
        members = resolver.connected(canonical)
        qids = set()
        ofacs = 0
        for member in members:
            if is_qid(member.id):
                qid = member.id
                item = wd.fetch_item(qid)
                if item is None:
                    log.error("Missing WD item", qid=qid)
                    # Skip the member: dereferencing a missing item
                    # below would raise an AttributeError.
                    continue
                if item.id != qid:
                    # The QID was redirected on Wikidata; re-fetch the
                    # target and record the redirect as a positive match.
                    item = wd.fetch_item(qid, cache_days=0)
                    judgement = resolver.get_judgement(member, item.id)
                    if judgement != Judgement.POSITIVE:
                        resolver.decide(
                            member,
                            item.id,
                            judgement=Judgement.POSITIVE,
                            user="******",
                        )
                    qid = item.id
                qids.add(qid)
            if member.id.startswith("ofac-"):
                ofacs += 1
            if member.id not in entity_ids:
                if member.canonical:
                    continue
                log.warning(
                    "Referenced entity does not exist",
                    canonical=canonical.id,
                    entity=member.id,
                )
                resolver.remove(member)
        if ofacs > 1:
            log.warning("More than one OFAC ID", id=canonical.id, size=len(members))
        if len(qids) > 1:
            log.error("Entity has more than one QID", qids=qids)
    resolver.save()
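# NOTE: is_qid is imported from elsewhere in the codebase. A minimal
# equivalent, assuming it only checks for the standard Wikidata
# "Q<digits>" identifier format:
def is_qid(value: str) -> bool:
    return value.startswith("Q") and value[1:].isnumeric()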
def run_enrich(scope_name: str, external_name: str, threshold: float):
    scope = Dataset.require(scope_name)
    external = Dataset.require(external_name)
    ctx = Context(external)
    resolver = get_resolver()
    database = Database(scope, resolver, cached=False)
    loader = database.view(scope)
    ctx.enrich(resolver, loader, threshold=threshold)
    resolver.save()
def index(dataset):
    resolver = get_resolver()
    # Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    loader = database.view(dataset)
    # Delete any stale index file before rebuilding:
    path = get_index_path(dataset)
    path.unlink(missing_ok=True)
    get_index(dataset, loader)
def build_analytics(dataset: Dataset):
    resolver = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, resolver)
    db = Database(dataset, resolver)
    loader = db.view(dataset)
    with engine_tx() as conn:
        # Rebuild the denormalised tables from scratch:
        conn.execute(delete(analytics_dataset_table))
        conn.execute(delete(analytics_country_table))
        conn.execute(delete(analytics_entity_table))
        entities: List[Dict[str, Any]] = []
        members: List[Dict[str, str]] = []
        countries: List[Dict[str, str]] = []
        for idx, entity in enumerate(loader):
            if idx > 0 and idx % 10000 == 0:
                log.info("Denormalised %d entities..." % idx)
            # Dataset memberships for this entity (the loop variable is
            # named ds so it does not shadow the dataset argument):
            for ds in Dataset.all():
                if len(entity.datasets.intersection(ds.scope_names)) > 0:
                    members.append({"entity_id": entity.id, "dataset": ds.name})
            if len(members) >= BATCH_SIZE:
                conn.execute(insert(analytics_dataset_table).values(members))
                members = []
            for country in entity.get_type_values(registry.country):
                countries.append({"entity_id": entity.id, "country": country})
            if len(countries) >= BATCH_SIZE:
                conn.execute(insert(analytics_country_table).values(countries))
                countries = []
            entities.append(
                {
                    "id": entity.id,
                    "schema": entity.schema.name,
                    "caption": entity.caption,
                    "target": entity.target,
                    "first_seen": entity.first_seen,
                    "last_seen": entity.last_seen,
                    "properties": entity.properties,
                }
            )
            if len(entities) >= BATCH_SIZE:
                conn.execute(insert(analytics_entity_table).values(entities))
                entities = []
        # Flush any remaining partial batches:
        if len(members):
            conn.execute(insert(analytics_dataset_table).values(members))
        if len(countries):
            conn.execute(insert(analytics_country_table).values(countries))
        if len(entities):
            conn.execute(insert(analytics_entity_table).values(entities))
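# NOTE: BATCH_SIZE is assumed to be a module-level constant sizing the
# bulk inserts above; the value here is a hypothetical placeholder:
BATCH_SIZE = 5000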
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver)
    DedupeApp.run(
        title="OpenSanctions De-duplication",
        # log="textual.log",
        loader=db.view(dataset),
        resolver=resolver,
    )
def run(dataset):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    for source in dataset.sources:
        Context(source).crawl()
    Statement.resolve_all(resolver)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()
def blocking_xref(dataset: Dataset, limit: int = 5000):
    resolver = get_resolver()
    resolver.prune()
    db = Database(dataset, resolver, cached=True, external=True)
    loader = db.view(dataset)
    xref(
        loader,
        resolver,
        limit=limit,
        scored=True,
        auto_threshold=0.990,
        user=AUTO_USER,
    )
    resolver.save()
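# NOTE: AUTO_USER is assumed to be a module-level constant recording who
# made the automatic merge decisions above threshold; the value is a
# hypothetical placeholder:
AUTO_USER = "auto-matcher"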
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver, external=True)
    loader = db.view(dataset)

    async def run_app() -> None:
        app = DedupeApp(
            loader=loader,
            resolver=resolver,
            url_base="https://opensanctions.org/entities/%s/",
            title="OpenSanctions De-duplication",
            log="textual.log",
        )  # type: ignore
        await app.process_messages()

    asyncio.run(run_app())
def merge(entity_ids):
    if len(entity_ids) < 2:
        return
    resolver = get_resolver()
    canonical_id = resolver.get_canonical(entity_ids[0])
    for other_id in entity_ids[1:]:
        other_id = Identifier.get(other_id)
        other_canonical_id = resolver.get_canonical(other_id)
        if other_canonical_id == canonical_id:
            # Already part of the same cluster.
            continue
        check = resolver.check_candidate(canonical_id, other_id)
        if not check:
            log.error(
                "Cannot merge",
                canonical_id=canonical_id,
                other_id=other_id,
                edge=resolver.get_resolved_edge(canonical_id, other_id),
            )
            return
        log.info("Merge: %s -> %s" % (other_id, canonical_id))
        canonical_id = resolver.decide(canonical_id, other_id, Judgement.POSITIVE)
    resolver.save()
    log.info("Canonical: %s" % canonical_id)
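# Usage sketch: fold several source entity IDs into one canonical
# cluster (the IDs below are hypothetical placeholders):
#
#   merge(["ofac-12345", "eu-fsf-9876", "Q42"])
#
# Each positive decision can mint a new canonical ID, which is why
# canonical_id is re-assigned from resolver.decide() on every step.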
def explode(canonical_id):
    resolver = get_resolver()
    resolved_id = resolver.get_canonical(canonical_id)
    resolver.explode(resolved_id)
    resolver.save()
    Statement.resolve_all(resolver)
def dump_dataset(dataset, outfile):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    loader = Database(dataset, resolver).view(dataset)
    for entity in loader:
        write_object(outfile, entity)
def xref_prune():
    resolver = get_resolver()
    resolver.prune()
    resolver.save()
def resolve():
    resolver = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, resolver)
def resolve():
    resolver = get_resolver()
    Statement.resolve_all(resolver)
from functools import cache

from followthemoney.schema import Schema
from followthemoney.property import Property
from nomenklatura.index.index import Index
from nomenklatura.loader import Loader
from opensanctions.model import Statement
from opensanctions.core.entity import Entity
from opensanctions.core.dataset import Dataset
from opensanctions.core.index import get_index as get_dataset_index
from opensanctions.core.resolver import get_resolver
from opensanctions.core.loader import Database
from osapi import settings
from osapi.models import FreebaseType
from osapi.models import FreebaseEntity, FreebaseProperty

resolver = get_resolver()


def get_scope() -> Dataset:
    scope = Dataset.get(settings.SCOPE_DATASET)
    if scope is None:
        raise RuntimeError("Cannot load dataset: %s" % settings.SCOPE_DATASET)
    return scope


@cache
def get_database() -> Database:
    return Database(get_scope(), resolver, cached=settings.CACHED)


@cache
def xref_prune(keep: int = 0):
    resolver = get_resolver()
    resolver.prune(keep=keep)
    resolver.save()