Example #1
def run_pipeline(
    scope_name: str,
    crawl: bool = True,
    export: bool = True,
    threads: int = settings.THREADS,
) -> None:
    scope = Dataset.require(scope_name)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures: List[Future] = []
        if crawl:
            for source in scope.sources:
                ctx = Context(source)
                futures.append(executor.submit(ctx.crawl))
            _compute_futures(futures)

        if export:
            resolver = get_resolver()
            with engine_tx() as conn:
                resolve_all_canonical(conn, resolver)
            database = Database(scope, resolver, cached=True)
            database.view(scope)  # return value discarded; presumably warms the cached Database
            futures = []
            for dataset_ in scope.datasets:
                futures.append(
                    executor.submit(export_dataset, dataset_, database))
            futures.append(executor.submit(export_metadata))
            _compute_futures(futures)
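Example #1 calls a helper, _compute_futures, that is not shown in the snippet. A minimal sketch of what it plausibly does, assuming it simply waits for each submitted future and re-raises any exception from the worker thread (the implementation below is an inference, not taken from the project):

from concurrent.futures import Future
from typing import List


def _compute_futures(futures: List[Future]) -> None:
    # Block until every submitted task finishes; calling result() re-raises
    # any exception that occurred inside the worker thread.
    for future in futures:
        future.result()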
Example #2
def index(dataset):
    resolver = get_resolver()
    # Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    loader = database.view(dataset)
    path = get_index_path(dataset)
    path.unlink(missing_ok=True)
    get_index(dataset, loader)
Example #3
def run_enrich(scope_name: str, external_name: str, threshold: float):
    scope = Dataset.require(scope_name)
    external = Dataset.require(external_name)
    ctx = Context(external)
    resolver = get_resolver()
    database = Database(scope, resolver, cached=False)
    loader = database.view(scope)
    ctx.enrich(resolver, loader, threshold=threshold)
    resolver.save()
Example #4
def build_analytics(dataset: Dataset):
    resolver = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, resolver)
    db = Database(dataset, resolver)
    loader = db.view(dataset)
    with engine_tx() as conn:
        conn.execute(delete(analytics_dataset_table))
        conn.execute(delete(analytics_country_table))
        conn.execute(delete(analytics_entity_table))

        entities: List[Dict[str, Any]] = []
        members: List[Dict[str, str]] = []
        countries: List[Dict[str, str]] = []
        for idx, entity in enumerate(loader):
            if idx > 0 and idx % 10000 == 0:
                log.info("Denormalised %d entities..." % idx)

            for ds in Dataset.all():
                if len(entity.datasets.intersection(ds.scope_names)) > 0:
                    members.append({"entity_id": entity.id, "dataset": ds.name})

            if len(members) >= BATCH_SIZE:
                stmt = insert(analytics_dataset_table).values(members)
                conn.execute(stmt)
                members = []

            for country in entity.get_type_values(registry.country):
                countries.append({"entity_id": entity.id, "country": country})

            if len(countries) >= BATCH_SIZE:
                stmt = insert(analytics_country_table).values(countries)
                conn.execute(stmt)
                countries = []

            ent = {
                "id": entity.id,
                "schema": entity.schema.name,
                "caption": entity.caption,
                "target": entity.target,
                "first_seen": entity.first_seen,
                "last_seen": entity.last_seen,
                "properties": entity.properties,
            }
            entities.append(ent)

            if len(entities) >= BATCH_SIZE:
                stmt = insert(analytics_entity_table).values(entities)
                conn.execute(stmt)
                entities = []

        if len(members):
            conn.execute(insert(analytics_dataset_table).values(members))

        if len(countries):
            conn.execute(insert(analytics_country_table).values(countries))

        if len(entities):
            conn.execute(insert(analytics_entity_table).values(entities))
Example #5
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver)
    DedupeApp.run(
        title="OpenSanction De-duplication",
        # log="textual.log",
        loader=db.view(dataset),
        resolver=resolver,
    )
Example #6
def xref_datasets(base: Dataset, candidates: Dataset, limit: int = 15):
    resolver = get_resolver()
    resolver.prune()
    if candidates not in base.provided_datasets():
        raise RuntimeError("%r is not contained in %r" % (candidates, base))
    db = Database(base, resolver, cached=True)
    entities = db.view(candidates)
    loader = db.view(base)
    index = get_index(base, loader)
    xref(index, resolver, entities, limit=limit)
    resolver.save()
Example #7
def blocking_xref(dataset: Dataset, limit: int = 5000):
    resolver = get_resolver()
    resolver.prune()
    db = Database(dataset, resolver, cached=True, external=True)
    loader = db.view(dataset)
    xref(
        loader,
        resolver,
        limit=limit,
        scored=True,
        auto_threshold=0.990,
        user=AUTO_USER,
    )
    resolver.save()
Example #8
def export_dataset(dataset: Dataset, database: Database):
    """Dump the contents of the dataset to the output directory."""
    context = Context(dataset)
    context.bind()
    loader = database.view(dataset, export_assembler)
    exporters = [Exporter(context, loader) for Exporter in EXPORTERS]
    for entity in loader:
        for exporter in exporters:
            exporter.feed(entity)

    for exporter in exporters:
        exporter.finish()

    # Make sure the exported resources are visible in the database.
    # NOTE: `db` is assumed to be a module-level database/session object
    # imported by the surrounding module, not the `database` argument.
    db.session.commit()

    # Export list of data issues from crawl stage
    issues_path = context.get_resource_path("issues.json")
    context.log.info("Writing dataset issues list", path=issues_path)
    with open(issues_path, "w", encoding=settings.ENCODING) as fh:
        data = {"issues": Issue.query(dataset=dataset).all()}
        write_json(data, fh)

    # Export full metadata
    index_path = context.get_resource_path("index.json")
    context.log.info("Writing dataset index", path=index_path)
    with open(index_path, "w", encoding=settings.ENCODING) as fh:
        meta = dataset.to_index()
        write_json(meta, fh)

    context.close()
Example #9
def export(dataset):
    resolver = get_resolver()
    Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()
Example #10
File: cli.py Project: nightsh/opennames
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver, external=True)
    loader = db.view(dataset)

    async def run_app() -> None:
        app = DedupeApp(
            loader=loader,
            resolver=resolver,
            url_base="https://opensanctions.org/entities/%s/",
            title="OpenSanction De-duplication",
            log="textual.log",
        )  # type: ignore
        await app.process_messages()

    asyncio.run(run_app())
Example #11
def run(dataset):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    for source in dataset.sources:
        Context(source).crawl()
    Statement.resolve_all(resolver)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()
Example #12
def xref_internal(dataset: Dataset):
    resolver = get_resolver()
    resolver.prune()
    db = Database(dataset, resolver)
    loader = db.view(dataset)
    index = get_index(dataset, loader)
    suggested = 0
    for pair, score in index.pairs():
        left = loader.get_entity(str(pair[0]))
        right = loader.get_entity(str(pair[1]))
        if left is None or right is None:
            continue
        if left.schema not in right.schema.matchable_schemata:
            if right.schema not in left.schema.matchable_schemata:
                continue
        if not resolver.check_candidate(left.id, right.id):
            continue
        resolver.suggest(left.id, right.id, score)
        if suggested > 5000:
            break
        suggested += 1
    resolver.save()
Example #13
def export_pairs(dataset: Dataset):
    resolver = get_resolver()
    db = Database(dataset, resolver, cached=True)
    datasets: Dict[str, Set[Dataset]] = defaultdict(set)
    for entity_id, ds in Statement.entities_datasets(dataset):
        dsa = Dataset.get(ds)
        if dsa is not None:
            datasets[entity_id].add(dsa)

    def get_parts(id):
        canonical_id = resolver.get_canonical(id)
        for ref in resolver.get_referents(canonical_id):
            for ds in datasets.get(ref, []):
                yield ref, ds

    pairs: Dict[Tuple[Tuple[str, Dataset], Tuple[str, Dataset]],
                Judgement] = {}
    for canonical_id in resolver.canonicals():
        parts = list(get_parts(canonical_id))
        for left, right in combinations(parts, 2):
            left, right = max(left, right), min(left, right)
            pairs[(left, right)] = Judgement.POSITIVE
        for edge in resolver.nodes[canonical_id]:
            if edge.judgement == Judgement.NEGATIVE:
                source_canonical = resolver.get_canonical(edge.source)
                other = edge.target if source_canonical == canonical_id else edge.source
                for other_part in get_parts(other):
                    for part in parts:
                        part, other_part = max(part, other_part), min(
                            part, other_part)
                        pairs[(part, other_part)] = Judgement.NEGATIVE

    def get_partial(spec):
        id, ds = spec
        loader = db.view(ds)
        canonical = resolver.get_canonical(id)
        entity = loader.get_entity(canonical)
        if entity is not None:
            return entity.to_nested_dict(loader)

    for (left, right), judgement in pairs.items():
        # yield [left[0], right[0], judgement]
        left_entity = get_partial(left)
        right_entity = get_partial(right)
        if left_entity is not None and right_entity is not None:
            yield {
                "left": left_entity,
                "right": right_entity,
                "judgement": judgement
            }
Example #14
def export_dataset(dataset: Dataset, database: Database):
    """Dump the contents of the dataset to the output directory."""
    context = Context(dataset)
    try:
        loader = database.view(dataset, assemble)
        if dataset.type != External.TYPE:
            export_data(context, loader)

        # Export list of data issues from crawl stage
        issues_path = context.get_resource_path("issues.json")
        context.log.info("Writing dataset issues list", path=issues_path)
        with open(issues_path, "wb") as fh:
            with engine.begin() as conn:
                data = {"issues": list(all_issues(conn, dataset))}
            write_json(data, fh)

        # Export full metadata
        index_path = context.get_resource_path("index.json")
        context.log.info("Writing dataset index", path=index_path)
        with open(index_path, "wb") as fh:
            meta = dataset_to_index(dataset)
            write_json(meta, fh)
    finally:
        context.close()
Example #15
def get_database() -> Database:
    return Database(get_scope(), resolver, cached=settings.CACHED)
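The recurring pattern across these examples is to build a Database over a dataset and a resolver, call view() to obtain a loader, and then iterate the loader's entities (compare Examples #8 and #17). A minimal sketch of that pattern, assuming get_scope(), get_resolver() and settings.CACHED behave as in the snippets above; iter_scope_entities is a hypothetical helper name:

def iter_scope_entities():
    resolver = get_resolver()
    scope = get_scope()
    database = Database(scope, resolver, cached=settings.CACHED)
    loader = database.view(scope)
    # The loader yields the scope's entities with resolver canonicalisation applied.
    for entity in loader:
        yield entity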
Example #16
def export_pairs(dataset: Dataset):
    resolver = get_resolver()
    db = Database(dataset, resolver, cached=True, external=True)
    datasets: Dict[str, Set[Dataset]] = defaultdict(set)
    with engine_read() as conn:
        for entity_id, ds in entities_datasets(conn, dataset):
            dsa = Dataset.get(ds)
            if dsa is not None:
                datasets[entity_id].add(dsa)

    def get_parts(id):
        canonical_id = resolver.get_canonical(id)
        for ref in resolver.get_referents(canonical_id):
            if ref.startswith(Identifier.PREFIX):
                continue
            for ds in datasets.get(ref, []):
                yield ref, ds

    pairs: Dict[Tuple[Tuple[str, Dataset], Tuple[str, Dataset]],
                Judgement] = {}
    for canonical_id in resolver.canonicals():
        parts = list(get_parts(canonical_id))
        for left, right in combinations(parts, 2):
            left, right = max(left, right), min(left, right)
            pairs[(left, right)] = Judgement.POSITIVE
        for edge in resolver.nodes[canonical_id]:
            if edge.judgement in (Judgement.NEGATIVE, Judgement.UNSURE):
                source_canonical = resolver.get_canonical(edge.source)
                other = edge.target if source_canonical == canonical_id else edge.source
                for other_part in get_parts(other):
                    for part in parts:
                        part, other_part = max(part, other_part), min(
                            part, other_part)
                        # pairs[(part, other_part)] = edge.judgement
                        # Export unsure as negative:
                        pairs[(part, other_part)] = Judgement.NEGATIVE

    def get_partial(spec: Tuple[str, Dataset]) -> Optional[Dict[str, Any]]:
        id, ds = spec
        # HACK: EP is messing up phone and email-based matching
        if ds.name in (
                "everypolitician",
                "wd_curated",
                "wd_peppercat_leaders",
                "wd_peppercat_legislators",
                "us_cia_world_leaders",
                "eu_sanctions_map",
                "ca_dfatd_sema_sanctions",
                "ru_navalny35",
                "wd_oligarchs",
        ):
            return None
        loader = db.view(ds)
        canonical = resolver.get_canonical(id)
        entity = loader.get_entity(canonical)
        if entity is None:
            return None
        entity.id = id
        return entity.to_dict()

    for (left, right), judgement in pairs.items():
        # yield [left[0], right[0], judgement]
        left_entity = get_partial(left)
        right_entity = get_partial(right)
        if left_entity is not None and right_entity is not None:
            yield {
                "left": left_entity,
                "right": right_entity,
                "judgement": judgement
            }
Example #17
def dump_dataset(dataset, outfile):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    loader = Database(dataset, resolver).view(dataset)
    for entity in loader:
        write_object(outfile, entity)