Example #1
def run_enrich(scope_name: str, external_name: str, threshold: float):
    scope = Dataset.require(scope_name)
    external = Dataset.require(external_name)
    ctx = Context(external)
    resolver = get_resolver()
    database = Database(scope, resolver, cached=False)
    loader = database.view(scope)
    ctx.enrich(resolver, loader, threshold=threshold)
    resolver.save()
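For context, a call along the following lines would enrich one dataset against an external one and persist any new resolver decisions; the dataset names and threshold below are hypothetical, not taken from the project.

# Hypothetical invocation; "sanctions" and "opencorporates" are placeholder dataset names.
run_enrich("sanctions", "opencorporates", threshold=0.7)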
Example #2
def run_pipeline(
    scope_name: str,
    crawl: bool = True,
    export: bool = True,
    threads: int = settings.THREADS,
) -> None:
    scope = Dataset.require(scope_name)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures: List[Future] = []
        if crawl:
            for source in scope.sources:
                ctx = Context(source)
                futures.append(executor.submit(ctx.crawl))
            _compute_futures(futures)

        if export:
            resolver = get_resolver()
            with engine_tx() as conn:
                resolve_all_canonical(conn, resolver)
            database = Database(scope, resolver, cached=True)
            # Build the scope-wide view once; the result is discarded, so this
            # likely just warms the cached database before the exports run.
            database.view(scope)
            futures = []
            for dataset_ in scope.datasets:
                futures.append(
                    executor.submit(export_dataset, dataset_, database))
            futures.append(executor.submit(export_metadata))
            _compute_futures(futures)
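The pipeline above calls a _compute_futures helper that is not shown in this excerpt. A minimal sketch, assuming it simply blocks on each future and re-raises any exception from the worker thread, could look like this:

from concurrent.futures import Future
from typing import List


def _compute_futures(futures: List[Future]) -> None:
    # Hypothetical helper: wait for every submitted task and surface
    # any exception raised inside the worker thread.
    for future in futures:
        future.result()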
Example #3
def export(dataset):
    resolver = get_resolver()
    Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()
Example #4
def index(dataset):
    resolver = get_resolver()
    # Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    loader = database.view(dataset)
    path = get_index_path(dataset)
    path.unlink(missing_ok=True)
    get_index(dataset, loader)
Example #5
def run(dataset):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    for source in dataset.sources:
        Context(source).crawl()
    Statement.resolve_all(resolver)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()
Example #6
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver)
    DedupeApp.run(
        title="OpenSanction De-duplication",
        # log="textual.log",
        loader=db.view(dataset),
        resolver=resolver,
    )
Example #7
File: cli.py Project: nightsh/opennames
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver, external=True)
    loader = db.view(dataset)

    async def run_app() -> None:
        app = DedupeApp(
            loader=loader,
            resolver=resolver,
            url_base="https://opensanctions.org/entities/%s/",
            title="OpenSanction De-duplication",
            log="textual.log",
        )  # type: ignore
        await app.process_messages()

    asyncio.run(run_app())
Example #8
    def feed(self, entity: Entity):
        # Only targeted entities are written to the CSV; everything else is skipped.
        if not entity.target:
            return
        countries = set(entity.get_type_values(registry.country))
        identifiers = set(entity.get_type_values(registry.identifier))
        names = set(entity.get_type_values(registry.name))
        names.discard(entity.caption)
        sanctions = set()
        addresses = set(entity.get("address"))

        for _, adjacent in self.loader.get_adjacent(entity):
            if adjacent.schema.is_a("Sanction"):
                sanctions.add(self.sanction_text(adjacent))

            if adjacent.schema.is_a("Address"):
                addresses.add(adjacent.caption)

            if adjacent.schema.is_a("Identification"):
                identifiers.update(adjacent.get("number"))
                countries.update(adjacent.get("country"))

        datasets: List[str] = []
        for dataset in entity.datasets:
            ds = Dataset.require(dataset)
            datasets.append(ds.title)
        row = [
            entity.id,
            entity.schema.name,
            entity.caption,
            self.concat_values(names),
            self.concat_values(entity.get("birthDate", quiet=True)),
            self.concat_values(countries),
            self.concat_values(addresses),
            self.concat_values(identifiers),
            self.concat_values(sanctions),
            self.concat_values(entity.get_type_values(registry.phone)),
            self.concat_values(entity.get_type_values(registry.email)),
            self.concat_values(datasets),
            entity.first_seen,
            entity.last_seen,
        ]
        self.writer.writerow(row)
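This exporter also relies on a concat_values method that is not part of the excerpt. A plausible sketch, assuming it only collapses a collection of values into a single delimited CSV cell, might be:

    def concat_values(self, values) -> str:
        # Hypothetical sketch: de-duplicate and join values for one CSV cell.
        return "; ".join(sorted(set(values)))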
Example #9
File: cli.py Project: nightsh/opennames
def xref(dataset, limit):
    dataset = Dataset.require(dataset)
    blocking_xref(dataset, limit=limit)
Example #10
File: cli.py Project: nightsh/opennames
def clear(dataset):
    dataset = Dataset.require(dataset)
    for source in dataset.sources:
        Context(source).clear()
Example #11
File: cli.py Project: nightsh/opennames
def build_analytics_(dataset):
    ds = Dataset.require(dataset)
    build_analytics(ds)
Example #12
File: cli.py Project: nightsh/opennames
def latest(dataset):
    ds = Dataset.require(dataset)
    with engine_tx() as conn:
        latest = max_last_seen(conn, ds)
        if latest is not None:
            print(latest.isoformat())
Example #13
def xref_int(dataset):
    xref_internal(Dataset.require(dataset))
Example #14
def crawl(dataset):
    dataset = Dataset.require(dataset)
    for source in dataset.sources:
        Context(source).crawl()
Example #15
def dump_dataset(dataset, outfile):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    loader = Database(dataset, resolver).view(dataset)
    for entity in loader:
        write_object(outfile, entity)
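A typical call would pass an open, writable file handle; the scope name and output path below are only illustrative:

# Hypothetical usage: stream every entity in the dataset to a file, one serialized object at a time.
with open("entities.json", "w", encoding="utf-8") as outfile:
    dump_dataset("default", outfile)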
Example #16
def export_pairs_(dataset, outfile):
    dataset = Dataset.require(dataset)
    for obj in export_pairs(dataset):
        write_object(outfile, obj)
Example #17
def lookup(name, value):
    # We don't want to duplicate the lookup configs in both YAML files,
    # so we're hard-coding that lookups go against the SDN config.
    sdn = Dataset.require("us_ofac_sdn")
    return sdn.lookups.get(name).match(value)
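The returned object comes from the us_ofac_sdn lookup configuration, so its fields depend on that YAML file; a hedged usage sketch (the lookup name and raw value are made up) might be:

# Hypothetical usage: both the lookup name and the value are placeholders.
result = lookup("type", "Individual")
if result is not None:
    # The match result exposes whatever fields the YAML lookup defines.
    print(result)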
Example #18
File: cli.py Project: nightsh/opennames
def export_pairs_(dataset, outfile):
    dataset = Dataset.require(dataset)
    for obj in export_pairs(dataset):
        write_json(obj, outfile)
Example #19
import json
from banal import ensure_list
from functools import lru_cache
from pantomime.types import JSON
from requests.exceptions import TooManyRedirects

from opensanctions.core import Dataset
from opensanctions import helpers as h

FORMATS = ["%d %b %Y", "%d %B %Y", "%Y", "%b %Y", "%B %Y"]
SDN = Dataset.require("us_ofac_sdn")


@lru_cache(maxsize=None)
def deref_url(context, url):
    # Follow redirects and return the final URL, caching the result so the same
    # link is only fetched once; fall back to the original URL if the redirect
    # chain is too long.
    try:
        res = context.http.get(url, stream=True)
        return res.url
    except TooManyRedirects:
        return url


def parse_result(context, result):
    type_ = result.pop("type", None)
    schema = context.lookup_value("type", type_)
    if schema is None:
        context.log.error("Unknown result type", type=type_)
        return
    entity = context.make(schema)
    entity.id = context.make_slug(result.pop("id"))
Example #20
def xref(base, candidates, limit=15):
    base_dataset = Dataset.require(base)
    candidates_dataset = Dataset.require(candidates)
    xref_datasets(base_dataset, candidates_dataset, limit=limit)