Example #1
def run_enrich(scope_name: str, external_name: str, threshold: float):
    scope = Dataset.require(scope_name)
    external = Dataset.require(external_name)
    ctx = Context(external)
    resolver = get_resolver()
    database = Database(scope, resolver, cached=False)
    loader = database.view(scope)
    ctx.enrich(resolver, loader, threshold=threshold)
    resolver.save()
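
A hypothetical invocation of the helper above (the dataset names and threshold are illustrative placeholders, not values from the project):

# Hypothetical usage: enrich a scope dataset against an external dataset,
# keeping match candidates that score at least 0.7.
run_enrich("default", "external_data", 0.7)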
Example #2
def export_global_index():
    """Export the global index for all datasets."""
    datasets = []
    for dataset in Dataset.all():
        datasets.append(dataset.to_index())

    issues_path = settings.DATASET_PATH.joinpath("issues.json")
    log.info("Writing global issues list", path=issues_path)
    with open(issues_path, "w", encoding=settings.ENCODING) as fh:
        data = {"issues": Issue.query().all()}
        write_json(data, fh)

    index_path = settings.DATASET_PATH.joinpath("index.json")
    log.info("Writing global index", datasets=len(datasets), path=index_path)
    with open(index_path, "w", encoding=settings.ENCODING) as fh:
        meta = {
            "datasets": datasets,
            "run_time": settings.RUN_TIME,
            "dataset_url": settings.DATASET_URL,
            "issues_url": urljoin(settings.DATASET_URL, "issues.json"),
            "model": model,
            "schemata": Statement.all_schemata(),
            "app": "opensanctions",
            "version": settings.VERSION,
        }
        write_json(meta, fh)
Example #3
def export_dataset(dataset: Dataset, database: Database):
    """Dump the contents of the dataset to the output directory."""
    context = Context(dataset)
    context.bind()
    loader = database.view(dataset, export_assembler)
    exporters = [Exporter(context, loader) for Exporter in EXPORTERS]
    for entity in loader:
        for exporter in exporters:
            exporter.feed(entity)

    for exporter in exporters:
        exporter.finish()

    # Make sure the exported resources are visible in the database
    db.session.commit()

    # Export list of data issues from crawl stage
    issues_path = context.get_resource_path("issues.json")
    context.log.info("Writing dataset issues list", path=issues_path)
    with open(issues_path, "w", encoding=settings.ENCODING) as fh:
        data = {"issues": Issue.query(dataset=dataset).all()}
        write_json(data, fh)

    # Export full metadata
    index_path = context.get_resource_path("index.json")
    context.log.info("Writing dataset index", path=index_path)
    with open(index_path, "w", encoding=settings.ENCODING) as fh:
        meta = dataset.to_index()
        write_json(meta, fh)

    context.close()
Example #4
def run_pipeline(
    scope_name: str,
    crawl: bool = True,
    export: bool = True,
    threads: int = settings.THREADS,
) -> None:
    scope = Dataset.require(scope_name)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures: List[Future] = []
        if crawl is True:
            for source in scope.sources:
                ctx = Context(source)
                futures.append(executor.submit(ctx.crawl))
            _compute_futures(futures)

        if export is True:
            resolver = get_resolver()
            with engine_tx() as conn:
                resolve_all_canonical(conn, resolver)
            database = Database(scope, resolver, cached=True)
            database.view(scope)
            futures = []
            for dataset_ in scope.datasets:
                futures.append(
                    executor.submit(export_dataset, dataset_, database))
            futures.append(executor.submit(export_metadata))
            _compute_futures(futures)
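
The `_compute_futures` helper is not shown in this example; a minimal sketch of what such a helper typically does (an assumption, not the project's actual code):

def _compute_futures(futures: List[Future]) -> None:
    # Block until every submitted task finishes; calling future.result()
    # re-raises any exception that occurred inside the worker thread.
    for future in futures:
        future.result()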
Example #5
def export(dataset):
    resolver = get_resolver()
    Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()
Example #6
def run(dataset):
    dataset = Dataset.get(dataset)
    for source in dataset.sources:
        Context(source).crawl()
    for dataset_ in dataset.datasets:
        context = Context(dataset_)
        context.normalize()
        context.export()
Example #7
def index(dataset):
    resolver = get_resolver()
    # Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    loader = database.view(dataset)
    path = get_index_path(dataset)
    path.unlink(missing_ok=True)
    get_index(dataset, loader)
Example #8
def run(dataset):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    for source in dataset.sources:
        Context(source).crawl()
    Statement.resolve_all(resolver)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()
Example #9
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver)
    DedupeApp.run(
        title="OpenSanction De-duplication",
        # log="textual.log",
        loader=db.view(dataset),
        resolver=resolver,
    )
Example #10
File: cli.py, Project: nightsh/opennames
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver, external=True)
    loader = db.view(dataset)

    async def run_app() -> None:
        app = DedupeApp(
            loader=loader,
            resolver=resolver,
            url_base="https://opensanctions.org/entities/%s/",
            title="OpenSanction De-duplication",
            log="textual.log",
        )  # type: ignore
        await app.process_messages()

    asyncio.run(run_app())
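
Compared to Example #9, this variant constructs the textual DedupeApp directly and drives its message loop via asyncio.run() rather than the blocking DedupeApp.run() helper; it also opens the database view with external=True, presumably to include externally-sourced match candidates.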
Example #11
    def feed(self, entity: Entity):
        if not entity.target:
            return
        countries = set(entity.get_type_values(registry.country))
        identifiers = set(entity.get_type_values(registry.identifier))
        names = set(entity.get_type_values(registry.name))
        names.discard(entity.caption)
        sanctions = set()
        addresses = set(entity.get("address"))

        for _, adjacent in self.loader.get_adjacent(entity):
            if adjacent.schema.is_a("Sanction"):
                sanctions.add(self.sanction_text(adjacent))

            if adjacent.schema.is_a("Address"):
                addresses.add(adjacent.caption)

            if adjacent.schema.is_a("Identification"):
                identifiers.update(adjacent.get("number"))
                countries.update(adjacent.get("country"))

        datasets: List[str] = []
        for dataset in entity.datasets:
            ds = Dataset.require(dataset)
            datasets.append(ds.title)
        row = [
            entity.id,
            entity.schema.name,
            entity.caption,
            self.concat_values(names),
            self.concat_values(entity.get("birthDate", quiet=True)),
            self.concat_values(countries),
            self.concat_values(addresses),
            self.concat_values(identifiers),
            self.concat_values(sanctions),
            self.concat_values(entity.get_type_values(registry.phone)),
            self.concat_values(entity.get_type_values(registry.email)),
            self.concat_values(datasets),
            entity.first_seen,
            entity.last_seen,
        ]
        self.writer.writerow(row)
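
The `concat_values` and `sanction_text` helpers are not part of this snippet. A plausible sketch of `concat_values` (an assumption about its behavior, not the project's implementation):

    def concat_values(self, values) -> str:
        # Collapse a collection of property values into a single CSV cell.
        return "; ".join(sorted(values))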
Example #12
File: cli.py, Project: nightsh/opennames
import click
import logging
from structlog import get_logger  # assumed imports; the snippet uses click, logging and get_logger below

from opensanctions.core import Dataset, Context, setup
from opensanctions.exporters.statements import export_statements_path
from opensanctions.exporters.statements import import_statements_path
from opensanctions.core.audit import audit_resolver
from opensanctions.core.loader import Database
from opensanctions.core.resolver import AUTO_USER, export_pairs, get_resolver
from opensanctions.core.xref import blocking_xref
from opensanctions.core.statements import max_last_seen
from opensanctions.core.statements import resolve_all_canonical, resolve_canonical
from opensanctions.core.analytics import build_analytics
from opensanctions.core.db import engine_tx
from opensanctions.processing import run_enrich, run_pipeline
from opensanctions.util import write_json

log = get_logger(__name__)
datasets = click.Choice(Dataset.names())


@click.group(help="OpenSanctions ETL toolkit")
@click.option("-v", "--verbose", is_flag=True, default=False)
@click.option("-q", "--quiet", is_flag=True, default=False)
def cli(verbose=False, quiet=False):
    level = logging.INFO
    if quiet:
        level = logging.WARNING
    if verbose:
        level = logging.DEBUG
    setup(log_level=level)


@cli.command("crawl", help="Crawl entities into the given dataset")
Example #13
def xref_int(dataset):
    xref_internal(Dataset.require(dataset))
Example #14
def export_pairs_(dataset, outfile):
    dataset = Dataset.require(dataset)
    for obj in export_pairs(dataset):
        write_object(outfile, obj)
Example #15
File: cli.py, Project: nightsh/opennames
def xref(dataset, limit):
    dataset = Dataset.require(dataset)
    blocking_xref(dataset, limit=limit)
Example #16
def xref(base, candidates, limit=15):
    base_dataset = Dataset.require(base)
    candidates_dataset = Dataset.require(candidates)
    xref_datasets(base_dataset, candidates_dataset, limit=limit)
Example #17
def dump_dataset(dataset, outfile):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    loader = Database(dataset, resolver).view(dataset)
    for entity in loader:
        write_object(outfile, entity)
Example #18
def crawl(dataset):
    dataset = Dataset.get(dataset)
    for source in dataset.sources:
        Context(source).crawl()
Example #19
@click.group(help="OpenSanctions ETL toolkit")
@click.option("-v", "--verbose", is_flag=True, default=False)
@click.option("-q", "--quiet", is_flag=True, default=False)
def cli(verbose=False, quiet=False):
    level = logging.INFO
    if quiet:
        level = logging.ERROR
    if verbose:
        level = logging.DEBUG
    setup(log_level=level)


@cli.command("dump", help="Export the entities from a dataset")
@click.argument("dataset",
                default=Dataset.ALL,
                type=click.Choice(Dataset.names()))
@click.option("-o", "--outfile", type=click.File("w"), default="-")
def dump_dataset(dataset, outfile):
    dataset = Dataset.get(dataset)
    context = Context(dataset)
    context.normalize()
    for entity in dataset.store:
        write_object(outfile, entity)


@cli.command("crawl", help="Crawl entities into the given dataset")
@click.argument("dataset",
                default=Dataset.ALL,
                type=click.Choice(Dataset.names()))
def crawl(dataset):
    dataset = Dataset.get(dataset)
Example #20
def crawl(dataset):
    dataset = Dataset.require(dataset)
    for source in dataset.sources:
        Context(source).crawl()
Example #21
def lookup(name, value):
    # We don't want to duplicate the lookup configs in both YAML files,
    # so we're hard-coding that lookups go against the SDN config.
    sdn = Dataset.require("us_ofac_sdn")
    return sdn.lookups.get(name).match(value)
Example #22
File: cli.py, Project: nightsh/opennames
def export_pairs_(dataset, outfile):
    dataset = Dataset.require(dataset)
    for obj in export_pairs(dataset):
        write_json(obj, outfile)
Example #23
def dump_dataset(dataset, outfile):
    dataset = Dataset.get(dataset)
    for source in dataset.sources:
        # TODO: consolidate the data
        for entity in source.store:
            write_object(outfile, entity)
Example #24
File: cli.py, Project: nightsh/opennames
def latest(dataset):
    ds = Dataset.require(dataset)
    with engine_tx() as conn:
        latest = max_last_seen(conn, ds)
        if latest is not None:
            print(latest.isoformat())
Example #25
def dump_dataset(dataset, outfile):
    dataset = Dataset.get(dataset)
    context = Context(dataset)
    context.normalize()
    for entity in dataset.store:
        write_object(outfile, entity)
Example #26
File: cli.py, Project: nightsh/opennames
def build_analytics_(dataset):
    ds = Dataset.require(dataset)
    build_analytics(ds)
Example #27
def export(dataset):
    dataset = Dataset.get(dataset)
    for dataset_ in dataset.datasets:
        context = Context(dataset_)
        context.normalize()
        context.export()
Example #28
File: cli.py, Project: nightsh/opennames
def clear(dataset):
    dataset = Dataset.require(dataset)
    for source in dataset.sources:
        Context(source).clear()
Example #29
import json
from banal import ensure_list
from functools import lru_cache
from pantomime.types import JSON
from requests.exceptions import TooManyRedirects

from opensanctions.core import Dataset
from opensanctions import helpers as h

FORMATS = ["%d %b %Y", "%d %B %Y", "%Y", "%b %Y", "%B %Y"]
SDN = Dataset.require("us_ofac_sdn")


@lru_cache(maxsize=None)
def deref_url(context, url):
    try:
        res = context.http.get(url, stream=True)
        return res.url
    except TooManyRedirects:
        return url


def parse_result(context, result):
    type_ = result.pop("type", None)
    schema = context.lookup_value("type", type_)
    if schema is None:
        context.log.error("Unknown result type", type=type_)
        return
    entity = context.make(schema)
    entity.id = context.make_slug(result.pop("id"))
Example #30
class IndexResponse(BaseModel):
    datasets: List[str] = Dataset.names()
    model: ModelToDict
    terms: int = Field(..., example=23)
    tokens: int = Field(..., example=42)
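
A hypothetical instantiation of this response model (field values are illustrative; `model_data` is a placeholder for a serialized followthemoney model):

# Hypothetical usage: the terms/tokens values mirror the Field examples above.
response = IndexResponse(model=model_data, terms=23, tokens=42)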