def run_enrich(scope_name: str, external_name: str, threshold: float):
    scope = Dataset.require(scope_name)
    external = Dataset.require(external_name)
    ctx = Context(external)
    resolver = get_resolver()
    database = Database(scope, resolver, cached=False)
    loader = database.view(scope)
    ctx.enrich(resolver, loader, threshold=threshold)
    resolver.save()

def export_global_index():
    """Export the global index for all datasets."""
    datasets = []
    for dataset in Dataset.all():
        datasets.append(dataset.to_index())

    # Write the combined list of crawl issues across all datasets
    issues_path = settings.DATASET_PATH.joinpath("issues.json")
    log.info("Writing global issues list", path=issues_path)
    with open(issues_path, "w", encoding=settings.ENCODING) as fh:
        data = {"issues": Issue.query().all()}
        write_json(data, fh)

    # Write the global index with per-dataset metadata and catalog information
    index_path = settings.DATASET_PATH.joinpath("index.json")
    log.info("Writing global index", datasets=len(datasets), path=index_path)
    with open(index_path, "w", encoding=settings.ENCODING) as fh:
        meta = {
            "datasets": datasets,
            "run_time": settings.RUN_TIME,
            "dataset_url": settings.DATASET_URL,
            "issues_url": urljoin(settings.DATASET_URL, "issues.json"),
            "model": model,
            "schemata": Statement.all_schemata(),
            "app": "opensanctions",
            "version": settings.VERSION,
        }
        write_json(meta, fh)

def export_dataset(dataset: Dataset, database: Database):
    """Dump the contents of the dataset to the output directory."""
    context = Context(dataset)
    context.bind()
    loader = database.view(dataset, export_assembler)
    exporters = [Exporter(context, loader) for Exporter in EXPORTERS]
    for entity in loader:
        for exporter in exporters:
            exporter.feed(entity)

    for exporter in exporters:
        exporter.finish()

    # Make sure the exported resources are visible in the database
    db.session.commit()

    # Export list of data issues from crawl stage
    issues_path = context.get_resource_path("issues.json")
    context.log.info("Writing dataset issues list", path=issues_path)
    with open(issues_path, "w", encoding=settings.ENCODING) as fh:
        data = {"issues": Issue.query(dataset=dataset).all()}
        write_json(data, fh)

    # Export full metadata
    index_path = context.get_resource_path("index.json")
    context.log.info("Writing dataset index", path=index_path)
    with open(index_path, "w", encoding=settings.ENCODING) as fh:
        meta = dataset.to_index()
        write_json(meta, fh)

    context.close()

def run_pipeline(
    scope_name: str,
    crawl: bool = True,
    export: bool = True,
    threads: int = settings.THREADS,
) -> None:
    scope = Dataset.require(scope_name)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures: List[Future] = []
        if crawl is True:
            # Crawl every source dataset in the scope concurrently
            for source in scope.sources:
                ctx = Context(source)
                futures.append(executor.submit(ctx.crawl))
            _compute_futures(futures)

        if export is True:
            # Apply canonical de-duplication decisions, then export each dataset
            resolver = get_resolver()
            with engine_tx() as conn:
                resolve_all_canonical(conn, resolver)
            database = Database(scope, resolver, cached=True)
            database.view(scope)
            futures = []
            for dataset_ in scope.datasets:
                futures.append(executor.submit(export_dataset, dataset_, database))
            futures.append(executor.submit(export_metadata))
            _compute_futures(futures)

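# A minimal sketch of how a CLI command could invoke run_pipeline(), assuming the
# click-based `cli` group and `datasets` choice shown further below; the command
# name, default scope, and option flags here are illustrative assumptions, not
# taken from the original code.
@cli.command("run", help="Crawl and export the given scope")
@click.argument("scope_name", default="default", type=datasets)
@click.option("--no-crawl", is_flag=True, default=False)
@click.option("--no-export", is_flag=True, default=False)
def run_(scope_name, no_crawl, no_export):
    run_pipeline(scope_name, crawl=not no_crawl, export=not no_export)
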
def export(dataset):
    resolver = get_resolver()
    Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()

def run(dataset):
    dataset = Dataset.get(dataset)
    for source in dataset.sources:
        Context(source).crawl()
    for dataset_ in dataset.datasets:
        context = Context(dataset_)
        context.normalize()
        context.export()

def index(dataset):
    resolver = get_resolver()
    # Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    loader = database.view(dataset)
    path = get_index_path(dataset)
    path.unlink(missing_ok=True)
    get_index(dataset, loader)

def run(dataset):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    for source in dataset.sources:
        Context(source).crawl()
    Statement.resolve_all(resolver)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()

def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver)
    DedupeApp.run(
        title="OpenSanction De-duplication",
        # log="textual.log",
        loader=db.view(dataset),
        resolver=resolver,
    )

def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver, external=True)
    loader = db.view(dataset)

    async def run_app() -> None:
        app = DedupeApp(
            loader=loader,
            resolver=resolver,
            url_base="https://opensanctions.org/entities/%s/",
            title="OpenSanction De-duplication",
            log="textual.log",
        )  # type: ignore
        await app.process_messages()

    asyncio.run(run_app())

def feed(self, entity: Entity):
    if not entity.target:
        return
    countries = set(entity.get_type_values(registry.country))
    identifiers = set(entity.get_type_values(registry.identifier))
    names = set(entity.get_type_values(registry.name))
    names.discard(entity.caption)
    sanctions = set()
    addresses = set(entity.get("address"))

    # Pull sanction, address and identification details from adjacent entities
    for _, adjacent in self.loader.get_adjacent(entity):
        if adjacent.schema.is_a("Sanction"):
            sanctions.add(self.sanction_text(adjacent))

        if adjacent.schema.is_a("Address"):
            addresses.add(adjacent.caption)

        if adjacent.schema.is_a("Identification"):
            identifiers.update(adjacent.get("number"))
            countries.update(adjacent.get("country"))

    datasets: List[str] = []
    for dataset in entity.datasets:
        ds = Dataset.require(dataset)
        datasets.append(ds.title)

    row = [
        entity.id,
        entity.schema.name,
        entity.caption,
        self.concat_values(names),
        self.concat_values(entity.get("birthDate", quiet=True)),
        self.concat_values(countries),
        self.concat_values(addresses),
        self.concat_values(identifiers),
        self.concat_values(sanctions),
        self.concat_values(entity.get_type_values(registry.phone)),
        self.concat_values(entity.get_type_values(registry.email)),
        self.concat_values(datasets),
        entity.first_seen,
        entity.last_seen,
    ]
    self.writer.writerow(row)

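# The feed() method above relies on a concat_values() helper on the exporter.
# A minimal sketch of what such a helper could look like; the separator and the
# sorting are assumptions for illustration, not taken from the original code.
def concat_values(self, values):
    # Join a collection of values into a single, stable CSV cell
    return "; ".join(sorted(str(v) for v in values if v))
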
from opensanctions.core import Dataset, Context, setup
from opensanctions.exporters.statements import export_statements_path
from opensanctions.exporters.statements import import_statements_path
from opensanctions.core.audit import audit_resolver
from opensanctions.core.loader import Database
from opensanctions.core.resolver import AUTO_USER, export_pairs, get_resolver
from opensanctions.core.xref import blocking_xref
from opensanctions.core.statements import max_last_seen
from opensanctions.core.statements import resolve_all_canonical, resolve_canonical
from opensanctions.core.analytics import build_analytics
from opensanctions.core.db import engine_tx
from opensanctions.processing import run_enrich, run_pipeline
from opensanctions.util import write_json

log = get_logger(__name__)
datasets = click.Choice(Dataset.names())


@click.group(help="OpenSanctions ETL toolkit")
@click.option("-v", "--verbose", is_flag=True, default=False)
@click.option("-q", "--quiet", is_flag=True, default=False)
def cli(verbose=False, quiet=False):
    level = logging.INFO
    if quiet:
        level = logging.WARNING
    if verbose:
        level = logging.DEBUG
    setup(log_level=level)


@cli.command("crawl", help="Crawl entities into the given dataset")

def xref_int(dataset):
    xref_internal(Dataset.require(dataset))

def export_pairs_(dataset, outfile):
    dataset = Dataset.require(dataset)
    for obj in export_pairs(dataset):
        write_object(outfile, obj)

def xref(dataset, limit):
    dataset = Dataset.require(dataset)
    blocking_xref(dataset, limit=limit)

def xref(base, candidates, limit=15):
    base_dataset = Dataset.require(base)
    candidates_dataset = Dataset.require(candidates)
    xref_datasets(base_dataset, candidates_dataset, limit=limit)

def dump_dataset(dataset, outfile):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    loader = Database(dataset, resolver).view(dataset)
    for entity in loader:
        write_object(outfile, entity)

def crawl(dataset):
    dataset = Dataset.get(dataset)
    for source in dataset.sources:
        Context(source).crawl()

@click.group(help="OpenSanctions ETL toolkit")
@click.option("-v", "--verbose", is_flag=True, default=False)
@click.option("-q", "--quiet", is_flag=True, default=False)
def cli(verbose=False, quiet=False):
    level = logging.INFO
    if quiet:
        level = logging.ERROR
    if verbose:
        level = logging.DEBUG
    setup(log_level=level)


@cli.command("dump", help="Export the entities from a dataset")
@click.argument("dataset", default=Dataset.ALL, type=click.Choice(Dataset.names()))
@click.option("-o", "--outfile", type=click.File("w"), default="-")
def dump_dataset(dataset, outfile):
    dataset = Dataset.get(dataset)
    context = Context(dataset)
    context.normalize()
    for entity in dataset.store:
        write_object(outfile, entity)


@cli.command("crawl", help="Crawl entities into the given dataset")
@click.argument("dataset", default=Dataset.ALL, type=click.Choice(Dataset.names()))
def crawl(dataset):
    dataset = Dataset.get(dataset)
    for source in dataset.sources:
        Context(source).crawl()

def crawl(dataset):
    dataset = Dataset.require(dataset)
    for source in dataset.sources:
        Context(source).crawl()

def lookup(name, value):
    # We don't want to duplicate the lookup configs in both YAML files,
    # so we're hard-coding that lookups go against the SDN config.
    sdn = Dataset.require("us_ofac_sdn")
    return sdn.lookups.get(name).match(value)

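# Illustrative usage of the lookup() helper above, assuming the SDN dataset
# config defines a "type" lookup; the lookup name and value are examples, not
# taken from the original code.
result = lookup("type", "Individual")
if result is not None:
    print(result)
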
def export_pairs_(dataset, outfile):
    dataset = Dataset.require(dataset)
    for obj in export_pairs(dataset):
        write_json(obj, outfile)

def dump_dataset(dataset, outfile):
    dataset = Dataset.get(dataset)
    for source in dataset.sources:
        # TODO: consolidate the data
        for entity in source.store:
            write_object(outfile, entity)

def latest(dataset):
    ds = Dataset.require(dataset)
    with engine_tx() as conn:
        latest = max_last_seen(conn, ds)
        if latest is not None:
            print(latest.isoformat())

def dump_dataset(dataset, outfile):
    dataset = Dataset.get(dataset)
    context = Context(dataset)
    context.normalize()
    for entity in dataset.store:
        write_object(outfile, entity)

def build_analytics_(dataset):
    ds = Dataset.require(dataset)
    build_analytics(ds)

def export(dataset):
    dataset = Dataset.get(dataset)
    for dataset_ in dataset.datasets:
        context = Context(dataset_)
        context.normalize()
        context.export()

def clear(dataset):
    dataset = Dataset.require(dataset)
    for source in dataset.sources:
        Context(source).clear()

import json
from banal import ensure_list
from functools import lru_cache
from pantomime.types import JSON
from requests.exceptions import TooManyRedirects

from opensanctions.core import Dataset
from opensanctions import helpers as h

FORMATS = ["%d %b %Y", "%d %B %Y", "%Y", "%b %Y", "%B %Y"]
SDN = Dataset.require("us_ofac_sdn")


@lru_cache(maxsize=None)
def deref_url(context, url):
    try:
        res = context.http.get(url, stream=True)
        return res.url
    except TooManyRedirects:
        return url


def parse_result(context, result):
    type_ = result.pop("type", None)
    schema = context.lookup_value("type", type_)
    if schema is None:
        context.log.error("Unknown result type", type=type_)
        return
    entity = context.make(schema)
    entity.id = context.make_slug(result.pop("id"))

class IndexResponse(BaseModel):
    datasets: List[str] = Dataset.names()
    model: ModelToDict
    terms: int = Field(..., example=23)
    tokens: int = Field(..., example=42)

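# Hypothetical FastAPI route returning the response model above; the route path,
# the `app` object, and the literal term/token counts are illustrative
# assumptions, not taken from the original code.
@app.get("/index", response_model=IndexResponse)
async def index_info() -> IndexResponse:
    return IndexResponse(model=model, terms=23, tokens=42)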