Example #1
def run_pipeline(
    scope_name: str,
    crawl: bool = True,
    export: bool = True,
    threads: int = settings.THREADS,
) -> None:
    scope = Dataset.require(scope_name)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures: List[Future] = []
        if crawl:
            for source in scope.sources:
                ctx = Context(source)
                futures.append(executor.submit(ctx.crawl))
            _compute_futures(futures)

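        # With crawling done, apply resolver decisions once, then export every
        # dataset in the scope concurrently.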
        if export:
            resolver = get_resolver()
            with engine_tx() as conn:
                resolve_all_canonical(conn, resolver)
            database = Database(scope, resolver, cached=True)
            database.view(scope)  # view is discarded; this call appears to pre-warm the cached database
            futures = []
            for dataset_ in scope.datasets:
                futures.append(
                    executor.submit(export_dataset, dataset_, database))
            futures.append(executor.submit(export_metadata))
            _compute_futures(futures)
Example #2
def __init__(self, dataset):
    self.dataset = dataset
    self.path = settings.DATASET_PATH.joinpath(dataset.name)
    self.http = get_session()
    self.resolver = get_resolver()
    self.log = structlog.get_logger(dataset.name)
    self._statements = {}
Example #3
def export(dataset):
    resolver = get_resolver()
    Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()
Example #4
def explode(canonical_id):
    resolver = get_resolver()
    resolved_id = resolver.get_canonical(canonical_id)
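    # explode() dissolves the canonical cluster; each former member is then
    # re-resolved into its own standalone record.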
    with engine_tx() as conn:
        for entity_id in resolver.explode(resolved_id):
            log.info("Restore separate entity", entity=entity_id)
            resolve_canonical(conn, resolver, entity_id)
    resolver.save()
Example #5
def audit_resolver():
    wd = get_wikidata_enricher()
    resolver = get_resolver()

    log.info("Loading all entity IDs...")
    with engine_read() as conn:
        entities = list(entities_datasets(conn))

    entity_ids = {e for e, _ in entities}
    log.info("Loaded %d entity IDs..." % len(entity_ids))

    canonicals = list(resolver.canonicals())
    for idx, canonical in enumerate(canonicals):
        if idx > 0 and idx % 10000 == 0:
            log.info("Processed: %d..." % idx)
        members = resolver.connected(canonical)
        qids = set()
        ofacs = 0
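        # Follow Wikidata QID redirects: when a member's QID now resolves to a
        # different item, confirm the link to the new QID with a POSITIVE judgement.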
        for member in members:
            if is_qid(member.id):
                qid = member.id
                item = wd.fetch_item(qid)
                if item is None:
                    log.error("Missing WD item", qid=qid)
                    continue
                if item.id != qid:
                    item = wd.fetch_item(qid, cache_days=0)
                    judgement = resolver.get_judgement(member, item.id)
                    if judgement != Judgement.POSITIVE:
                        resolver.decide(
                            member,
                            item.id,
                            judgement=Judgement.POSITIVE,
                            user="******",
                        )
                    qid = item.id
                qids.add(qid)

            if member.id.startswith("ofac-"):
                ofacs += 1
            if member.id not in entity_ids:
                if member.canonical:
                    continue
                log.warning(
                    "Referenced entity does not exist",
                    canonical=canonical.id,
                    entity=member.id,
                )
                resolver.remove(member)

        if ofacs > 1:
            log.warning("More than one OFAC ID",
                        id=canonical.id,
                        size=len(members))

        if len(qids) > 1:
            log.error("Entity has more than one QID", qids=qids)

    resolver.save()
Example #6
def run_enrich(scope_name: str, external_name: str, threshold: float):
    scope = Dataset.require(scope_name)
    external = Dataset.require(external_name)
    ctx = Context(external)
    resolver = get_resolver()
    database = Database(scope, resolver, cached=False)
    loader = database.view(scope)
    ctx.enrich(resolver, loader, threshold=threshold)
    resolver.save()
Example #7
def index(dataset):
    resolver = get_resolver()
    # Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    loader = database.view(dataset)
    path = get_index_path(dataset)
    path.unlink(missing_ok=True)
    get_index(dataset, loader)
Example #8
def build_analytics(dataset: Dataset):
    resolver = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, resolver)
    db = Database(dataset, resolver)
    loader = db.view(dataset)
    with engine_tx() as conn:
        conn.execute(delete(analytics_dataset_table))
        conn.execute(delete(analytics_country_table))
        conn.execute(delete(analytics_entity_table))

        entities: List[Dict[str, Any]] = []
        members: List[Dict[str, str]] = []
        countries: List[Dict[str, str]] = []
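        # Stream every entity in the view and flush the denormalised rows in
        # batches of BATCH_SIZE so individual INSERT statements stay bounded.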
        for idx, entity in enumerate(loader):
            if idx > 0 and idx % 10000 == 0:
                log.info("Denormalised %d entities..." % idx)

            for other in Dataset.all():
                if len(entity.datasets.intersection(other.scope_names)) > 0:
                    members.append({"entity_id": entity.id, "dataset": other.name})

            if len(members) >= BATCH_SIZE:
                stmt = insert(analytics_dataset_table).values(members)
                conn.execute(stmt)
                members = []

            for country in entity.get_type_values(registry.country):
                countries.append({"entity_id": entity.id, "country": country})

            if len(countries) >= BATCH_SIZE:
                stmt = insert(analytics_country_table).values(countries)
                conn.execute(stmt)
                countries = []

            ent = {
                "id": entity.id,
                "schema": entity.schema.name,
                "caption": entity.caption,
                "target": entity.target,
                "first_seen": entity.first_seen,
                "last_seen": entity.last_seen,
                "properties": entity.properties,
            }
            entities.append(ent)

            if len(entities) >= BATCH_SIZE:
                stmt = insert(analytics_entity_table).values(entities)
                conn.execute(stmt)
                entities = []

        if len(members):
            conn.execute(insert(analytics_dataset_table).values(members))

        if len(countries):
            conn.execute(insert(analytics_country_table).values(countries))

        if len(entities):
            conn.execute(insert(analytics_entity_table).values(entities))
Example #9
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver)
    DedupeApp.run(
        title="OpenSanction De-duplication",
        # log="textual.log",
        loader=db.view(dataset),
        resolver=resolver,
    )
Example #10
def run(dataset):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    for source in dataset.sources:
        Context(source).crawl()
    Statement.resolve_all(resolver)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()
Example #11
def blocking_xref(dataset: Dataset, limit: int = 5000):
    resolver = get_resolver()
    resolver.prune()
    db = Database(dataset, resolver, cached=True, external=True)
    loader = db.view(dataset)
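    # Produce scored candidate pairs for review; matches scoring at or above
    # auto_threshold are decided automatically under AUTO_USER.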
    xref(
        loader,
        resolver,
        limit=limit,
        scored=True,
        auto_threshold=0.990,
        user=AUTO_USER,
    )
    resolver.save()
Example #12
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver, external=True)
    loader = db.view(dataset)

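    # DedupeApp is a textual TUI; its message loop is async, so wrap it in a
    # coroutine and drive it with asyncio.run().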
    async def run_app() -> None:
        app = DedupeApp(
            loader=loader,
            resolver=resolver,
            url_base="https://opensanctions.org/entities/%s/",
            title="OpenSanction De-duplication",
            log="textual.log",
        )  # type: ignore
        await app.process_messages()

    asyncio.run(run_app())
Example #13
def merge(entity_ids):
    if len(entity_ids) < 2:
        return
    resolver = get_resolver()
    canonical_id = resolver.get_canonical(entity_ids[0])
    for other_id in entity_ids[1:]:
        other_id = Identifier.get(other_id)
        other_canonical_id = resolver.get_canonical(other_id)
        if other_canonical_id == canonical_id:
            continue
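        # Ask the resolver whether this pair may be merged; an existing
        # conflicting judgement (e.g. NEGATIVE) makes the check fail.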
        check = resolver.check_candidate(canonical_id, other_id)
        if not check:
            log.error(
                "Cannot merge",
                canonical_id=canonical_id,
                other_id=other_id,
                edge=resolver.get_resolved_edge(canonical_id, other_id),
            )
            return
        log.info("Merge: %s -> %s" % (other_id, canonical_id))
        canonical_id = resolver.decide(canonical_id, other_id,
                                       Judgement.POSITIVE)
    resolver.save()
    log.info("Canonical: %s" % canonical_id)
Example #14
def explode(canonical_id):
    resolver = get_resolver()
    resolved_id = resolver.get_canonical(canonical_id)
    resolver.explode(resolved_id)
    resolver.save()
    Statement.resolve_all(resolver)
Example #15
def dump_dataset(dataset, outfile):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    loader = Database(dataset, resolver).view(dataset)
    for entity in loader:
        write_object(outfile, entity)
Example #16
def xref_prune():
    resolver = get_resolver()
    resolver.prune()
    resolver.save()
Example #17
def resolve():
    resolver = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, resolver)
Example #18
def resolve():
    resolver = get_resolver()
    Statement.resolve_all(resolver)
Example #19
from functools import cache

from followthemoney.schema import Schema
from followthemoney.property import Property
from nomenklatura.index.index import Index
from nomenklatura.loader import Loader
from opensanctions.model import Statement
from opensanctions.core.entity import Entity
from opensanctions.core.dataset import Dataset
from opensanctions.core.index import get_index as get_dataset_index
from opensanctions.core.resolver import get_resolver
from opensanctions.core.loader import Database

from osapi import settings
from osapi.models import FreebaseType
from osapi.models import FreebaseEntity, FreebaseProperty

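# A single module-level resolver instance is shared across the API.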
resolver = get_resolver()


def get_scope() -> Dataset:
    scope = Dataset.get(settings.SCOPE_DATASET)
    if scope is None:
        raise RuntimeError("Cannot load dataset: %s" % settings.SCOPE_DATASET)
    return scope


@cache
def get_database() -> Database:
    return Database(get_scope(), resolver, cached=settings.CACHED)


@cache
Example #20
def xref_prune(keep=0):
    resolver = get_resolver()
    resolver.prune(keep=keep)
    resolver.save()