示例#1
0
def run_pipeline(
    scope_name: str,
    crawl: bool = True,
    export: bool = True,
    threads: int = settings.THREADS,
) -> None:
    """Crawl all sources in a scope and/or export its datasets.

    Args:
        scope_name: Name of the scope dataset to process.
        crawl: If truthy, run every source crawler in the scope first.
        export: If truthy, resolve canonical IDs and export all datasets
            plus the global metadata.
        threads: Worker count for the shared thread pool.
    """
    scope = Dataset.require(scope_name)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures: List[Future] = []
        # FIX: `is True` rejected truthy non-bool values; plain truthiness
        # is the idiomatic and intended check here.
        if crawl:
            for source in scope.sources:
                ctx = Context(source)
                futures.append(executor.submit(ctx.crawl))
            _compute_futures(futures)

        if export:
            resolver = get_resolver()
            with engine_tx() as conn:
                resolve_all_canonical(conn, resolver)
            database = Database(scope, resolver, cached=True)
            # NOTE(review): return value unused — presumably warms the
            # cached view before parallel exports; confirm.
            database.view(scope)
            futures = []
            for dataset_ in scope.datasets:
                futures.append(
                    executor.submit(export_dataset, dataset_, database))
            futures.append(executor.submit(export_metadata))
            _compute_futures(futures)
示例#2
0
 def __init__(self, dataset):
     """Set up per-dataset working state (path, HTTP session, resolver, log)."""
     self.dataset = dataset
     # Working directory for this dataset, under the configured base path.
     self.path = settings.DATASET_PATH.joinpath(dataset.name)
     self.http = get_session()
     self.resolver = get_resolver()
     # Logger is namespaced by dataset for filterable structured output.
     self.log = structlog.get_logger(dataset.name)
     # Statement buffer — NOTE(review): key/value semantics not visible
     # here; confirm against the methods that fill it.
     self._statements = {}
示例#3
0
def export(dataset):
    """Resolve all statements, then export every dataset in the scope."""
    resolver = get_resolver()
    Statement.resolve_all(resolver)
    scope = Dataset.require(dataset)
    database = Database(scope, resolver, cached=True)
    for child in scope.datasets:
        export_dataset(child, database)
    export_global_index()
示例#4
0
File: cli.py  Project: nightsh/opennames
def explode(canonical_id):
    """Split a canonical cluster back into its separate member entities."""
    resolver = get_resolver()
    target = resolver.get_canonical(canonical_id)
    with engine_tx() as conn:
        for part_id in resolver.explode(target):
            log.info("Restore separate entity", entity=part_id)
            resolve_canonical(conn, resolver, part_id)
    resolver.save()
示例#5
0
def audit_resolver():
    """Audit every resolver cluster against Wikidata and the entity store.

    For each canonical cluster this checks that:
      * QID members still point at a live Wikidata item; redirected QIDs
        get a fresh (uncached) fetch and a POSITIVE decision onto the
        new item ID;
      * at most one member carries an ``ofac-`` prefixed ID;
      * the cluster does not resolve to more than one distinct QID;
      * every non-canonical member still exists in the entity store —
        dangling references are removed from the resolver.

    Saves the resolver when the audit completes.
    """
    wd = get_wikidata_enricher()
    resolver = get_resolver()

    log.info("Loading all entity IDs...")
    with engine_read() as conn:
        entities = list(entities_datasets(conn))

    entity_ids = {e for e, _ in entities}
    log.info("Loaded %d entity IDs..." % len(entity_ids))

    canonicals = list(resolver.canonicals())
    for idx, canonical in enumerate(canonicals):
        if idx > 0 and idx % 10000 == 0:
            log.info("Processed: %d..." % idx)
        members = resolver.connected(canonical)
        qids = set()
        ofacs = 0
        for member in members:
            if is_qid(member.id):
                qid = member.id
                item = wd.fetch_item(qid)
                if item is None:
                    # FIX: previously fell through and crashed with an
                    # AttributeError on `item.id` below.
                    log.error("Missing WD item", qid=qid)
                else:
                    if item.id != qid:
                        # The QID redirects to a newer item: re-fetch
                        # bypassing the cache and record the merge.
                        fresh = wd.fetch_item(qid, cache_days=0)
                        if fresh is not None:
                            judgement = resolver.get_judgement(member, fresh.id)
                            if judgement != Judgement.POSITIVE:
                                resolver.decide(
                                    member,
                                    fresh.id,
                                    judgement=Judgement.POSITIVE,
                                    user="******",
                                )
                            qid = fresh.id
                    qids.add(qid)

            if member.id.startswith("ofac-"):
                ofacs += 1
            if member.id not in entity_ids:
                if member.canonical:
                    continue
                # `log.warning` for consistency with the call below.
                log.warning(
                    "Referenced entity does not exist",
                    canonical=canonical.id,
                    entity=member.id,
                )
                resolver.remove(member)

        if ofacs > 1:
            log.warning("More than one OFAC ID",
                        id=canonical.id,
                        size=len(members))

        if len(qids) > 1:
            log.error("Entity has more than one QID", qids=qids)

    resolver.save()
示例#6
0
def run_enrich(scope_name: str, external_name: str, threshold: float):
    """Match scope entities against an external enrichment dataset."""
    scope = Dataset.require(scope_name)
    enricher = Context(Dataset.require(external_name))
    resolver = get_resolver()
    loader = Database(scope, resolver, cached=False).view(scope)
    enricher.enrich(resolver, loader, threshold=threshold)
    resolver.save()
示例#7
0
def index(dataset):
    """Delete and rebuild the search index for the given dataset."""
    resolver = get_resolver()
    target = Dataset.require(dataset)
    loader = Database(target, resolver, cached=True).view(target)
    index_path = get_index_path(target)
    index_path.unlink(missing_ok=True)
    get_index(target, loader)
示例#8
0
def build_analytics(dataset: Dataset):
    """Denormalise the entity graph into the analytics tables.

    Clears and repopulates ``analytics_dataset_table``,
    ``analytics_country_table`` and ``analytics_entity_table`` for the
    given dataset scope, inserting rows in batches of ``BATCH_SIZE``.
    """
    resolver = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, resolver)
    db = Database(dataset, resolver)
    loader = db.view(dataset)
    with engine_tx() as conn:
        conn.execute(delete(analytics_dataset_table))
        conn.execute(delete(analytics_country_table))
        conn.execute(delete(analytics_entity_table))

        entities: List[Dict[str, Any]] = []
        members: List[Dict[str, str]] = []
        countries: List[Dict[str, str]] = []
        for idx, entity in enumerate(loader):
            if idx > 0 and idx % 10000 == 0:
                log.info("Denormalised %d entities..." % idx)

            # FIX: loop variable renamed — it previously shadowed the
            # `dataset` function parameter.
            for member_ds in Dataset.all():
                if len(entity.datasets.intersection(member_ds.scope_names)) > 0:
                    members.append({"entity_id": entity.id, "dataset": member_ds.name})

            if len(members) >= BATCH_SIZE:
                conn.execute(insert(analytics_dataset_table).values(members))
                members = []

            for country in entity.get_type_values(registry.country):
                countries.append({"entity_id": entity.id, "country": country})

            if len(countries) >= BATCH_SIZE:
                conn.execute(insert(analytics_country_table).values(countries))
                countries = []

            ent = {
                "id": entity.id,
                "schema": entity.schema.name,
                "caption": entity.caption,
                "target": entity.target,
                "first_seen": entity.first_seen,
                "last_seen": entity.last_seen,
                "properties": entity.properties,
            }
            entities.append(ent)

            if len(entities) >= BATCH_SIZE:
                conn.execute(insert(analytics_entity_table).values(entities))
                entities = []

        if len(members):
            conn.execute(insert(analytics_dataset_table).values(members))

        # FIX: the trailing partial batch of countries was never flushed,
        # silently dropping up to BATCH_SIZE - 1 country rows.
        if len(countries):
            conn.execute(insert(analytics_country_table).values(countries))

        if len(entities):
            conn.execute(insert(analytics_entity_table).values(entities))
示例#9
0
def dedupe(dataset):
    """Launch the interactive de-duplication UI for a dataset."""
    resolver = get_resolver()
    target = Dataset.require(dataset)
    database = Database(target, resolver)
    DedupeApp.run(
        title="OpenSanction De-duplication",
        loader=database.view(target),
        resolver=resolver,
    )
示例#10
0
def run(dataset):
    """Crawl every source in a dataset, then export all of its datasets."""
    target = Dataset.require(dataset)
    resolver = get_resolver()
    for source in target.sources:
        Context(source).crawl()
    Statement.resolve_all(resolver)
    database = Database(target, resolver, cached=True)
    for child in target.datasets:
        export_dataset(child, database)
    export_global_index()
示例#11
0
def blocking_xref(dataset: Dataset, limit: int = 5000):
    """Run scored cross-referencing, auto-deciding near-certain matches."""
    resolver = get_resolver()
    resolver.prune()
    view = Database(dataset, resolver, cached=True, external=True).view(dataset)
    xref(
        view,
        resolver,
        limit=limit,
        scored=True,
        auto_threshold=0.990,
        user=AUTO_USER,
    )
    resolver.save()
示例#12
0
File: cli.py  Project: nightsh/opennames
def dedupe(dataset):
    """Run the textual de-duplication app over a dataset (incl. externals)."""
    resolver = get_resolver()
    target = Dataset.require(dataset)
    loader = Database(target, resolver, external=True).view(target)

    async def _run_app() -> None:
        dedupe_app = DedupeApp(
            loader=loader,
            resolver=resolver,
            url_base="https://opensanctions.org/entities/%s/",
            title="OpenSanction De-duplication",
            log="textual.log",
        )  # type: ignore
        await dedupe_app.process_messages()

    asyncio.run(_run_app())
示例#13
0
File: cli.py  Project: nightsh/opennames
def merge(entity_ids):
    """Fold several entity IDs into a single canonical cluster."""
    if len(entity_ids) < 2:
        return
    resolver = get_resolver()
    canonical_id = resolver.get_canonical(entity_ids[0])
    for raw_id in entity_ids[1:]:
        other = Identifier.get(raw_id)
        if resolver.get_canonical(other) == canonical_id:
            continue
        if not resolver.check_candidate(canonical_id, other):
            log.error(
                "Cannot merge",
                canonical_id=canonical_id,
                other_id=other,
                edge=resolver.get_resolved_edge(canonical_id, other),
            )
            return
        log.info("Merge: %s -> %s" % (other, canonical_id))
        canonical_id = resolver.decide(canonical_id, other, Judgement.POSITIVE)
    resolver.save()
    log.info("Canonical: %s" % canonical_id)
示例#14
0
def explode(canonical_id):
    """Dissolve a canonical cluster, save, and re-resolve all statements."""
    resolver = get_resolver()
    resolver.explode(resolver.get_canonical(canonical_id))
    resolver.save()
    Statement.resolve_all(resolver)
示例#15
0
def dump_dataset(dataset, outfile):
    """Stream every entity of a dataset to the given output file."""
    scope = Dataset.require(dataset)
    resolver = get_resolver()
    for entity in Database(scope, resolver).view(scope):
        write_object(outfile, entity)
示例#16
0
File: cli.py  Project: nightsh/opennames
def xref_prune():
    """Drop stale xref candidate edges from the resolver and persist it."""
    res = get_resolver()
    res.prune()
    res.save()
示例#17
0
File: cli.py  Project: nightsh/opennames
def resolve():
    """Apply all canonical resolver decisions inside one transaction."""
    res = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, res)
示例#18
0
def resolve():
    """Re-resolve all statements against current resolver decisions."""
    res = get_resolver()
    Statement.resolve_all(res)
示例#19
0
from followthemoney.schema import Schema
from followthemoney.property import Property
from nomenklatura.index.index import Index
from nomenklatura.loader import Loader
from opensanctions.model import Statement
from opensanctions.core.entity import Entity
from opensanctions.core.dataset import Dataset
from opensanctions.core.index import get_index as get_dataset_index
from opensanctions.core.resolver import get_resolver
from opensanctions.core.loader import Database

from osapi import settings
from osapi.models import FreebaseType
from osapi.models import FreebaseEntity, FreebaseProperty

# Module-level resolver shared by the API helpers below.
resolver = get_resolver()


def get_scope() -> Dataset:
    """Return the dataset named by settings.SCOPE_DATASET; raise if absent."""
    found = Dataset.get(settings.SCOPE_DATASET)
    if found is not None:
        return found
    raise RuntimeError("Cannot load dataset: %s" % settings.SCOPE_DATASET)


@cache
def get_database() -> Database:
    """Build the Database over the configured scope (memoised by @cache)."""
    scope = get_scope()
    return Database(scope, resolver, cached=settings.CACHED)


@cache
示例#20
0
def xref_prune(keep=0):
    """Prune xref candidates (keeping `keep` of them) and save the resolver."""
    res = get_resolver()
    res.prune(keep=keep)
    res.save()