Example #1
def run_pipeline(
    scope_name: str,
    crawl: bool = True,
    export: bool = True,
    threads: int = settings.THREADS,
) -> None:
    scope = Dataset.require(scope_name)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures: List[Future] = []
        if crawl:
            for source in scope.sources:
                ctx = Context(source)
                futures.append(executor.submit(ctx.crawl))
            _compute_futures(futures)

        if export:
            resolver = get_resolver()
            with engine_tx() as conn:
                resolve_all_canonical(conn, resolver)
            database = Database(scope, resolver, cached=True)
            # Build the scope view once before submitting export jobs; the
            # result is not used directly, but with cached=True this primes
            # the database cache.
            database.view(scope)
            futures = []
            for dataset_ in scope.datasets:
                futures.append(
                    executor.submit(export_dataset, dataset_, database))
            futures.append(executor.submit(export_metadata))
            _compute_futures(futures)
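
The helper _compute_futures is called above but not shown in the snippet. A minimal sketch, assuming it only waits for each submitted future and surfaces worker exceptions, could look like this:

from concurrent.futures import Future
from typing import List

def _compute_futures(futures: List[Future]) -> None:
    # Hypothetical helper (not defined in the snippet above): wait for each
    # future to finish. Future.result() re-raises any exception from the
    # worker thread, so a failed crawl or export aborts the run.
    for future in futures:
        future.result()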
Example #2
def xref_datasets(base: Dataset, candidates: Dataset, limit: int = 15):
    resolver = get_resolver()
    resolver.prune()
    if candidates not in base.provided_datasets():
        raise RuntimeError("%r is not contained in %r" % (candidates, base))
    db = Database(base, resolver, cached=True)
    entities = db.view(candidates)
    loader = db.view(base)
    index = get_index(base, loader)
    xref(index, resolver, entities, limit=limit)
    resolver.save()
Example #3
def export_dataset(dataset: Dataset, database: Database):
    """Dump the contents of the dataset to the output directory."""
    context = Context(dataset)
    context.bind()
    loader = database.view(dataset, export_assembler)
    exporters = [Exporter(context, loader) for Exporter in EXPORTERS]
    for entity in loader:
        for exporter in exporters:
            exporter.feed(entity)

    for exporter in exporters:
        exporter.finish()

    # Make sure the exported resources are visible in the database.
    # `db` is presumably a module-level database handle imported in the
    # original file; it is not defined in this snippet.
    db.session.commit()

    # Export list of data issues from crawl stage
    issues_path = context.get_resource_path("issues.json")
    context.log.info("Writing dataset issues list", path=issues_path)
    with open(issues_path, "w", encoding=settings.ENCODING) as fh:
        data = {"issues": Issue.query(dataset=dataset).all()}
        write_json(data, fh)

    # Export full metadata
    index_path = context.get_resource_path("index.json")
    context.log.info("Writing dataset index", path=index_path)
    with open(index_path, "w", encoding=settings.ENCODING) as fh:
        meta = dataset.to_index()
        write_json(meta, fh)

    context.close()
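
Example #3 treats EXPORTERS as a list of classes that take (context, loader) and expose feed() and finish(). The real exporter classes are not shown; a hypothetical exporter matching that interface could look like this:

class CountingExporter:
    # Illustrative only: the name and behaviour are assumptions, not one of
    # the project's actual exporters.
    def __init__(self, context, loader):
        self.context = context
        self.loader = loader
        self.count = 0

    def feed(self, entity):
        # Called once for every entity streamed from the loader.
        self.count += 1

    def finish(self):
        # Called after the loop over the loader; flush and close outputs here.
        self.context.log.info("Export finished", entities=self.count)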
Example #4
def run_enrich(scope_name: str, external_name: str, threshold: float):
    scope = Dataset.require(scope_name)
    external = Dataset.require(external_name)
    ctx = Context(external)
    resolver = get_resolver()
    database = Database(scope, resolver, cached=False)
    loader = database.view(scope)
    ctx.enrich(resolver, loader, threshold=threshold)
    resolver.save()
Example #5
def index(dataset):
    resolver = get_resolver()
    # Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    loader = database.view(dataset)
    path = get_index_path(dataset)
    # Remove any stale index file, then rebuild the index from the loader.
    path.unlink(missing_ok=True)
    get_index(dataset, loader)
Example #6
def build_analytics(dataset: Dataset):
    resolver = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, resolver)
    db = Database(dataset, resolver)
    loader = db.view(dataset)
    with engine_tx() as conn:
        conn.execute(delete(analytics_dataset_table))
        conn.execute(delete(analytics_country_table))
        conn.execute(delete(analytics_entity_table))

        entities: List[Dict[str, Any]] = []
        members: List[Dict[str, str]] = []
        countries: List[Dict[str, str]] = []
        for idx, entity in enumerate(loader):
            if idx > 0 and idx % 10000 == 0:
                log.info("Denormalised %d entities..." % idx)

            # Use a distinct name so the loop does not shadow the `dataset`
            # argument of build_analytics.
            for dataset_ in Dataset.all():
                if len(entity.datasets.intersection(dataset_.scope_names)) > 0:
                    members.append({"entity_id": entity.id, "dataset": dataset_.name})

            if len(members) >= BATCH_SIZE:
                stmt = insert(analytics_dataset_table).values(members)
                conn.execute(stmt)
                members = []

            for country in entity.get_type_values(registry.country):
                countries.append({"entity_id": entity.id, "country": country})

            if len(countries) >= BATCH_SIZE:
                stmt = insert(analytics_country_table).values(countries)
                conn.execute(stmt)
                countries = []

            ent = {
                "id": entity.id,
                "schema": entity.schema.name,
                "caption": entity.caption,
                "target": entity.target,
                "first_seen": entity.first_seen,
                "last_seen": entity.last_seen,
                "properties": entity.properties,
            }
            entities.append(ent)

            if len(entities) >= BATCH_SIZE:
                stmt = insert(analytics_entity_table).values(entities)
                conn.execute(stmt)
                entities = []

        if len(members):
            conn.execute(insert(analytics_dataset_table).values(members))

        if len(countries):
            conn.execute(insert(analytics_country_table).values(countries))

        if len(entities):
            conn.execute(insert(analytics_entity_table).values(entities))
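
Example #6 repeats the same batching pattern for members, countries and entities: buffer rows, insert once BATCH_SIZE is reached, and flush the remainder at the end. A small helper (hypothetical, not part of the project) would express the flush step once:

from sqlalchemy import insert

def flush_rows(conn, table, rows):
    # Insert any buffered rows in a single multi-row statement, then clear
    # the buffer so it can be reused for the next batch.
    if rows:
        conn.execute(insert(table).values(rows))
        rows.clear()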
Example #7
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver)
    DedupeApp.run(
        title="OpenSanction De-duplication",
        # log="textual.log",
        loader=db.view(dataset),
        resolver=resolver,
    )
Example #8
def blocking_xref(dataset: Dataset, limit: int = 5000):
    resolver = get_resolver()
    resolver.prune()
    db = Database(dataset, resolver, cached=True, external=True)
    loader = db.view(dataset)
    xref(
        loader,
        resolver,
        limit=limit,
        scored=True,
        auto_threshold=0.990,
        user=AUTO_USER,
    )
    resolver.save()
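
Example #9 notes that its source file is cli.py, so these functions are usually exposed as command-line entry points. A hypothetical click wrapper for blocking_xref (the command name and options are assumptions) might look like this:

import click

@click.command("xref")
@click.argument("dataset", type=str)
@click.option("--limit", type=int, default=5000, help="Maximum candidate pairs")
def xref_command(dataset: str, limit: int) -> None:
    # Resolve the dataset name and hand off to blocking_xref from example #8.
    blocking_xref(Dataset.require(dataset), limit=limit)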
Example #9
File: cli.py Project: nightsh/opennames
def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver, external=True)
    loader = db.view(dataset)

    async def run_app() -> None:
        app = DedupeApp(
            loader=loader,
            resolver=resolver,
            url_base="https://opensanctions.org/entities/%s/",
            title="OpenSanction De-duplication",
            log="textual.log",
        )  # type: ignore
        await app.process_messages()

    asyncio.run(run_app())
Example #10
def xref_internal(dataset: Dataset):
    resolver = get_resolver()
    resolver.prune()
    db = Database(dataset, resolver)
    loader = db.view(dataset)
    index = get_index(dataset, loader)
    suggested = 0
    for pair, score in index.pairs():
        left = loader.get_entity(str(pair[0]))
        right = loader.get_entity(str(pair[1]))
        if left is None or right is None:
            continue
        if left.schema not in right.schema.matchable_schemata:
            if right.schema not in left.schema.matchable_schemata:
                continue
        if not resolver.check_candidate(left.id, right.id):
            continue
        resolver.suggest(left.id, right.id, score)
        if suggested > 5000:
            break
        suggested += 1
    resolver.save()
Example #11
def export_dataset(dataset: Dataset, database: Database):
    """Dump the contents of the dataset to the output directory."""
    # Create the context before the try block so that `context` is always
    # bound when the finally clause calls context.close().
    context = Context(dataset)
    try:
        loader = database.view(dataset, assemble)
        if dataset.type != External.TYPE:
            export_data(context, loader)

        # Export list of data issues from crawl stage
        issues_path = context.get_resource_path("issues.json")
        context.log.info("Writing dataset issues list", path=issues_path)
        with open(issues_path, "wb") as fh:
            with engine.begin() as conn:
                data = {"issues": list(all_issues(conn, dataset))}
            write_json(data, fh)

        # Export full metadata
        index_path = context.get_resource_path("index.json")
        context.log.info("Writing dataset index", path=index_path)
        with open(index_path, "wb") as fh:
            meta = dataset_to_index(dataset)
            write_json(meta, fh)
    finally:
        context.close()