def build_analytics(dataset: Dataset) -> None:
    resolver = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, resolver)
    db = Database(dataset, resolver)
    loader = db.view(dataset)
    with engine_tx() as conn:
        conn.execute(delete(analytics_dataset_table))
        conn.execute(delete(analytics_country_table))
        conn.execute(delete(analytics_entity_table))

        entities: List[Dict[str, Any]] = []
        members: List[Dict[str, str]] = []
        countries: List[Dict[str, str]] = []
        for idx, entity in enumerate(loader):
            if idx > 0 and idx % 10000 == 0:
                log.info("Denormalised %d entities..." % idx)

            # Record membership for every dataset whose scope includes this
            # entity (the loop variable must not shadow the `dataset` argument):
            for ds in Dataset.all():
                if len(entity.datasets.intersection(ds.scope_names)) > 0:
                    members.append({"entity_id": entity.id, "dataset": ds.name})

            if len(members) >= BATCH_SIZE:
                stmt = insert(analytics_dataset_table).values(members)
                conn.execute(stmt)
                members = []

            for country in entity.get_type_values(registry.country):
                countries.append({"entity_id": entity.id, "country": country})

            if len(countries) >= BATCH_SIZE:
                stmt = insert(analytics_country_table).values(countries)
                conn.execute(stmt)
                countries = []

            ent = {
                "id": entity.id,
                "schema": entity.schema.name,
                "caption": entity.caption,
                "target": entity.target,
                "first_seen": entity.first_seen,
                "last_seen": entity.last_seen,
                "properties": entity.properties,
            }
            entities.append(ent)
            if len(entities) >= BATCH_SIZE:
                stmt = insert(analytics_entity_table).values(entities)
                conn.execute(stmt)
                entities = []

        # Flush the final partial batch of each table, including the leftover
        # country rows, which would otherwise be silently dropped:
        if len(members):
            conn.execute(insert(analytics_dataset_table).values(members))
        if len(countries):
            conn.execute(insert(analytics_country_table).values(countries))
        if len(entities):
            conn.execute(insert(analytics_entity_table).values(entities))
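# The accumulate-and-flush pattern above is plain SQLAlchemy Core. A minimal,
# self-contained sketch of the same technique against an in-memory SQLite
# database; the table name and columns here are illustrative, not the real
# analytics schema:
from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine, insert

def batched_insert_demo() -> None:
    engine = create_engine("sqlite://")
    metadata = MetaData()
    demo = Table("demo", metadata, Column("id", Integer), Column("value", String))
    metadata.create_all(engine)
    batch_size = 1000
    rows: List[Dict[str, Any]] = []
    with engine.begin() as conn:
        for i in range(2500):
            rows.append({"id": i, "value": "row-%d" % i})
            if len(rows) >= batch_size:
                conn.execute(insert(demo).values(rows))
                rows = []
        # The final partial batch must be flushed too, or its rows are lost:
        if len(rows):
            conn.execute(insert(demo).values(rows))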
def clear(self) -> None:
    """Delete all recorded data for a given dataset."""
    with engine_tx() as conn:
        clear_statements(conn, self.dataset)
        clear_issues(conn, self.dataset)
        clear_resources(conn, self.dataset)
    self.cache.clear()
def run_pipeline(
    scope_name: str,
    crawl: bool = True,
    export: bool = True,
    threads: int = settings.THREADS,
) -> None:
    scope = Dataset.require(scope_name)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures: List[Future] = []
        if crawl is True:
            for source in scope.sources:
                ctx = Context(source)
                futures.append(executor.submit(ctx.crawl))
            _compute_futures(futures)

        if export is True:
            resolver = get_resolver()
            with engine_tx() as conn:
                resolve_all_canonical(conn, resolver)
            database = Database(scope, resolver, cached=True)
            database.view(scope)
            futures = []
            for dataset_ in scope.datasets:
                futures.append(executor.submit(export_dataset, dataset_, database))
            futures.append(executor.submit(export_metadata))
            _compute_futures(futures)
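# _compute_futures is internal to this codebase; a hypothetical sketch of the
# behaviour assumed above would be to wait on every future and re-raise any
# worker exception, so crawl or export failures are not silently swallowed:
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
from typing import List

def _compute_futures_sketch(futures: List[Future]) -> None:
    for future in as_completed(futures):
        future.result()  # .result() re-raises any exception from the worker

# Usage:
with ThreadPoolExecutor(max_workers=4) as executor:
    demo_futures = [executor.submit(pow, 2, n) for n in range(8)]
    _compute_futures_sketch(demo_futures)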
def explode(canonical_id):
    resolver = get_resolver()
    resolved_id = resolver.get_canonical(canonical_id)
    with engine_tx() as conn:
        for entity_id in resolver.explode(resolved_id):
            log.info("Restore separate entity", entity=entity_id)
            resolve_canonical(conn, resolver, entity_id)
    resolver.save()
def enrich(
    self,
    resolver: Resolver,
    entities: Iterable[Entity],
    threshold: Optional[float] = None,
) -> None:
    """Try to match a set of entities against an external source."""
    self.bind()
    with engine_tx() as conn:
        clear_issues(conn, self.dataset)
        clear_resources(conn, self.dataset)
        clear_statements(conn, self.dataset)
    external = cast(External, self.dataset)
    enricher = external.get_enricher(self.cache)
    try:
        for entity in entities:
            try:
                for match in enricher.match_wrapped(entity):
                    judgement = resolver.get_judgement(match.id, entity.id)

                    # For unjudged candidates, compute a score and put it in the
                    # xref cache so the user can decide:
                    if judgement == Judgement.NO_JUDGEMENT:
                        if not entity.schema.can_match(match.schema):
                            continue
                        result = compare_scored(entity, match)
                        score = result["score"]
                        if threshold is None or score >= threshold:
                            self.log.info("Match [%s]: %.2f -> %s" % (entity, score, match))
                            resolver.suggest(entity.id, match.id, score, user=AUTO_USER)

                    if judgement != Judgement.POSITIVE:
                        self.emit(match, external=True)

                    # Store previously confirmed matches to the database and make
                    # them visible:
                    if judgement == Judgement.POSITIVE:
                        self.log.info("Enrich [%s]: %r" % (entity, match))
                        for adjacent in enricher.expand_wrapped(entity, match):
                            if check_person_cutoff(adjacent):
                                continue
                            # self.log.info("Added", entity=adjacent)
                            self.emit(adjacent)
            except Exception:
                self.log.exception("Could not match: %r" % entity)
    except KeyboardInterrupt:
        pass
    finally:
        self.flush()
        enricher.close()
        self.close()
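# A pure-Python sketch of the judgement gate above, with illustrative names
# (it omits the schema.can_match() check for brevity): unjudged candidates
# are suggested for review when they clear the threshold, anything not yet
# confirmed is emitted as external-only data, and confirmed matches are
# expanded into adjacent entities.
def match_actions(judgement, score, threshold):
    actions = []
    if judgement == Judgement.NO_JUDGEMENT and (threshold is None or score >= threshold):
        actions.append("suggest")        # queue for human review in the xref cache
    if judgement != Judgement.POSITIVE:
        actions.append("emit-external")  # visible, but flagged as unconfirmed
    else:
        actions.append("expand")         # confirmed match: pull in adjacent entities
    return actions

assert match_actions(Judgement.NO_JUDGEMENT, 0.85, 0.7) == ["suggest", "emit-external"]
assert match_actions(Judgement.POSITIVE, 0.0, None) == ["expand"]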
def flush(self) -> None:
    """Emitted entities are decomposed into statements for the database to
    store. These are inserted in batches, so the statement cache on the
    context is flushed to the store here. Statements that have not been
    flushed when a crawl is aborted are not persisted to the database."""
    statements = list(self._statements.values())
    with engine_tx() as conn:
        for i in range(0, len(statements), self.BATCH_SIZE):
            batch = statements[i : i + self.BATCH_SIZE]
            save_statements(conn, batch)
    self._statements = {}
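# Sketch of the context-side statement cache that flush() drains: statements
# are stored under a key so that re-emitting the same statement overwrites
# rather than appends, then flushed in fixed-size chunks. The key structure
# and statement fields here are illustrative only:
from typing import Dict, Tuple

_cache: Dict[Tuple[str, str, str], dict] = {}

def emit_sketch(entity_id: str, prop: str, value: str) -> None:
    _cache[(entity_id, prop, value)] = {"entity_id": entity_id, "prop": prop, "value": value}

emit_sketch("e1", "name", "ACME Corp")
emit_sketch("e1", "name", "ACME Corp")  # duplicate key: overwritten, not appended
emit_sketch("e2", "country", "de")

stmts = list(_cache.values())
for i in range(0, len(stmts), 2):  # BATCH_SIZE of 2 for the demo
    print("saving %d statements" % len(stmts[i : i + 2]))
_cache.clear()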
def crawl(self) -> None:
    """Run the crawler."""
    self.bind()
    with engine_tx() as conn:
        clear_issues(conn, self.dataset)
    if self.dataset.disabled:
        self.log.info("Source is disabled")
        return
    with engine_tx() as conn:
        clear_resources(conn, self.dataset)
    self.log.info("Begin crawl")
    try:
        # Run the dataset:
        self.dataset.method(self)
        self.flush()
        with engine_tx() as conn:
            cleanup_dataset(conn, self.dataset)
            entities = count_entities(conn, dataset=self.dataset)
            targets = count_entities(conn, dataset=self.dataset, target=True)
        self.log.info("Crawl completed", entities=entities, targets=targets)
    except KeyboardInterrupt:
        raise
    except LookupException as exc:
        self.log.error(exc.message, lookup=exc.lookup.name, value=exc.value)
        raise
    except Exception:
        self.log.exception("Crawl failed")
        raise
    finally:
        self.close()
def decide(
    self,
    left_id: StrIdent,
    right_id: StrIdent,
    judgement: Judgement,
    user: Optional[str] = None,
    score: Optional[float] = None,
) -> Identifier:
    target = super().decide(left_id, right_id, judgement, user=user, score=score)
    if judgement == Judgement.POSITIVE:
        with engine_tx() as conn:
            resolve_canonical(conn, self, target.id)
    return target
def store_log_event(logger, log_method, data: Dict[str, Any]) -> Dict[str, Any]:
    for key, value in data.items():
        if isinstance(value, _Element):
            value = tostring(value, pretty_print=False, encoding=str)
        if isinstance(value, Path):
            value = str(value.relative_to(settings.DATA_PATH))
        if isinstance(value, Schema):
            value = value.name
        data[key] = value
    dataset = data.get("dataset", None)
    level = data.get("level")
    if level is not None:
        level_num = getattr(logging, level.upper())
        if level_num > logging.INFO and dataset is not None:
            with engine_tx() as conn:
                save_issue(conn, data)
    return data
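# store_log_event has the (logger, method_name, event_dict) signature of a
# structlog processor. A hedged sketch of how such a processor is typically
# wired into the chain - the exact processor list in this project may differ:
import structlog

structlog.configure(
    processors=[
        structlog.stdlib.add_log_level,      # populates data["level"]
        store_log_event,                     # persists warnings/errors as issues
        structlog.processors.JSONRenderer(),
    ]
)

log = structlog.get_logger("demo")
log.warning("Fishy value encountered", dataset="example_dataset")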
def export_resource(
    self,
    path: Path,
    mime_type: Optional[str] = None,
    title: Optional[str] = None,
):
    """Register a file as a documented file exported by the dataset."""
    if mime_type is None:
        mime_type, _ = mimetypes.guess_type(path)
    digest = hashlib.sha1()
    size = 0
    with open(path, "rb") as fh:
        # Hash the file in 64 KiB chunks to keep memory use constant:
        while True:
            chunk = fh.read(65536)
            if not chunk:
                break
            size += len(chunk)
            digest.update(chunk)
    if size == 0:
        self.log.warning("Resource is empty", path=path)
    checksum = digest.hexdigest()
    name = path.relative_to(self.path).as_posix()
    with engine_tx() as conn:
        return save_resource(conn, name, self.dataset, checksum, mime_type, size, title)
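# mimetypes.guess_type() returns a (type, encoding) tuple keyed off the file
# extension; a quick illustration with hypothetical file names:
import mimetypes

print(mimetypes.guess_type("entities.ftm.json"))   # ('application/json', None)
print(mimetypes.guess_type("targets.simple.csv"))  # ('text/csv', None)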
def resolve():
    resolver = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, resolver)
def latest(dataset):
    ds = Dataset.require(dataset)
    with engine_tx() as conn:
        latest = max_last_seen(conn, ds)
        if latest is not None:
            print(latest.isoformat())