def crawl(self) -> None:
    """Run the crawler."""
    try:
        self.bind()
        Issue.clear(self.dataset)
        Resource.clear(self.dataset)
        db.session.commit()
        self.log.info("Begin crawl")
        # Run the dataset:
        self.dataset.method(self)
        self.flush()
        Statement.cleanup_dataset(self.dataset)
        self.log.info(
            "Crawl completed",
            entities=Statement.all_counts(dataset=self.dataset),
            targets=Statement.all_counts(dataset=self.dataset, target=True),
        )
    except KeyboardInterrupt:
        db.session.rollback()
        raise
    except LookupException as exc:
        db.session.rollback()
        self.log.error(exc.message, lookup=exc.lookup.name, value=exc.value)
    except Exception:
        db.session.rollback()
        self.log.exception("Crawl failed")
    finally:
        self.close()
def flush(self) -> None:
    """Emitted entities are decomposed into statements for the database to
    store. These are inserted in batches, so the statement cache on the
    context is flushed to the store here. Statements not yet flushed when a
    crawl is aborted are never persisted."""
    self.log.debug("Flushing statements to database...")
    Statement.upsert_many(list(self._statements.values()))
    self._statements = {}
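# The pattern flush() drains, reduced to a runnable core: emit() fills a keyed
# cache (see emit() below), duplicate keys overwrite rather than accumulate,
# and one batched write empties it. The names here are illustrative stand-ins,
# not part of the real Statement API.
_cache = {}
_cache[("ent-1", "name", "Jane")] = {"entity_id": "ent-1", "prop": "name", "value": "Jane"}
_cache[("ent-1", "name", "Jane")] = {"entity_id": "ent-1", "prop": "name", "value": "Jane"}
assert len(_cache) == 1  # the repeated emit collapsed into one statement
_batch, _cache = list(_cache.values()), {}  # stand-in for upsert_many + reset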
def query(
    self,
    dataset: Dataset,
    entity_id: Optional[str] = None,
    inverted_id: Optional[str] = None,
) -> Generator[CachedEntity, None, None]:
    """Query the statement table for the given dataset and entity ID and
    yield cached entities: for each canonical ID, a tuple of its type (BASE)
    statements paired with a tuple of its property statements."""
    canonical_id = None
    if entity_id is not None:
        canonical_id = self.resolver.get_canonical(entity_id)
    inverted_ids = None
    if inverted_id is not None:
        inverted_ids = self.resolver.get_referents(inverted_id)
    current_id = None
    types: List[CachedType] = []
    props: List[CachedProp] = []
    q = Statement.all_statements(
        dataset=dataset,
        canonical_id=canonical_id,
        inverted_ids=inverted_ids,
    )
    for stmt in q:
        if stmt.canonical_id != current_id:
            if len(types):
                yield (tuple(types), tuple(props))
            types = []
            props = []
            current_id = stmt.canonical_id
        if stmt.prop == Statement.BASE:
            types.append(CachedType(stmt))
        else:
            props.append(CachedProp(stmt))
    if len(types):
        yield (tuple(types), tuple(props))
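# Hedged consumption sketch for query() above; `database` and `dataset` are
# assumed to be a live Database and Dataset. The only contract used is the one
# visible in query(): each yielded item pairs the BASE (type) statements with
# the property statements of a single canonical entity.
def count_cached_entities(database, dataset):
    total = 0
    for types, props in database.query(dataset):
        total += 1  # one (types, props) pair per canonical_id
    return total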
def decide(
    self,
    left_id: StrIdent,
    right_id: StrIdent,
    judgement: Judgement,
    user: Optional[str] = None,
    score: Optional[float] = None,
) -> Identifier:
    target = super().decide(left_id, right_id, judgement, user=user, score=score)
    if judgement == Judgement.POSITIVE:
        Statement.resolve(self, target.id)
    return target
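# Hedged sketch of decide() in use: a POSITIVE judgement merges two ids, and
# the Statement.resolve() call above then re-points stored statements at the
# merged canonical id. The ids and user name are illustrative only.
def merge_entities(resolver, left_id, right_id):
    target = resolver.decide(left_id, right_id, Judgement.POSITIVE, user="analyst")
    return target.id  # queries by this id now cover both source entities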
def export_global_index():
    """Export the global index for all datasets."""
    datasets = []
    for dataset in Dataset.all():
        datasets.append(dataset.to_index())

    issues_path = settings.DATASET_PATH.joinpath("issues.json")
    log.info("Writing global issues list", path=issues_path)
    with open(issues_path, "w", encoding=settings.ENCODING) as fh:
        # Serialize issues to plain dicts before writing; the ORM objects
        # returned by the query are not JSON-serializable as-is. This assumes
        # Issue exposes a to_dict(), mirroring Resource.to_dict() used below.
        data = {"issues": [issue.to_dict() for issue in Issue.query()]}
        write_json(data, fh)

    index_path = settings.DATASET_PATH.joinpath("index.json")
    log.info("Writing global index", datasets=len(datasets), path=index_path)
    with open(index_path, "w", encoding=settings.ENCODING) as fh:
        meta = {
            "datasets": datasets,
            "run_time": settings.RUN_TIME,
            "dataset_url": settings.DATASET_URL,
            "issues_url": urljoin(settings.DATASET_URL, "issues.json"),
            "model": model,
            "schemata": Statement.all_schemata(),
            "app": "opensanctions",
            "version": settings.VERSION,
        }
        write_json(meta, fh)
def get_schemata(dataset: Dataset) -> List[Schema]:
    schemata: List[Schema] = []
    names = Statement.all_schemata(dataset=dataset)
    for name in names:
        schema = model.get(name)
        if schema is not None:
            schemata.append(schema)
    return schemata
def check_candidate(self, left: Identifier, right: Identifier) -> bool:
    if not super().check_candidate(left, right):
        return False
    lefts = [c.id for c in self.connected(left)]
    rights = [c.id for c in self.connected(right)]
    # If merging the two clusters would combine statements flagged as unique
    # by a source, refuse the candidate and record an automatic negative
    # decision so it is not proposed again.
    if Statement.unique_conflict(lefts, rights):
        self.decide(left, right, Judgement.NEGATIVE, user="******")
        return False
    return True
def get_target_countries(self) -> List[Dict[str, Any]]:
    countries = []
    for code, count in Statement.agg_target_by_country(dataset=self):
        result = {
            "code": code,
            "count": count,
            "label": registry.country.caption(code),
        }
        countries.append(result)
    return countries
def to_index(self) -> Dict[str, Any]:
    meta = self.to_dict()
    meta["index_url"] = self.make_public_url("index.json")
    meta["issues_url"] = self.make_public_url("issues.json")
    meta["issue_levels"] = Issue.agg_by_level(dataset=self)
    meta["issue_count"] = sum(meta["issue_levels"].values())
    meta["target_count"] = Statement.all_counts(dataset=self, target=True)
    meta["last_change"] = Statement.max_last_seen(dataset=self)
    meta["last_export"] = settings.RUN_TIME
    meta["targets"] = {
        "countries": self.get_target_countries(),
        "schemata": self.get_target_schemata(),
    }
    meta["resources"] = []
    for resource in Resource.query(dataset=self):
        res = resource.to_dict()
        res["url"] = self.make_public_url(resource.path)
        meta["resources"].append(res)
    return meta
def export_pairs(dataset: Dataset):
    resolver = get_resolver()
    db = Database(dataset, resolver, cached=True)
    datasets: Dict[str, Set[Dataset]] = defaultdict(set)
    for entity_id, ds in Statement.entities_datasets(dataset):
        dsa = Dataset.get(ds)
        if dsa is not None:
            datasets[entity_id].add(dsa)

    def get_parts(entity_id):
        canonical_id = resolver.get_canonical(entity_id)
        for ref in resolver.get_referents(canonical_id):
            for ds in datasets.get(ref, []):
                yield ref, ds

    pairs: Dict[Tuple[Tuple[str, Dataset], Tuple[str, Dataset]], Judgement] = {}
    for canonical_id in resolver.canonicals():
        parts = list(get_parts(canonical_id))
        for left, right in combinations(parts, 2):
            left, right = max(left, right), min(left, right)
            pairs[(left, right)] = Judgement.POSITIVE
        for edge in resolver.nodes[canonical_id]:
            if edge.judgement == Judgement.NEGATIVE:
                source_canonical = resolver.get_canonical(edge.source)
                other = edge.target if source_canonical == canonical_id else edge.source
                for other_part in get_parts(other):
                    for part in parts:
                        # Order the pair into a local variable: reassigning
                        # the loop variables here would clobber `other_part`
                        # for subsequent iterations.
                        pair = (max(part, other_part), min(part, other_part))
                        pairs[pair] = Judgement.NEGATIVE

    def get_partial(spec):
        part_id, ds = spec
        loader = db.view(ds)
        canonical = resolver.get_canonical(part_id)
        entity = loader.get_entity(canonical)
        if entity is not None:
            return entity.to_nested_dict(loader)

    for (left, right), judgement in pairs.items():
        left_entity = get_partial(left)
        right_entity = get_partial(right)
        if left_entity is not None and right_entity is not None:
            yield {
                "left": left_entity,
                "right": right_entity,
                "judgement": judgement,
            }
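# The max()/min() shuffle in export_pairs, shown in isolation: ordering each
# pair before using it as a dict key makes (a, b) and (b, a) collapse into the
# same entry, so each unordered pair carries exactly one judgement.
def _ordered_pair(a, b):
    return (max(a, b), min(a, b))

assert _ordered_pair("q-1", "q-2") == _ordered_pair("q-2", "q-1")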
def get_target_schemata(self) -> List[Dict[str, Any]]:
    schemata = []
    for name, count in Statement.agg_target_by_schema(dataset=self):
        schema = model.get(name)
        if schema is None:
            continue
        result = {
            "name": name,
            "count": count,
            "label": schema.label,
            "plural": schema.plural,
        }
        schemata.append(result)
    return schemata
def emit(
    self,
    entity: Entity,
    target: Optional[bool] = None,
    unique: bool = False,
) -> None:
    """Send an FtM entity to the store."""
    if entity.id is None:
        raise ValueError("Entity has no ID: %r" % entity)
    if target is not None:
        entity.target = target
    statements = Statement.from_entity(
        entity, self.dataset, self.resolver, unique=unique
    )
    if not len(statements):
        raise ValueError("Entity has no properties: %r" % entity)
    for stmt in statements:
        # Key on (entity_id, prop, value) so re-emitting the same value
        # overwrites the cached statement instead of duplicating it.
        key = (stmt["entity_id"], stmt["prop"], stmt["value"])
        self._statements[key] = stmt
    if len(self._statements) >= db.batch_size:
        self.flush()
    self.log.debug("Emitted", entity=entity)
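# Hedged usage sketch for emit(): roughly how a dataset's crawler method (the
# `self.dataset.method(self)` call in crawl()) hands entities to the store.
# The construction helpers `context.make()` and `context.make_slug()` are
# assumed from the wider Context API; they are not shown in this section.
def crawl_example(context):
    entity = context.make("Person")  # assumed helper returning an Entity
    entity.id = context.make_slug("jane", "doe")  # emit() requires a non-None id
    entity.add("name", "Jane Doe")
    context.emit(entity, target=True)  # cached; persisted on the next flush()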
def __len__(self) -> int:
    return Statement.all_ids(self.dataset).count()
def clear(self) -> None:
    """Delete all recorded data for a given dataset."""
    Issue.clear(self.dataset)
    Statement.clear(self.dataset)
    db.session.commit()