def make_entities(db_path, outfile):
    db = dataset.connect("sqlite:///%s" % db_path)
    store = Dataset("temp", database_uri="sqlite://")
    writer = store.bulk()
    write_edges(writer, db)
    write_addresses(writer, db)
    write_nodes(writer, db["entity"], "Company")
    write_nodes(writer, db["intermediary"])
    write_nodes(writer, db["officer"])
    for entity in store.iterate():
        write_object(outfile, entity)
def _stream_collection(collection, N=50_000):
    fid = collection["foreign_id"]
    collection_id = collection["id"]
    cachefile = CACHEDIR / f"{fid}.json"
    if not cachefile.exists():
        cachefile.parent.mkdir(parents=True, exist_ok=True)
        cachefile_back = CACHEDIR / "tmp.json"
        dataset = Dataset(f"collection_{collection_id}", origin="aleph")
        # Stream entities from the dataset while writing them to a temporary
        # cache file, then move the file into place once the pass completes.
        with open(cachefile_back, "w+") as fd:
            for entity in islice(dataset.iterate(skip_errors=True), N):
                yield entity
                fd.write(json.dumps(entity.to_dict()))
                fd.write("\n")
        cachefile_back.rename(cachefile)
    else:
        # Replay the cached records as plain dicts on subsequent calls.
        with open(cachefile) as fd:
            for line in fd:
                yield json.loads(line)
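# Usage sketch (hypothetical values): `_stream_collection` expects an
# Aleph-style collection dict with "foreign_id" and "id" keys and a
# module-level CACHEDIR Path. Note that the first pass yields entity
# proxies while replays from the cache yield plain dicts.
collection = {"foreign_id": "example_leak", "id": 1}
for record in _stream_collection(collection, N=1_000):
    ...  # process each streamed entity / cached record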
def handle(self, task):
    name = task.context.get("ftmstore", task.job.dataset.name)
    entity_ids = task.payload.get("entity_ids")
    dataset = Dataset(name, OP_TRANSLATE)
    try:
        writer = dataset.bulk()
        for entity in dataset.partials(entity_id=entity_ids):
            self.translate(writer, entity)
        writer.flush()
        self.dispatch_next(task, entity_ids)
    finally:
        dataset.close()
class EntityEmitter(object):
    def __init__(self, context):
        self.fragment = 0
        self.log = context.log
        self.name = context.crawler.name
        self.dataset = Dataset(self.name, origin=ORIGIN)
        self.bulk = self.dataset.bulk()

    def make(self, schema):
        entity = model.make_entity(schema, key_prefix=self.name)
        return entity

    def emit(self, entity, rule='pass'):
        if entity.id is None:
            raise RuntimeError("Entity has no ID: %r" % entity)
        self.bulk.put(entity, fragment=str(self.fragment))
        self.fragment += 1

    def finalize(self):
        self.bulk.flush()
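# Usage sketch (assumptions: a crawler-style `context` object exposing
# `context.log` and `context.crawler.name`, plus a `data` dict; the schema
# and property names below are illustrative, not taken from the class above).
def example_stage(context, data):
    emitter = EntityEmitter(context)
    company = emitter.make("Company")
    company.make_id("company", data.get("registration_number"))  # derive a stable ID
    company.add("name", data.get("name"))
    emitter.emit(company)
    emitter.finalize()  # flush the bulk writer to the ftmstore dataset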
    except Exception:
        log.exception("Failed to parse: %r", member)


def parse_archive(writer, archive_path):
    log.info("Archive: %s", archive_path)
    tar = tarfile.open(archive_path, "r")
    while True:
        member = tar.next()
        if member is None:
            break
        fh = tar.extractfile(member)
        if fh is None:
            continue
        parse_file(writer, fh, member)
        fh.close()
    writer.flush()


if __name__ == "__main__":
    prefix = "data/"
    dataset = Dataset("pa_companies", origin="parse")
    writer = dataset.bulk()
    for file_name in sorted(os.listdir(prefix)):
        file_path = os.path.join(prefix, file_name)
        parse_archive(writer, file_path)
    with open("panama.json", "w") as fh:
        for entity in dataset.iterate():
            write_object(fh, entity)
def get_aggregator(collection, origin="aleph"):
    """Connect to a followthemoney dataset."""
    return Dataset(get_aggregator_name(collection), origin=origin)
def get_dataset(name, origin, database_uri=None):
    database_uri = database_uri or settings.DATABASE_URI
    return Dataset(name, origin, database_uri=database_uri)
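# Usage sketch (assumptions: `settings.DATABASE_URI` is configured and the
# followthemoney `model` is importable; the dataset name, origin, and entity
# values below are illustrative). It only uses calls shown in the snippets
# above: bulk(), put(), flush(), iterate(), close().
from followthemoney import model

dataset = get_dataset("example_dataset", origin="example")
writer = dataset.bulk()
person = model.make_entity("Person")
person.make_id("example", "person", 1)
person.add("name", "Jane Doe")
writer.put(person, fragment="0")
writer.flush()
for entity in dataset.iterate():
    print(entity.id, entity.schema.name)
dataset.close()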