Example #1
def make_entities(db_path, outfile):
    # Source data: the raw SQLite dump, opened with the `dataset` SQL library.
    db = dataset.connect("sqlite:///%s" % db_path)
    # Target: an in-memory followthemoney store that aggregates entity fragments.
    store = Dataset("temp", database_uri="sqlite://")
    writer = store.bulk()
    write_edges(writer, db)
    write_addresses(writer, db)
    write_nodes(writer, db["entity"], "Company")
    write_nodes(writer, db["intermediary"])
    write_nodes(writer, db["officer"])

    for entity in store.iterate():
        write_object(outfile, entity)
Example #2
def _stream_collection(collection, N=50_000):
    fid = collection["foreign_id"]
    collection_id = collection["id"]
    cachefile = CACHEDIR / f"{fid}.json"
    if not cachefile.exists():
        # Cache miss: stream entities from the Aleph dataset, yield each one and
        # write it to a temporary file, then move that file into place below.
        cachefile.parent.mkdir(parents=True, exist_ok=True)
        cachefile_back = CACHEDIR / "tmp.json"
        dataset = Dataset(f"collection_{collection_id}", origin="aleph")
        with open(cachefile_back, "w+") as fd:
            for entity in islice(dataset.iterate(skip_errors=True), N):
                yield entity
                fd.write(json.dumps(entity.to_dict()))
                fd.write("\n")
        cachefile_back.rename(cachefile)
    else:
        with open(cachefile) as fd:
            for line in fd:
                yield json.loads(line)
Example #3
def handle(self, task):
    name = task.context.get("ftmstore", task.job.dataset.name)
    entity_ids = task.payload.get("entity_ids")
    dataset = Dataset(name, OP_TRANSLATE)
    try:
        writer = dataset.bulk()
        for entity in dataset.partials(entity_id=entity_ids):
            self.translate(writer, entity)
        writer.flush()
        self.dispatch_next(task, entity_ids)
    finally:
        dataset.close()
Example #4
class EntityEmitter(object):

    def __init__(self, context):
        self.fragment = 0
        self.log = context.log
        self.name = context.crawler.name
        self.dataset = Dataset(self.name, origin=ORIGIN)
        self.bulk = self.dataset.bulk()

    def make(self, schema):
        entity = model.make_entity(schema, key_prefix=self.name)
        return entity

    def emit(self, entity, rule='pass'):
        if entity.id is None:
            raise RuntimeError("Entity has no ID: %r" % entity)
        self.bulk.put(entity, fragment=str(self.fragment))
        self.fragment += 1

    def finalize(self):
        self.bulk.flush()
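
The snippet below is a hypothetical crawler step showing how the emitter above might be driven; the function name and the data fields are invented for illustration, while make(), emit() and finalize() are the methods defined in this example.

def crawl_company(context, data):
    # Build an entity, give it a stable ID, populate it and hand it to the store.
    emitter = EntityEmitter(context)
    company = emitter.make("Company")
    company.make_id(data["registration_number"])
    company.add("name", data["name"])
    emitter.emit(company)
    # Flush the buffered fragments once the crawler step is done.
    emitter.finalize()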
Example #5
    # Tail of parse_file(), whose body is truncated in this excerpt.
    except Exception:
        log.exception("Failed to parse: %r", member)


def parse_archive(writer, archive_path):
    log.info("Archive: %s", archive_path)
    tar = tarfile.open(archive_path, "r")
    while True:
        member = tar.next()
        if member is None:
            break
        fh = tar.extractfile(member)
        if fh is None:
            continue
        parse_file(writer, fh, member)
        fh.close()
    tar.close()
    writer.flush()


if __name__ == "__main__":
    prefix = "data/"
    dataset = Dataset("pa_companies", origin="parse")
    writer = dataset.bulk()
    for file_name in sorted(os.listdir(prefix)):
        file_path = os.path.join(prefix, file_name)
        parse_archive(writer, file_path)

    with open("panama.json", "w") as fh:
        for entity in dataset.iterate():
            write_object(fh, entity)
Example #6
def get_aggregator(collection, origin="aleph"):
    """Connect to a followthemoney dataset."""
    return Dataset(get_aggregator_name(collection), origin=origin)
Example #7
def get_dataset(name, origin, database_uri=None):
    database_uri = database_uri or settings.DATABASE_URI
    return Dataset(name, origin, database_uri=database_uri)
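
The examples above all follow the same write-then-read cycle on the Dataset objects that helpers like this one return. Below is a minimal sketch of that cycle, assuming the Dataset class is importable from the ftmstore package and using only the methods that appear in the examples above (bulk(), put(), flush(), iterate(), close()); the dataset name and the entity contents are invented for illustration.

from followthemoney import model
from ftmstore import Dataset

# Open (or create) a named dataset in the store; "demo_dataset" is a made-up name.
dataset = Dataset("demo_dataset", origin="example")
writer = dataset.bulk()

# Build a small Person entity with the followthemoney model.
person = model.make_entity("Person")
person.make_id("demo", "person", 1)
person.add("name", "Jane Doe")

# Write the entity as a fragment, then flush the buffered writes.
writer.put(person, fragment="0")
writer.flush()

# Read back the aggregated entities and print them as plain dicts.
for entity in dataset.iterate():
    print(entity.to_dict())

dataset.close()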
Example #8
def __init__(self, context):
    self.fragment = 0
    self.log = context.log
    self.name = context.crawler.name
    self.dataset = Dataset(self.name, origin=ORIGIN)
    self.bulk = self.dataset.bulk()