Example No. 1
import dataset  # SQL toolkit used to read the source database
from ftmstore import Dataset  # followthemoney-store
from followthemoney.cli.util import write_object


def make_entities(db_path, outfile):
    # write_edges, write_addresses and write_nodes are helpers defined
    # elsewhere in this module.
    db = dataset.connect("sqlite:///%s" % db_path)
    store = Dataset("temp", database_uri="sqlite://")
    writer = store.bulk()
    write_edges(writer, db)
    write_addresses(writer, db)
    write_nodes(writer, db["entity"], "Company")
    write_nodes(writer, db["intermediary"])
    write_nodes(writer, db["officer"])
    # Flush buffered fragments before reading merged entities back out;
    # the other examples on this page flush before iterating as well.
    writer.flush()

    for entity in store.iterate():
        write_object(outfile, entity)
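Two different "dataset" names are in play above: the dataset SQL toolkit reads the source database, while ftmstore's Dataset collects the generated entity fragments. A minimal invocation sketch; the file names here are hypothetical, not from the original:

with open("entities.json", "w") as outfile:
    make_entities("source.sqlite", outfile)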
Example No. 2
def handle(self, task):
    # Worker method: the task names the ftmstore dataset and the IDs of
    # the entities to be translated. OP_TRANSLATE is an origin constant
    # defined in the surrounding module.
    name = task.context.get("ftmstore", task.job.dataset.name)
    entity_ids = task.payload.get("entity_ids")
    dataset = Dataset(name, OP_TRANSLATE)
    try:
        writer = dataset.bulk()
        # partials() yields stored fragments for the given entity IDs
        # rather than fully aggregated entities.
        for entity in dataset.partials(entity_id=entity_ids):
            self.translate(writer, entity)
        writer.flush()
        self.dispatch_next(task, entity_ids)
    finally:
        dataset.close()
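The translate() hook called above is not shown in the snippet. A hypothetical sketch of its shape, reusing the put(entity, fragment=...) call that appears in Example No. 3 below; the body and the fragment key are assumptions:

def translate(self, writer, entity):
    # ... run translation over the entity's text properties here ...
    writer.put(entity, fragment="translated")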
Example No. 3
class EntityEmitter(object):
    """Helper that buffers crawled entities into an ftmstore dataset."""

    def __init__(self, context):
        self.fragment = 0
        self.log = context.log
        self.name = context.crawler.name
        self.dataset = Dataset(self.name, origin=ORIGIN)
        self.bulk = self.dataset.bulk()

    def make(self, schema):
        entity = model.make_entity(schema, key_prefix=self.name)
        return entity

    def emit(self, entity, rule='pass'):
        if entity.id is None:
            raise RuntimeError("Entity has no ID: %r" % entity)
        # Each put gets a distinct fragment key so repeated emissions
        # don't overwrite one another.
        self.bulk.put(entity, fragment=str(self.fragment))
        self.fragment += 1

    def finalize(self):
        self.bulk.flush()
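A usage sketch built only from the methods defined above, assuming make() returns a followthemoney entity proxy whose make_id() and add() methods are used here; "context" stands in for whatever crawler context the caller provides, and "LegalEntity" is just an example schema name:

emitter = EntityEmitter(context)
entity = emitter.make("LegalEntity")
entity.make_id("acme", "123")  # derive a stable entity ID before emitting
entity.add("name", "ACME Corp.")
emitter.emit(entity)
emitter.finalize()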
Example No. 4
import os
import logging
import tarfile

from ftmstore import Dataset  # followthemoney-store
from followthemoney.cli.util import write_object

log = logging.getLogger(__name__)


def parse_file(writer, fh, member):
    try:
        ...  # parsing logic elided in the original snippet
    except Exception:
        log.exception("Failed to parse: %r", member)


def parse_archive(writer, archive_path):
    log.info("Archive: %s", archive_path)
    # Stream members out of the tarball one at a time; directories and
    # other non-file members yield no file handle and are skipped.
    with tarfile.open(archive_path, "r") as tar:
        while True:
            member = tar.next()
            if member is None:
                break
            fh = tar.extractfile(member)
            if fh is None:
                continue
            parse_file(writer, fh, member)
            fh.close()
    writer.flush()


if __name__ == "__main__":
    prefix = "data/"
    dataset = Dataset("pa_companies", origin="parse")
    writer = dataset.bulk()
    for file_name in sorted(os.listdir(prefix)):
        file_path = os.path.join(prefix, file_name)
        parse_archive(writer, file_path)

    # iterate() returns entities aggregated from all stored fragments.
    with open("panama.json", "w") as fh:
        for entity in dataset.iterate():
            write_object(fh, entity)
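Since write_object() writes one JSON object per line, the output file can be spot-checked with the standard library alone. A sketch, assuming that line-delimited format:

import json

with open("panama.json") as fh:
    for line in fh:
        entity = json.loads(line)
        print(entity.get("schema"), entity.get("id"))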