def load_ba_fixtures(config):
    """Seed the store with the BiH parliament CSV fixtures.

    The CSV/mapping parse is done only once per process: results are
    memoised in the module-level ``BA_FIXTURES`` cache. Every call then
    (re-)registers the source, grants the system role read access and
    indexes the loaded entities — presumably against a fresh database
    per test run (NOTE(review): confirm ``Source.ensure`` /
    ``Permission`` insertion are idempotent across repeated calls).
    """
    # Parse the fixture files only on the first call; BA_FIXTURES acts
    # as a process-wide cache shared across tests.
    if not BA_FIXTURES['entities']:
        with open(os.path.join(FIXTURES, 'ba.mapping.yaml'), 'rb') as fh:
            # safe_load: plain yaml.load() without an explicit Loader is
            # deprecated (PyYAML >= 5.1) and unsafe on untrusted input.
            mapping = yaml.safe_load(fh)
        mapper = Mapper(mapping, config.resolver, scope=config.base_uri)
        with open(os.path.join(FIXTURES, 'ba.csv'), 'rb') as csvfh:
            reader = unicodecsv.DictReader(csvfh)
            for row in reader:
                _, data = mapper.apply(row)
                BA_FIXTURES['entities'].append(data)
    source = Source.ensure({
        'slug': BA_SOURCE,
        'title': 'BiH Parliament',
        'url': 'http://foo.ba/'
    })
    # Grant the system role read-only access to the fixture source.
    permission = Permission()
    permission.role_id = Role.SYSTEM_USER
    permission.read = True
    permission.write = False
    permission.resource_id = source.id
    permission.resource_type = Permission.SOURCE
    session.add(permission)
    session.commit()
    # Persist every cached entity under the new source, then index.
    for entity in BA_FIXTURES['entities']:
        config.entities.save(entity['$schema'], entity, source_id=source.id)
    get_loom_indexer().index(source=BA_SOURCE)
def test_sa_term26_flatten(self):
    """Mapped rows from the term-26 CSV must flatten back to rows that
    retain the ``group_id`` and ``email`` columns."""
    mapping, uri = fixture_uri('everypol/mapping.json')
    resolver.store[uri] = mapping
    csvobj = fixture_file('everypol/term-26.csv')
    results = list(csv_mapper(csvobj, mapping, resolver))
    entities = [obj for (obj, err) in results]
    for flat in Mapper.flatten_iter(entities, mapping, resolver):
        assert 'group_id' in flat, flat
        assert 'email' in flat, flat
def csv_mapper(fileobj, mapping, resolver=None, scope=None):
    """ Given a CSV file object (fh), parse the file as a unicode CSV
    document, iterate over all rows of the data and map them to a JSON
    schema using the mapping instructions in ``mapping``. """
    # Imported locally to keep jsonmapping an optional dependency at
    # module import time.
    from jsonmapping import Mapper
    rows = unicodecsv.DictReader(fileobj)
    mapped = Mapper.apply_iter(rows, mapping, resolver=resolver, scope=scope)
    for item in mapped:
        yield item
def map_row(csvfile, mapfile, columns=None):
    """ Generator function that transforms a CSV row into a mapped
    dictionary object, one row at a time.

    ``csvfile`` may be a path (str) or an already-open file object;
    ``mapfile`` is the path of the JSON mapping; ``columns``, when
    given, restricts each row to the named columns before mapping.
    Blank/None values are dropped from the mapped output.
    """
    mapping = file_to_json(mapfile)
    resolver = RefResolver.from_schema(mapping)
    mapper = Mapper(mapping, resolver)

    def drop_blank(p, k, v):
        # remap() visitor: keep only entries carrying a real value.
        return v is not None and v != "" and not_empty(v)

    total_rows = row_count(csvfile)
    # BUGFIX: the original only assigned csvfp for str input, raising
    # NameError for file objects, and never closed the handle it opened.
    if isinstance(csvfile, str):
        csvfp = open(csvfile, 'r', encoding='utf-8-sig')
        opened_here = True
    else:
        csvfp = csvfile
        opened_here = False
    try:
        for row in tqdm(csv.DictReader(csvfp), total=total_rows):
            if columns:
                row = {key: value for key, value in row.items()
                       if key in columns}
            _, data = mapper.apply(row)
            yield remap(data, visit=drop_blank)
    finally:
        # Only close handles we opened ourselves; callers own theirs.
        if opened_here:
            csvfp.close()
def records(self, mapping_name):
    """Generate triplified statements for every row produced by the
    generator for ``mapping_name``, logging throughput every 10k rows."""
    spec_mapping = self.spec.get_mapping(mapping_name)
    mapper = SchemaMapper(spec_mapping, self.config.resolver,
                          scope=self.config.base_uri)
    schema = mapper.visitor.schema.get('id')
    started = time.time()
    emitted = 0
    for idx, row in enumerate(self.generator.generate(mapping_name)):
        _, data = mapper.apply(row)
        for stmt in self.config.entities.triplify(schema, data):
            emitted += 1
            yield stmt
        # Periodic progress report with per-record latency in ms.
        if idx > 0 and idx % 10000 == 0:
            elapsed = time.time() - started
            ms_per_record = (float(elapsed) / float(idx)) * 1000
            log.info("Generating %r: %s records (%s, %.2fms/r)",
                     mapping_name, idx, emitted, ms_per_record)
def csv_mapper(fileobj, mapping, resolver=None, scope=None):
    """ Given a CSV file object (fh), parse the file as a unicode CSV
    document, iterate over all rows of the data and map them to a JSON
    schema using the mapping instructions in ``mapping``. """
    reader = unicodecsv.DictReader(fileobj)
    results = Mapper.apply_iter(reader, mapping, resolver=resolver,
                                scope=scope)
    # Pass through the (row, err) pairs produced by the mapper.
    for pair in results:
        yield pair
def load_mapped_csv(graph, csv_uri, mapping, context_id=None):
    """ Load data from a CSV file, applying a JSON mapping and then
    adding it to the graph. """
    reader = unicodecsv.DictReader(read_uri(csv_uri))
    ctx = graph.context(identifier=context_id,
                        meta={'source_url': csv_uri})
    mapped = Mapper.apply_iter(reader, mapping, graph.resolver,
                               scope=graph.base_uri)
    for data, err in mapped:
        if err is None:
            ctx.add(data['$schema'], data)
        else:
            # Mapping failures are logged and skipped, not fatal.
            log.warning("Error loading %r: %r", csv_uri, err)
    ctx.save()