def test_multiple_keys(self):
    mapping = model.make_mapping({
        "csv_url": 'http://pets.com',
        "entities": {
            "test": {
                "schema": "Person",
                "key": ['b', 'a']
            }
        }
    })
    entities = mapping.map({'a': 'aaa', 'b': 'bbb'})
    ent0 = entities.get('test')
    assert ent0.id == sha1(b'aaabbb').hexdigest(), ent0

    mapping = model.make_mapping({
        "csv_url": 'http://pets.com',
        "entities": {
            "test": {
                "schema": "Person",
                "key": ['a', 'b']
            }
        }
    })
    entities = mapping.map({'a': 'aaa', 'b': 'bbb'})
    ent0 = entities.get('test')
    assert ent0.id == sha1(b'aaabbb').hexdigest(), ent0
def load_query():
    try:
        query = request.json.get("mapping_query", {})
        # just for validation
        model.make_mapping({"entities": query})
    except Exception as ex:
        log.exception("Validation error: %s", request.json)
        raise BadRequest(str(ex))
    return query
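# Illustrative only: a hedged sketch of the kind of "mapping_query" payload that
# load_query() above would accept. The shape mirrors the entity specs used in the
# tests in this section (schema, key); the "properties" block and the column names
# are assumptions made for the example, not taken from this code.
EXAMPLE_MAPPING_QUERY = {
    "company": {
        "schema": "Company",
        "key": ["comp.id"],
        "properties": {
            "name": {"column": "comp.name"},  # hypothetical source column
        },
    }
}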
def mapping(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    queue_task(collection, OP_BULKLOAD, payload=data)
    return ('', 202)
def mapping_process(id):
    collection = get_db_collection(id, request.authz.WRITE)
    require(request.authz.is_admin)
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in dict_list(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
            bulk_load_query.apply_async([collection.id, query], priority=6)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    return ('', 204)
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    entities = {}
    total = 0
    for idx, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity_id = entity.get('id')
            if entity_id is None:
                continue
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            base = entities.get(entity_id, {})
            entities[entity_id] = merge_data(entity, base)
            total += 1
        if idx % 1000 == 0:
            log.info("[%s] Loaded %s records, %s entities...",
                     collection.foreign_id, idx, total)
        if len(entities) >= BULK_PAGE:
            index_bulk(collection, entities, chunk_size=BULK_PAGE)
            entities = {}
    if len(entities):
        index_bulk(collection, entities, chunk_size=BULK_PAGE)
    # Update collection stats
    index_collection(collection)
def map_to_aggregator(collection, mapping, aggregator):
    table = get_entity(mapping.table_id)
    if table is None:
        table = aggregator.get(mapping.table_id)
    if table is None:
        raise RuntimeError("Table cannot be found: %s" % mapping.table_id)
    config = {"csv_url": _get_table_csv_link(table), "entities": mapping.query}
    mapper = model.make_mapping(config, key_prefix=collection.foreign_id)
    origin = mapping_origin(mapping.id)
    aggregator.delete(origin=origin)
    writer = aggregator.bulk()
    idx = 0
    for idx, record in enumerate(mapper.source.records, 1):
        if idx > 0 and idx % 1000 == 0:
            log.info("[%s] Mapped %s rows ...", mapping.id, idx)
        for entity in mapper.map(record).values():
            entity.context = mapping.get_proxy_context()
            if entity.schema.is_a("Thing"):
                entity.add("proof", mapping.table_id)
            entity = collection.ns.apply(entity)
            entity = remove_checksums(entity)
            writer.put(entity, fragment=idx, origin=origin)
            if mapping.entityset is not None:
                EntitySetItem.save(
                    mapping.entityset,
                    entity.id,
                    collection_id=collection.id,
                    added_by_id=mapping.role_id,
                )
    writer.flush()
    log.info("[%s] Mapping done (%s rows)", mapping.id, idx)
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1
        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, records_index, records_total,
                     entities_count)
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}
    index.index_bulk(collection.id, entities)
    # Update collection stats
    index_collection(collection)
def stream_mapping(infile: Path, outfile: Path, mapping_yaml: Path,
                   sign: bool = True) -> None:
    queries: List[Tuple[str, QueryMapping]] = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, "queries", "query"):
            data.pop("database", None)
            data["csv_url"] = "/dev/null"
            query = model.make_mapping(data, key_prefix=dataset)
            queries.append((dataset, query))
    try:
        with path_writer(outfile) as outfh:
            with input_file(infile) as fh:
                for record in CSVSource.read_csv(fh):
                    for (dataset, query) in queries:
                        ns = Namespace(dataset)
                        if query.source.check_filters(record):  # type: ignore
                            entities = query.map(record)
                            for entity in entities.values():
                                if sign:
                                    entity = ns.apply(entity)
                                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
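# Illustrative only: an assumed shape for the mapping YAML consumed by
# stream_mapping() above, shown as the dict that load_mapping_file() would
# presumably return. Top-level keys are dataset names and each "queries" entry
# is what gets passed to model.make_mapping(); the dataset name, column names
# and property names below are invented for the example.
EXAMPLE_MAPPING_CONFIG = {
    "example_dataset": {
        "queries": [
            {
                # popped by stream_mapping() and replaced with csv_url="/dev/null"
                "database": "postgresql://localhost/example",
                "entities": {
                    "person": {
                        "schema": "Person",
                        "key": ["person_id"],
                        "properties": {"name": {"column": "person_name"}},
                    }
                },
            }
        ]
    }
}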
def bulk_load_query(queue, collection, query_id, query):
    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source)
    if records_total:
        queue.progress.mark_pending(records_total)
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entities_count = 0
    for idx, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            entities_count += 1
            fragment = '%s-%s' % (query_id, idx)
            writer.put(entity, fragment=fragment)
        if idx > 0 and idx % 1000 == 0:
            queue.progress.mark_finished(1000)
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, idx,
                     records_total or 'streaming', entities_count)
    writer.flush()
    aggregator.close()
    log.info("[%s] Query done (%s entities)",
             collection.foreign_id, entities_count)
def map_to_aggregator(collection, mapping, aggregator):
    table = get_entity(mapping.table_id)
    if table is None:
        table = aggregator.get(mapping.table_id)
    if table is None:
        raise RuntimeError("Table cannot be found: %s" % mapping.table_id)
    config = {
        'csv_url': _get_table_csv_link(table),
        'entities': mapping.query
    }
    mapper = model.make_mapping(config, key_prefix=collection.foreign_id)
    origin = mapping_origin(mapping.id)
    aggregator.delete(origin=origin)
    writer = aggregator.bulk()
    # Ensure idx is defined for the final log line even if there are no records.
    idx = 0
    for idx, record in enumerate(mapper.source.records, 1):
        if idx > 0 and idx % 1000 == 0:
            log.info("[%s] Mapped %s rows ...", mapping.id, idx)
        for entity in mapper.map(record).values():
            entity.context = mapping.get_proxy_context()
            entity.context['mutable'] = True
            if entity.schema.is_a('Thing'):
                entity.add('proof', mapping.table_id)
            entity = collection.ns.apply(entity)
            entity = remove_checksums(entity)
            writer.put(entity, fragment=idx, origin=origin)
    writer.flush()
    log.info("[%s] Mapping done (%s rows)", mapping.id, idx)
def mapping_process(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    # TODO: we need to look into possible abuse of mapping load path for local
    # path access on the machine running the mapping. Until then, this action
    # must be restricted to admins:
    require(request.authz.is_admin)
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
            bulk_load_query.apply_async([collection.id, query], priority=6)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    return ('', 204)
def mapping_process(id):
    collection = get_db_collection(id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    # TODO: we need to look into possible abuse of mapping load path for local
    # path access on the machine running the mapping. Until then, this action
    # must be restricted to admins:
    require(request.authz.is_admin)
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
            bulk_load_query.apply_async([collection.id, query], priority=6)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    return ('', 204)
def test_multiple_keys(self):
    mapping = model.make_mapping(
        {
            "csv_url": "http://pets.com",
            "entities": {"test": {"schema": "Person", "key": ["b", "a"]}},
        }
    )
    entities = mapping.map({"a": "aaa", "b": "bbb"})
    ent0 = entities.get("test")
    assert ent0.id == sha1(b"aaabbb").hexdigest(), ent0

    mapping = model.make_mapping(
        {
            "csv_url": "http://pets.com",
            "entities": {"test": {"schema": "Person", "key": ["a", "b"]}},
        }
    )
    entities = mapping.map({"a": "aaa", "b": "bbb"})
    ent0 = entities.get("test")
    assert ent0.id == sha1(b"aaabbb").hexdigest(), ent0
def test_key_column_from_sql(self): mapping = self.kek_mapping del mapping["entities"]["company"]["keys"] mapping["entities"]["company"]["id_column"] = "comp.id" mapped = model.make_mapping(mapping) assert len(mapped.source) == 2904, len(mapped.source) assert len(mapped.entities) == 3, mapped.entities assert len(mapped.refs) == 7, mapped.refs entities = list(model.map_entities(mapping)) self.assertGreaterEqual(int(entities[0].id), 3000) # FIXME?
def test_key_column_from_sql(self):
    mapping = self.kek_mapping
    del mapping['entities']['company']['keys']
    mapping['entities']['company']['id_column'] = 'comp.id'
    mapped = model.make_mapping(mapping)
    assert len(mapped.source) == 2904, len(mapped.source)
    assert len(mapped.entities) == 3, mapped.entities
    assert len(mapped.refs) == 7, mapped.refs
    entities = list(model.map_entities(mapping))
    self.assertGreaterEqual(int(entities[0].id), 3000)  # FIXME?
def make_mapper(collection, mapping):
    table = get_entity(mapping.table_id)
    properties = table.get('properties', {})
    csv_hash = first(properties.get('csvHash'))
    if csv_hash is None:
        raise RuntimeError("Source table doesn't have a CSV version")
    url = archive.generate_url(csv_hash)
    if not url:
        local_path = archive.load_file(csv_hash)
        if local_path is not None:
            url = local_path.as_posix()
    if url is None:
        raise RuntimeError("Could not generate CSV URL for the table")
    data = {'csv_url': url, 'entities': mapping.query}
    return model.make_mapping(data, key_prefix=collection.foreign_id)
def test_key_literal(self):
    mapping = model.make_mapping({
        "csv_url": 'http://pets.com',
        "entities": {
            "test": {
                "schema": "Person",
                "key_literal": "test",
                "key": ['a', 'b']
            }
        }
    })
    entities = mapping.map({})
    assert len(entities) == 0, entities.keys()
    entities = mapping.map({'a': 'aaa', 'b': 'bbb'})
    ent0 = entities.get('test')
    assert ent0.id == sha1(b'testaaabbb').hexdigest(), ent0
def test_key_generation(self):
    mapping = model.make_mapping(
        {
            "csv_url": "http://pets.com",
            "entities": {"test": {"schema": "Person", "key": "id"}},
        }
    )
    for ent in mapping.entities:
        seed = ent.seed.hexdigest()
        assert seed == sha1(b"").hexdigest(), seed
    entities = mapping.map({})
    assert len(entities) == 0, entities.keys()
    entities = mapping.map({"id": "foo"})
    assert len(entities) == 1, entities.keys()
    ent0 = entities.get("test")
    assert ent0.id == sha1(b"foo").hexdigest(), ent0
def stream_mapping(infile, outfile, mapping_yaml):
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, 'queries', 'query'):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append(source)
    try:
        for record in StreamSource.read_csv(infile):
            for source in sources:
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def test_kek_map_single(self):
    mapping = model.make_mapping(self.kek_mapping)
    assert len(mapping.entities) == 3, mapping.entities
    assert len(mapping.refs) == 7, mapping.refs
    record = {
        'comp.id': 4,
        'sub.id': '7.4',
        'comp.name': 'Pets.com Ltd',
        'shares.share': '40%',
        'comp.url': 'https://pets.com',
        'sub.name': 'DogFood Sales Corp.',
        'comp.address': '10 Broadstreet, 20388 Washington, DC'
    }
    entities = mapping.map(record)
    assert len(entities) == 3, entities.keys()
    company = entities.get('company')
    assert company['id'], company
    assert record['comp.name'] in company['properties']['name'], company
def test_key_literal(self):
    mapping = model.make_mapping(
        {
            "csv_url": "http://pets.com",
            "entities": {
                "test": {
                    "schema": "Person",
                    "key_literal": "test",
                    "key": ["a", "b"],
                }
            },
        }
    )
    entities = mapping.map({})
    assert len(entities) == 0, entities.keys()
    entities = mapping.map({"a": "aaa", "b": "bbb"})
    ent0 = entities.get("test")
    assert ent0.id == sha1(b"testaaabbb").hexdigest(), ent0
def test_kek_map_single(self):
    mapping = model.make_mapping(self.kek_mapping)
    assert len(mapping.source) == 2904, len(mapping.source)
    assert len(mapping.entities) == 3, mapping.entities
    assert len(mapping.refs) == 7, mapping.refs
    record = {
        "comp.id": 4,
        "sub.id": "7.4",
        "comp.name": "Pets.com Ltd",
        "shares.share": "40%",
        "comp.url": "https://pets.com",
        "sub.name": "DogFood Sales Corp.",
        "comp.address": "10 Broadstreet, 20388 Washington, DC",
    }
    entities = mapping.map(record)
    assert len(entities) == 3, entities.keys()
    company = entities.get("company")
    assert company.id, company
    assert record["comp.name"] in company.get("name"), company
def test_key_generation(self):
    mapping = model.make_mapping({
        "csv_url": 'http://pets.com',
        "entities": {
            "test": {
                "schema": "Person",
                "key": "id"
            }
        }
    })
    for ent in mapping.entities:
        seed = ent.seed.hexdigest()
        assert seed == sha1(b'').hexdigest(), seed
    entities = mapping.map({})
    assert len(entities) == 0, entities.keys()
    entities = mapping.map({'id': 'foo'})
    assert len(entities) == 1, entities.keys()
    ent0 = entities.get('test')
    assert ent0.id == sha1(b'foo').hexdigest(), ent0
def stream_mapping(infile, outfile, mapping_yaml, sign=True):
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, "queries", "query"):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append((dataset, source))
    try:
        for record in StreamSource.read_csv(infile):
            for (dataset, source) in sources:
                ns = Namespace(dataset)
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        if sign:
                            entity = ns.apply(entity)
                        write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def stream_mapping(mapping_yaml):
    stdin = click.get_text_stream('stdin')
    stdout = click.get_text_stream('stdout')
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, 'queries', 'query'):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append(source)
    try:
        for record in StreamSource.read_csv(stdin):
            for source in sources:
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        # emit the mapped entity to stdout
                        write_object(stdout, entity)
    except BrokenPipeError:
        raise click.Abort()
def bulk_load_query(collection_id, query): collection = Collection.by_id(collection_id) if collection is None: log.warning("Collection does not exist: %s", collection_id) return namespace = Namespace(collection.foreign_id) mapping = model.make_mapping(query, key_prefix=collection.foreign_id) records_total = len(mapping.source) or 'streaming' entities = {} entities_count = 0 for records_index, record in enumerate(mapping.source.records, 1): for entity in mapping.map(record).values(): entity = namespace.apply(entity) # When loading from a tabular data source, we will often # encounter mappings where the same entity is emitted # multiple times in short sequence, e.g. when the data # describes all the directors of a single company. if entity.id in entities: entities[entity.id].merge(entity) else: entities[entity.id] = entity entities_count += 1 if records_index > 0 and records_index % 1000 == 0: log.info("[%s] Loaded %s records (%s), %s entities...", collection.foreign_id, records_index, records_total, entities_count) if len(entities) >= BULK_PAGE: index.index_bulk(collection.id, entities) entities = {} index.index_bulk(collection.id, entities) refresh_collection(collection)
def make_mapper(collection, mapping):
    url = get_table_csv_link(mapping.table_id)
    data = {'csv_url': url, 'entities': mapping.query}
    return model.make_mapping(data, key_prefix=collection.foreign_id)