Example #1
    def test_multiple_keys(self):
        mapping = model.make_mapping({
            "csv_url": 'http://pets.com',
            "entities": {
                "test": {
                    "schema": "Person",
                    "key": ['b', 'a']
                }
            }
        })
        entities = mapping.map({'a': 'aaa', 'b': 'bbb'})
        ent0 = entities.get('test')
        assert ent0.id == sha1(b'aaabbb').hexdigest(), ent0

        mapping = model.make_mapping({
            "csv_url": 'http://pets.com',
            "entities": {
                "test": {
                    "schema": "Person",
                    "key": ['a', 'b']
                }
            }
        })
        entities = mapping.map({'a': 'aaa', 'b': 'bbb'})
        ent0 = entities.get('test')
        assert ent0.id == sha1(b'aaabbb').hexdigest(), ent0
Example #2
def load_query():
    try:
        query = request.json.get("mapping_query", {})
        # just for validation
        model.make_mapping({"entities": query})
    except Exception as ex:
        log.exception("Validation error: %s", request.json)
        raise BadRequest(str(ex))
    return query
Example #3
def mapping(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    queue_task(collection, OP_BULKLOAD, payload=data)
    return ('', 202)
Example #4
def mapping_process(id):
    collection = get_db_collection(id, request.authz.WRITE)
    require(request.authz.is_admin)
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in dict_list(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
            bulk_load_query.apply_async([collection.id, query], priority=6)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    return ('', 204)
Example #5
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    entities = {}
    total = 0
    for idx, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity_id = entity.get('id')
            if entity_id is None:
                continue
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            base = entities.get(entity_id, {})
            entities[entity_id] = merge_data(entity, base)
            total += 1

        if idx % 1000 == 0:
            log.info("[%s] Loaded %s records, %s entities...",
                     collection.foreign_id, idx, total)

        if len(entities) >= BULK_PAGE:
            index_bulk(collection, entities, chunk_size=BULK_PAGE)
            entities = {}

    if len(entities):
        index_bulk(collection, entities, chunk_size=BULK_PAGE)

    # Update collection stats
    index_collection(collection)
Example #6
def map_to_aggregator(collection, mapping, aggregator):
    table = get_entity(mapping.table_id)
    if table is None:
        table = aggregator.get(mapping.table_id)
    if table is None:
        raise RuntimeError("Table cannot be found: %s" % mapping.table_id)
    config = {"csv_url": _get_table_csv_link(table), "entities": mapping.query}
    mapper = model.make_mapping(config, key_prefix=collection.foreign_id)
    origin = mapping_origin(mapping.id)
    aggregator.delete(origin=origin)
    writer = aggregator.bulk()
    idx = 0
    for idx, record in enumerate(mapper.source.records, 1):
        if idx > 0 and idx % 1000 == 0:
            log.info("[%s] Mapped %s rows ...", mapping.id, idx)
        for entity in mapper.map(record).values():
            entity.context = mapping.get_proxy_context()
            if entity.schema.is_a("Thing"):
                entity.add("proof", mapping.table_id)
            entity = collection.ns.apply(entity)
            entity = remove_checksums(entity)
            writer.put(entity, fragment=idx, origin=origin)
            if mapping.entityset is not None:
                EntitySetItem.save(
                    mapping.entityset,
                    entity.id,
                    collection_id=collection.id,
                    added_by_id=mapping.role_id,
                )
    writer.flush()
    log.info("[%s] Mapping done (%s rows)", mapping.id, idx)
Example #7
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1

        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, records_index, records_total,
                     entities_count)

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    index.index_bulk(collection.id, entities)
    # Update collection stats
    index_collection(collection)
Example #8
def stream_mapping(infile: Path,
                   outfile: Path,
                   mapping_yaml: Path,
                   sign: bool = True) -> None:
    queries: List[Tuple[str, QueryMapping]] = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, "queries", "query"):
            data.pop("database", None)
            data["csv_url"] = "/dev/null"
            query = model.make_mapping(data, key_prefix=dataset)
            queries.append((dataset, query))

    try:
        with path_writer(outfile) as outfh:
            with input_file(infile) as fh:
                for record in CSVSource.read_csv(fh):
                    for (dataset, query) in queries:
                        ns = Namespace(dataset)
                        if query.source.check_filters(record):  # type: ignore
                            entities = query.map(record)
                            for entity in entities.values():
                                if sign:
                                    entity = ns.apply(entity)
                                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
Example #9
def bulk_load_query(queue, collection, query_id, query):
    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source)
    if records_total:
        queue.progress.mark_pending(records_total)
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entities_count = 0
    for idx, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            entities_count += 1
            fragment = '%s-%s' % (query_id, idx)
            writer.put(entity, fragment=fragment)

        if idx > 0 and idx % 1000 == 0:
            queue.progress.mark_finished(1000)
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, idx, records_total or 'streaming',
                     entities_count)
    writer.flush()
    aggregator.close()
    log.info("[%s] Query done (%s entities)", collection.foreign_id,
             entities_count)
Example #10
def map_to_aggregator(collection, mapping, aggregator):
    table = get_entity(mapping.table_id)
    if table is None:
        table = aggregator.get(mapping.table_id)
    if table is None:
        raise RuntimeError("Table cannot be found: %s" % mapping.table_id)
    config = {
        'csv_url': _get_table_csv_link(table),
        'entities': mapping.query
    }
    mapper = model.make_mapping(config, key_prefix=collection.foreign_id)
    origin = mapping_origin(mapping.id)
    aggregator.delete(origin=origin)
    writer = aggregator.bulk()
    for idx, record in enumerate(mapper.source.records, 1):
        if idx > 0 and idx % 1000 == 0:
            log.info("[%s] Mapped %s rows ...", mapping.id, idx)
        for entity in mapper.map(record).values():
            entity.context = mapping.get_proxy_context()
            entity.context['mutable'] = True
            if entity.schema.is_a('Thing'):
                entity.add('proof', mapping.table_id)
            entity = collection.ns.apply(entity)
            entity = remove_checksums(entity)
            writer.put(entity, fragment=idx, origin=origin)
    writer.flush()
    log.info("[%s] Mapping done (%s rows)", mapping.id, idx)
Example #11
def mapping_process(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    # TODO: we need to look into possible abuse of mapping load path for local
    # path access on the machine running the mapping. Until then, this action
    # must be restricted to admins:
    require(request.authz.is_admin)
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
            bulk_load_query.apply_async([collection.id, query], priority=6)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    return ('', 204)
Example #12
def mapping_process(id):
    collection = get_db_collection(id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    # TODO: we need to look into possible abuse of mapping load path for local
    # path access on the machine running the mapping. Until then, this action
    # must be restricted to admins:
    require(request.authz.is_admin)
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
            bulk_load_query.apply_async([collection.id, query], priority=6)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    return ('', 204)
Example #13
    def test_multiple_keys(self):
        mapping = model.make_mapping(
            {
                "csv_url": "http://pets.com",
                "entities": {"test": {"schema": "Person", "key": ["b", "a"]}},
            }
        )
        entities = mapping.map({"a": "aaa", "b": "bbb"})
        ent0 = entities.get("test")
        assert ent0.id == sha1(b"aaabbb").hexdigest(), ent0

        mapping = model.make_mapping(
            {
                "csv_url": "http://pets.com",
                "entities": {"test": {"schema": "Person", "key": ["a", "b"]}},
            }
        )
        entities = mapping.map({"a": "aaa", "b": "bbb"})
        ent0 = entities.get("test")
        assert ent0.id == sha1(b"aaabbb").hexdigest(), ent0
Example #14
    def test_key_column_from_sql(self):
        mapping = self.kek_mapping
        del mapping["entities"]["company"]["keys"]
        mapping["entities"]["company"]["id_column"] = "comp.id"

        mapped = model.make_mapping(mapping)
        assert len(mapped.source) == 2904, len(mapped.source)
        assert len(mapped.entities) == 3, mapped.entities
        assert len(mapped.refs) == 7, mapped.refs
        entities = list(model.map_entities(mapping))
        self.assertGreaterEqual(int(entities[0].id), 3000)  # FIXME?
Example #15
    def test_key_column_from_sql(self):
        mapping = self.kek_mapping
        del mapping['entities']['company']['keys']
        mapping['entities']['company']['id_column'] = 'comp.id'

        mapped = model.make_mapping(mapping)
        assert len(mapped.source) == 2904, len(mapped.source)
        assert len(mapped.entities) == 3, mapped.entities
        assert len(mapped.refs) == 7, mapped.refs
        entities = list(model.map_entities(mapping))
        self.assertGreaterEqual(int(entities[0].id), 3000)  # FIXME?
Example #16
def make_mapper(collection, mapping):
    table = get_entity(mapping.table_id)
    properties = table.get('properties', {})
    csv_hash = first(properties.get('csvHash'))
    if csv_hash is None:
        raise RuntimeError("Source table doesn't have a CSV version")
    url = archive.generate_url(csv_hash)
    if not url:
        local_path = archive.load_file(csv_hash)
        if local_path is not None:
            url = local_path.as_posix()
    if url is None:
        raise RuntimeError("Could not generate CSV URL for the table")
    data = {'csv_url': url, 'entities': mapping.query}
    return model.make_mapping(data, key_prefix=collection.foreign_id)
Example #17
    def test_key_literal(self):
        mapping = model.make_mapping({
            "csv_url": 'http://pets.com',
            "entities": {
                "test": {
                    "schema": "Person",
                    "key_literal": "test",
                    "key": ['a', 'b']
                }
            }
        })
        entities = mapping.map({})
        assert len(entities) == 0, entities.keys()
        entities = mapping.map({'a': 'aaa', 'b': 'bbb'})
        ent0 = entities.get('test')
        assert ent0.id == sha1(b'testaaabbb').hexdigest(), ent0
Example #18
    def test_key_generation(self):
        mapping = model.make_mapping(
            {
                "csv_url": "http://pets.com",
                "entities": {"test": {"schema": "Person", "key": "id"}},
            }
        )
        for ent in mapping.entities:
            seed = ent.seed.hexdigest()
            assert seed == sha1(b"").hexdigest(), seed

        entities = mapping.map({})
        assert len(entities) == 0, entities.keys()
        entities = mapping.map({"id": "foo"})
        assert len(entities) == 1, entities.keys()
        ent0 = entities.get("test")
        assert ent0.id == sha1(b"foo").hexdigest(), ent0
Example #19
def stream_mapping(infile, outfile, mapping_yaml):
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, 'queries', 'query'):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append(source)

    try:
        for record in StreamSource.read_csv(infile):
            for source in sources:
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
Example #20
    def test_kek_map_single(self):
        mapping = model.make_mapping(self.kek_mapping)
        assert len(mapping.entities) == 3, mapping.entities
        assert len(mapping.refs) == 7, mapping.refs
        record = {
            'comp.id': 4,
            'sub.id': '7.4',
            'comp.name': 'Pets.com Ltd',
            'shares.share': '40%',
            'comp.url': 'https://pets.com',
            'sub.name': 'DogFood Sales Corp.',
            'comp.address': '10 Broadstreet, 20388 Washington, DC'
        }
        entities = mapping.map(record)
        assert len(entities) == 3, entities.keys()
        company = entities.get('company')
        assert company['id'], company
        assert record['comp.name'] in company['properties']['name'], company
Example #21
    def test_key_literal(self):
        mapping = model.make_mapping(
            {
                "csv_url": "http://pets.com",
                "entities": {
                    "test": {
                        "schema": "Person",
                        "key_literal": "test",
                        "key": ["a", "b"],
                    }
                },
            }
        )
        entities = mapping.map({})
        assert len(entities) == 0, entities.keys()
        entities = mapping.map({"a": "aaa", "b": "bbb"})
        ent0 = entities.get("test")
        assert ent0.id == sha1(b"testaaabbb").hexdigest(), ent0
Example #22
    def test_kek_map_single(self):
        mapping = model.make_mapping(self.kek_mapping)
        assert len(mapping.source) == 2904, len(mapping.source)
        assert len(mapping.entities) == 3, mapping.entities
        assert len(mapping.refs) == 7, mapping.refs
        record = {
            "comp.id": 4,
            "sub.id": "7.4",
            "comp.name": "Pets.com Ltd",
            "shares.share": "40%",
            "comp.url": "https://pets.com",
            "sub.name": "DogFood Sales Corp.",
            "comp.address": "10 Broadstreet, 20388 Washington, DC",
        }
        entities = mapping.map(record)
        assert len(entities) == 3, entities.keys()
        company = entities.get("company")
        assert company.id, company
        assert record["comp.name"] in company.get("name"), company
Example #23
    def test_key_generation(self):
        mapping = model.make_mapping({
            "csv_url": 'http://pets.com',
            "entities": {
                "test": {
                    "schema": "Person",
                    "key": "id"
                }
            }
        })
        for ent in mapping.entities:
            seed = ent.seed.hexdigest()
            assert seed == sha1(b'').hexdigest(), seed

        entities = mapping.map({})
        assert len(entities) == 0, entities.keys()
        entities = mapping.map({'id': 'foo'})
        assert len(entities) == 1, entities.keys()
        ent0 = entities.get('test')
        assert ent0.id == sha1(b'foo').hexdigest(), ent0
Example #24
def stream_mapping(infile, outfile, mapping_yaml, sign=True):
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, "queries", "query"):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append((dataset, source))

    try:
        for record in StreamSource.read_csv(infile):
            for (dataset, source) in sources:
                ns = Namespace(dataset)
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        if sign:
                            entity = ns.apply(entity)
                        write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
Example #25
def stream_mapping(mapping_yaml):
    stdin = click.get_text_stream('stdin')
    stdout = click.get_text_stream('stdout')

    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, 'queries', 'query'):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append(source)

    try:
        for record in StreamSource.read_csv(stdin):
            for source in sources:
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        write_object(stdout, entity)
    except BrokenPipeError:
        raise click.Abort()
Example #26
File: bulk.py Project: pudo/aleph
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1

        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id,
                     records_index,
                     records_total,
                     entities_count)

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    index.index_bulk(collection.id, entities)
    refresh_collection(collection)
Example #27
def make_mapper(collection, mapping):
    url = get_table_csv_link(mapping.table_id)
    data = {'csv_url': url, 'entities': mapping.query}
    return model.make_mapping(data, key_prefix=collection.foreign_id)
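Summary sketch
Taken together, the examples above show the general shape of the configuration dict that model.make_mapping() consumes: a "csv_url" pointing at a tabular source and an "entities" section mapping entity names to a schema and key columns. The snippet below is a minimal, hedged sketch based only on those examples; the followthemoney import path, the dataset name, and the column values are illustrative assumptions rather than code from any of the listed projects.
# Minimal sketch inferred from the examples above; names and values are
# illustrative assumptions, not taken from any of the listed projects.
from followthemoney import model

config = {
    "csv_url": "http://pets.com",      # tabular source; "/dev/null" is used above when streaming records
    "entities": {
        "person": {
            "schema": "Person",        # FollowTheMoney schema name
            "key": ["a", "b"],         # source columns hashed into the entity id
        }
    },
}

# key_prefix namespaces the generated ids, as collection.foreign_id does in the examples
mapping = model.make_mapping(config, key_prefix="example-dataset")

entities = mapping.map({"a": "aaa", "b": "bbb"})
assert entities["person"].id is not None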