Example #1
0
    def __init__(self, model, query, name, data, key_prefix=None):
        """Build an entity mapping from its configuration *data*.

        Raises InvalidMapping when no keys are configured, the schema is
        unknown, or a mapped property is not part of the schema.
        """
        self.model = model
        self.name = name
        self.data = data

        # Seed for deterministic entity IDs: key prefix plus optional literal.
        self.seed = sha1(key_bytes(key_prefix))
        self.seed.update(key_bytes(data.get('key_literal')))

        self.keys = keys_values(data, 'key', 'keys')
        if not self.keys:
            raise InvalidMapping("No keys: %r" % name)

        self.schema = model.get(data.get('schema'))
        if self.schema is None:
            raise InvalidMapping("Invalid schema: %s" % data.get('schema'))

        # Column references and inter-entity dependencies from the properties.
        self.refs = set(self.keys)
        self.dependencies = set()
        self.properties = []
        # NOTE: loop variable renamed so it no longer shadows the `name` arg.
        for prop_name, prop_data in data.get('properties', {}).items():
            prop = self.schema.get(prop_name)
            if prop is None:
                raise InvalidMapping("Invalid property: %s" % prop_name)
            mapping = PropertyMapping(query, prop_data, prop)
            self.properties.append(mapping)
            self.refs.update(mapping.refs)
            if mapping.entity:
                self.dependencies.add(mapping.entity)
Example #2
0
    def __init__(self, model, query, name, data, key_prefix=None):
        """Build an entity mapping from its configuration *data*.

        Exactly one of `key`/`keys` or `id_column` must be configured.
        Raises InvalidMapping for missing/conflicting keys, an unknown
        schema, or a property not defined on the schema.
        """
        self.model = model
        self.name = name
        self.data = data

        # Seed for deterministic entity IDs: key prefix plus optional literal.
        self.seed = sha1(key_bytes(key_prefix))
        self.seed.update(key_bytes(data.get("key_literal")))

        self.keys = keys_values(data, "key", "keys")
        self.id_column = data.get("id_column")
        if not self.keys and self.id_column is None:
            raise InvalidMapping("No keys or ID: %r" % name)
        if self.keys and self.id_column is not None:
            msg = "Please use only keys or id_column, not both: %r" % name
            raise InvalidMapping(msg)

        self.schema = model.get(data.get("schema"))
        if self.schema is None:
            raise InvalidMapping("Invalid schema: %s" % data.get("schema"))

        self.refs = set(self.keys)
        if self.id_column:
            self.refs.add(self.id_column)
        self.dependencies = set()
        self.properties = []
        # NOTE: loop variable renamed so it no longer shadows the `name` arg.
        for prop_name, prop_data in data.get("properties", {}).items():
            prop = self.schema.get(prop_name)
            if prop is None:
                raise InvalidMapping("Invalid property: %s" % prop_name)
            mapping = PropertyMapping(query, prop_data, prop)
            self.properties.append(mapping)
            self.refs.update(mapping.refs)
            if mapping.entity:
                self.dependencies.add(mapping.entity)
Example #3
0
def stream_mapping(infile: Path,
                   outfile: Path,
                   mapping_yaml: Path,
                   sign: bool = True) -> None:
    """Stream CSV records from *infile* through every query in *mapping_yaml*.

    Mapped entities are optionally signed into their dataset namespace and
    written to *outfile*. Raises click.Abort when the output pipe closes.
    """
    # Pair each query with its dataset Namespace up front: constructing the
    # Namespace per record (as before) repeated invariant work in the hot loop.
    queries: List[Tuple[Namespace, QueryMapping]] = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, "queries", "query"):
            data.pop("database", None)
            data["csv_url"] = "/dev/null"
            query = model.make_mapping(data, key_prefix=dataset)
            queries.append((Namespace(dataset), query))

    try:
        with path_writer(outfile) as outfh:
            with input_file(infile) as fh:
                for record in CSVSource.read_csv(fh):
                    for ns, query in queries:
                        if query.source.check_filters(record):  # type: ignore
                            entities = query.map(record)
                            for entity in entities.values():
                                if sign:
                                    entity = ns.apply(entity)
                                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
Example #4
0
    def __init__(self, query, data):
        """Collect the CSV source URLs, expanding environment variables."""
        super(CSVSource, self).__init__(query, data)
        raw_urls = keys_values(data, 'csv_url', 'csv_urls')
        self.urls = {os.path.expandvars(raw) for raw in raw_urls}
        if not self.urls:
            raise InvalidMapping("No CSV URLs are specified.")
Example #5
0
    def __init__(self, query: "QueryMapping", data: Dict[str, Any]) -> None:
        """Collect CSV URLs (env vars expanded) and pre-parse row filters."""
        super().__init__(query, data)
        raw_urls = keys_values(data, "csv_url", "csv_urls")
        self.urls: Set[str] = {cast(str, os.path.expandvars(raw))
                               for raw in raw_urls}

        if not self.urls:
            raise InvalidMapping("No CSV URLs are specified.")

        # Parse inclusion and exclusion filters once, at construction time.
        self.filters_set = self._parse_filters(self.filters)
        self.filters_not_set = self._parse_filters(self.filters_not)
Example #6
0
def bulk_load(queue, collection, config):
    """Bulk load entities from a CSV file or SQL database.

    Rows from the configured sources are mapped into entities and links
    that the entity index can ingest; indexing is queued at the end.
    """
    for query in keys_values(config, 'queries', 'query'):
        bulk_load_query(queue, collection, hash_data(query), query)
    queue_task(collection, OP_INDEX)
    queue.remove()
Example #7
0
def run_mapping(mapping_yaml):
    """Map entities for every dataset in *mapping_yaml* onto stdout."""
    stream = click.get_text_stream('stdout')
    config = load_config_file(mapping_yaml)
    try:
        for dataset, meta in config.items():
            queries = keys_values(meta, 'queries', 'query')
            for query in queries:
                for entity in model.map_entities(query, key_prefix=dataset):
                    write_object(stream, entity)
    except BrokenPipeError:
        # Downstream pipe closed; nothing left to do.
        pass
Example #8
0
def run_mapping(outfile, mapping_yaml):
    """Generate entities for every query in *mapping_yaml* into *outfile*."""
    config = load_mapping_file(mapping_yaml)
    try:
        for dataset, meta in config.items():
            queries = keys_values(meta, 'queries', 'query')
            for query in queries:
                for entity in model.map_entities(query, key_prefix=dataset):
                    write_object(outfile, entity)
    except BrokenPipeError:
        # Downstream pipe closed: abort quietly.
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
Example #9
0
def run_mapping(mapping_yaml):
    """Map entities for every dataset in *mapping_yaml*, emitting on stdout."""
    stream = click.get_text_stream('stdout')
    config = load_mapping_file(mapping_yaml)
    try:
        for dataset, meta in config.items():
            queries = keys_values(meta, 'queries', 'query')
            for query in queries:
                for entity in model.map_entities(query, key_prefix=dataset):
                    # NOTE(review): `read_entity` on an output stream looks
                    # odd — presumably a serialisation helper; confirm.
                    read_entity(stream, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
Example #10
0
    def __init__(self, query, data):
        """Configure the SQL source: engine, metadata, tables and joins.

        Raises InvalidMapping when no database URI is configured (previously
        this crashed with a TypeError from expandvars(None)).
        """
        super(SQLSource, self).__init__(query, data)
        database = data.get("database")
        if database is None:
            raise InvalidMapping("No database in SQL mapping!")
        self.database_uri = os.path.expandvars(database)
        kwargs = {}
        if self.database_uri.lower().startswith("postgres"):
            # Stream results instead of loading them fully into memory.
            kwargs["server_side_cursors"] = True
        self.engine = create_engine(self.database_uri, poolclass=NullPool, **kwargs)
        self.meta = MetaData()
        self.meta.bind = self.engine

        tables = keys_values(data, "table", "tables")
        self.tables = [QueryTable(self, f) for f in tables]
        self.joins = ensure_list(data.get("joins"))
Example #11
0
    def __init__(self, query, data, prop):
        """Initialise the property mapping from its configuration *data*."""
        self.query = query
        data = deepcopy(data)
        self.data = data
        self.prop = prop
        self.name = prop.name
        self.type = prop.type

        self.refs = keys_values(data, 'column', 'columns')
        self.literals = keys_values(data, 'literal', 'literals')
        self.required = data.pop('required', False)
        self.entity = data.pop('entity', None)
        self.split = data.pop('split', None)
        self.join = data.pop('join', None)

        self.replacements = {}
        self.template = stringify(data.pop('template', None))
        if self.template is not None:
            # Hacky: pull extra column refs out of "{{name}}" placeholders.
            for placeholder in self.FORMAT_PATTERN.findall(self.template):
                self.refs.append(placeholder)
                self.replacements['{{%s}}' % placeholder] = placeholder
Example #12
0
    def __init__(self, query, data, prop):
        """Attach a property mapping to its parent query and schema property."""
        self.query = query
        self.data = data = deepcopy(data)
        self.prop = prop
        self.name = prop.name
        self.type = prop.type

        self.refs = keys_values(data, "column", "columns")
        self.literals = keys_values(data, "literal", "literals")
        self.join = data.pop("join", None)
        self.split = data.pop("split", None)
        self.entity = data.pop("entity", None)
        self.required = data.pop("required", False)

        self.replacements = {}
        self.template = sanitize_text(data.pop("template", None))
        if self.template is None:
            return
        # Hack: extract extra column refs from "{{name}}" template markers.
        for marker in self.FORMAT_PATTERN.findall(self.template):
            self.refs.append(marker)
            self.replacements["{{%s}}" % marker] = marker
Example #13
0
def mapping(collection_id):
    """Validate a posted bulk-load mapping and queue it for processing."""
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    queries = keys_values(data, 'queries', 'query')
    for query in queries:
        try:
            model.make_mapping(query)
        except InvalidMapping as invalid:
            # Surface mapping errors to the API caller as a 400.
            raise BadRequest(invalid)
    queue_task(collection, OP_BULKLOAD, payload=data)
    return ('', 202)
Example #14
0
    def __init__(self, query: "QueryMapping", data: Dict[str, Any],
                 prop: Property) -> None:
        """Initialise the property mapping from its configuration *data*."""
        self.query = query
        data = deepcopy(data)
        self.prop = prop

        self.refs = cast(List[str], keys_values(data, "column", "columns"))
        self.literals = cast(List[str],
                             keys_values(data, "literal", "literals"))
        self.join = cast(Optional[str], data.pop("join", None))
        self.split = cast(Optional[str], data.pop("split", None))
        self.entity = stringify(data.pop("entity", None))
        self.format = stringify(data.pop("format", None))
        self.fuzzy = as_bool(data.pop("fuzzy", False))
        self.required = as_bool(data.pop("required", False))

        self.template = sanitize_text(data.pop("template", None))
        self.replacements: Dict[str, str] = {}
        if self.template is None:
            return
        # Hacky: derive extra column refs from "{{ref}}" template placeholders.
        for ref in self.FORMAT_PATTERN.findall(self.template):
            self.refs.append(ref)
            self.replacements["{{%s}}" % ref] = ref
Example #15
0
def bulk_load(config):
    """Bulk load entities from a CSV file or SQL database.

    Rows from each configured source are mapped into entities and links
    that the entity index can understand.
    """
    from aleph.logic.collections import create_collection
    for foreign_id, data in config.items():
        data['foreign_id'] = foreign_id
        data.setdefault('label', foreign_id)
        collection = create_collection(data)
        collection_id = collection.get('id')
        # FIXME: this does not perform collection metadata validation.
        queries = keys_values(data, 'queries', 'query')
        for query in queries:
            bulk_load_query.apply_async([collection_id, query], priority=6)
Example #16
0
def run_mapping(outfile, mapping_yaml, sign=True):
    """Write mapped entities to *outfile*, signing IDs per dataset if *sign*."""
    config = load_mapping_file(mapping_yaml)
    try:
        for dataset, meta in config.items():
            namespace = Namespace(dataset)
            for query in keys_values(meta, "queries", "query"):
                for entity in model.map_entities(query, key_prefix=dataset):
                    signed = namespace.apply(entity) if sign else entity
                    write_object(outfile, signed)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
Example #17
0
File: bulk.py  Project: pudo/aleph
def bulk_load(config):
    """Bulk load entities from a CSV file or SQL database.

    The rows of every configured source get mapped to entities and links
    which the entity index understands.
    """
    from aleph.logic.collections import create_collection
    for foreign_id, data in config.items():
        data['foreign_id'] = foreign_id
        if 'label' not in data:
            data['label'] = foreign_id
        collection = create_collection(data)
        # FIXME: this does not perform collection metadata validation.
        for query in keys_values(data, 'queries', 'query'):
            bulk_load_query.apply_async(
                [collection.get('id'), query], priority=6)
Example #18
0
def run_mapping(outfile: Path, mapping_yaml: Path, sign: bool = True) -> None:
    """Map entities from *mapping_yaml* and write them to *outfile*."""
    config = load_mapping_file(mapping_yaml)
    try:
        with path_writer(outfile) as outfh:
            for dataset, meta in config.items():
                namespace = Namespace(dataset)
                for query in keys_values(meta, "queries", "query"):
                    for entity in model.map_entities(query,
                                                     key_prefix=dataset):
                        signed = namespace.apply(entity) if sign else entity
                        write_entity(outfh, signed)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
Example #19
0
def mapping_process(id):
    """Validate each posted query mapping and queue its bulk load."""
    collection = get_db_collection(id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    # TODO: we need to look into possible abuse of mapping load path for local
    # path access on the machine running the mapping. Until then, this action
    # must be restricted to admins:
    require(request.authz.is_admin)
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    queries = keys_values(data, 'queries', 'query')
    for query in queries:
        try:
            model.make_mapping(query)
            bulk_load_query.apply_async([collection.id, query], priority=6)
        except InvalidMapping as err:
            raise BadRequest(err)
    return ('', 204)
Example #20
0
def mapping_process(collection_id):
    """Validate posted query mappings and queue their bulk loads."""
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    # TODO: we need to look into possible abuse of mapping load path for local
    # path access on the machine running the mapping. Until then, this action
    # must be restricted to admins:
    require(request.authz.is_admin)
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
            bulk_load_query.apply_async([collection.id, query], priority=6)
        except InvalidMapping as err:
            # Report the broken mapping back to the caller.
            raise BadRequest(err)
    return ('', 204)
Example #21
0
    def __init__(self, query: "QueryMapping", data: Dict[str, Any]) -> None:
        """Set up the SQL engine, metadata, tables and joins for this source."""
        super(SQLSource, self).__init__(query, data)
        database = data.get("database")
        if database is None:
            raise InvalidMapping("No database in SQL mapping!")
        self.database_uri = cast(str, os.path.expandvars(database))
        # Postgres can stream results via server-side cursors.
        engine_kwargs = ({"server_side_cursors": True}
                         if self.database_uri.lower().startswith("postgres")
                         else {})
        self.engine = create_engine(self.database_uri,
                                    poolclass=NullPool,
                                    **engine_kwargs)  # type: ignore
        self.meta = MetaData()
        self.meta.bind = self.engine

        self.tables = [QueryTable(self.meta, table_ref)
                       for table_ref in keys_values(data, "table", "tables")]
        self.joins = cast(List[Dict[str, str]], ensure_list(data.get("joins")))
Example #22
0
def stream_mapping(infile, outfile, mapping_yaml):
    """Stream CSV records from *infile* through every mapping in *mapping_yaml*."""
    config = load_mapping_file(mapping_yaml)
    sources = []
    for dataset, meta in config.items():
        for query_data in keys_values(meta, 'queries', 'query'):
            mapping = model.make_mapping(query_data, key_prefix=dataset)
            sources.append(StreamSource(mapping, query_data))

    try:
        for record in StreamSource.read_csv(infile):
            for src in sources:
                if not src.check_filters(record):
                    continue
                for entity in src.query.map(record).values():
                    write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
Example #23
0
    def __init__(
        self,
        model: "Model",
        query: "QueryMapping",
        name: str,
        data: Dict[str, Any],
        key_prefix: Optional[str] = None,
    ) -> None:
        """Construct the mapping for one entity from its YAML configuration."""
        self.model = model
        self.name = name

        # ID seed: hash of the key prefix, extended by an optional literal.
        self.seed = sha1(key_bytes(key_prefix))
        self.seed.update(key_bytes(data.get("key_literal")))

        self.keys = keys_values(data, "key", "keys")
        self.id_column = stringify(data.get("id_column"))
        has_keys = len(self.keys) > 0
        if not has_keys and self.id_column is None:
            raise InvalidMapping("No keys or ID: %r" % name)
        if has_keys and self.id_column is not None:
            msg = "Please use only keys or id_column, not both: %r" % name
            raise InvalidMapping(msg)

        schema_name = stringify(data.get("schema"))
        if schema_name is None:
            raise InvalidMapping("No schema: %s" % name)
        schema = model.get(schema_name)
        if schema is None:
            raise InvalidMapping("Invalid schema: %s" % schema_name)
        self.schema = schema

        self.refs = set(self.keys)
        if self.id_column:
            self.refs.add(self.id_column)
        self.dependencies: Set[str] = set()
        self.properties: List[PropertyMapping] = []
        for name, prop_data in data.get("properties", {}).items():
            prop = self.schema.get(name)
            if prop is None:
                raise InvalidMapping("Invalid property: %s" % name)
            prop_mapping = PropertyMapping(query, prop_data, prop)
            self.properties.append(prop_mapping)
            self.refs.update(prop_mapping.refs)
            if prop_mapping.entity:
                self.dependencies.add(prop_mapping.entity)
Example #24
0
def stream_mapping(infile, outfile, mapping_yaml, sign=True):
    """Stream CSV records from *infile* through the mappings in *mapping_yaml*.

    Mapped entities are optionally signed into their dataset namespace and
    written to *outfile*. Raises click.Abort when the output pipe closes.
    """
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        # Build the Namespace once per dataset instead of once per record,
        # which previously redid invariant work inside the hot loop.
        ns = Namespace(dataset)
        for data in keys_values(meta, "queries", "query"):
            query = model.make_mapping(data, key_prefix=dataset)
            sources.append((ns, StreamSource(query, data)))

    try:
        for record in StreamSource.read_csv(infile):
            for ns, source in sources:
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        if sign:
                            entity = ns.apply(entity)
                        write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
Example #25
0
def stream_mapping(mapping_yaml):
    """Read CSV records from stdin, map them, and emit entities on stdout."""
    stdin = click.get_text_stream('stdin')
    stdout = click.get_text_stream('stdout')

    config = load_mapping_file(mapping_yaml)
    sources = []
    for dataset, meta in config.items():
        for query_data in keys_values(meta, 'queries', 'query'):
            mapping = model.make_mapping(query_data, key_prefix=dataset)
            sources.append(StreamSource(mapping, query_data))

    try:
        for record in StreamSource.read_csv(stdin):
            for src in sources:
                if not src.check_filters(record):
                    continue
                for entity in src.query.map(record).values():
                    read_entity(stdout, entity)
    except BrokenPipeError:
        raise click.Abort()