def __init__(self, model, query, name, data, key_prefix=None):
    """Set up the mapping of one named entity from its configuration.

    The configuration must identify the entity either through key columns
    (``key`` / ``keys``) or through an ``id_column``, but not both. The
    schema is resolved through ``model`` and a ``PropertyMapping`` is built
    for each configured property; the column references and the names of
    entities referenced by those properties are collected on the way.

    Raises:
        InvalidMapping: if neither keys nor an ID column are configured, if
            both are, if the schema cannot be resolved, or if a configured
            property is not part of the schema.
    """
    self.model = model
    self.name = name
    self.data = data

    # The seed hash is fed with the key prefix first, then the key literal.
    self.seed = sha1(key_bytes(key_prefix))
    self.seed.update(key_bytes(data.get("key_literal")))

    self.keys = keys_values(data, "key", "keys")
    self.id_column = data.get("id_column")
    has_keys = len(self.keys) > 0
    has_id_column = self.id_column is not None
    if not (has_keys or has_id_column):
        raise InvalidMapping("No keys or ID: %r" % name)
    if has_keys and has_id_column:
        raise InvalidMapping(
            "Please use only keys or id_column, not both: %r" % name
        )

    self.schema = model.get(data.get("schema"))
    if self.schema is None:
        raise InvalidMapping("Invalid schema: %s" % data.get("schema"))

    # Every column this entity reads: its keys, plus the ID column if set.
    self.refs = set(self.keys)
    if self.id_column:
        self.refs.add(self.id_column)

    self.dependencies = set()
    self.properties = []
    configured = data.get("properties", {})
    for prop_name, prop_config in configured.items():
        prop = self.schema.get(prop_name)
        if prop is None:
            raise InvalidMapping("Invalid property: %s" % prop_name)
        prop_mapping = PropertyMapping(query, prop_config, prop)
        self.properties.append(prop_mapping)
        self.refs.update(prop_mapping.refs)
        # A property that points at another entity makes it a dependency.
        if prop_mapping.entity:
            self.dependencies.add(prop_mapping.entity)
def __init__(self, model, query, name, data, key_prefix=None):
    """Set up the mapping of one named entity from its configuration.

    Collects the key columns from ``key`` and ``keys``, resolves the schema
    through ``model`` and builds a ``PropertyMapping`` for each configured
    property, gathering the column references and the names of entities
    referenced by those properties.

    Raises:
        InvalidMapping: if no keys are configured, if the schema cannot be
            resolved, or if a configured property is not part of the schema.
    """
    self.model = model
    self.name = name
    self.data = data

    # The seed hash is fed with the key prefix first, then the key literal.
    self.seed = sha1(key_bytes(key_prefix))
    self.seed.update(key_bytes(data.get('key_literal')))

    # Copy before extending: ensure_list() may hand back the very list
    # stored in ``data``, and extending that in place would silently
    # change the caller's mapping configuration.
    self.keys = list(ensure_list(data.get('key')))
    self.keys.extend(ensure_list(data.get('keys')))
    if not self.keys:
        raise InvalidMapping("No keys: %r" % name)

    self.schema = model.get(data.get('schema'))
    if self.schema is None:
        raise InvalidMapping("Invalid schema: %s" % data.get('schema'))

    self.refs = set(self.keys)
    self.dependencies = set()
    self.properties = []
    # Distinct loop names keep the ``name`` argument intact.
    for prop_name, prop_config in data.get('properties', {}).items():
        prop_schema = self.schema.get(prop_name)
        if prop_schema is None:
            raise InvalidMapping("Invalid property: %s" % prop_name)
        prop = PropertyMapping(query, prop_config, prop_schema)
        self.properties.append(prop)
        self.refs.update(prop.refs)
        # A property that points at another entity makes it a dependency.
        if prop.entity:
            self.dependencies.add(prop.entity)
def get_source(mapping):
    """Pick the source implementation matching the mapping's configuration.

    A ``database`` entry selects ``SQLSource``; otherwise a ``csv_url`` or
    ``csv_urls`` entry selects ``CSVSource``.

    Raises:
        InvalidMapping: if none of these keys is present.
    """
    config = mapping.data
    if 'database' in config:
        return SQLSource(mapping, config)
    if any(key in config for key in ('csv_url', 'csv_urls')):
        return CSVSource(mapping, config)
    raise InvalidMapping("Cannot determine mapping type")
def __init__(self, query, data):
    """Collect the CSV URLs configured under ``csv_url`` / ``csv_urls``.

    Environment variables in each URL are expanded and duplicates collapse
    because the URLs are kept in a set.

    Raises:
        InvalidMapping: if no URL is configured.
    """
    super(CSVSource, self).__init__(query, data)
    configured = keys_values(data, 'csv_url', 'csv_urls')
    self.urls = {os.path.expandvars(raw_url) for raw_url in configured}
    if not self.urls:
        raise InvalidMapping("No CSV URLs are specified.")
def bind(self):
    """Check that the referenced entity exists and fits this property.

    Does nothing when this property does not reference an entity. Otherwise
    the entity mapping with the referenced name is looked up in the query
    and its schema must be compatible with the range of this property.

    Raises:
        InvalidMapping: if the referenced entity has an incompatible
            schema, or if the query has no entity with that name.
    """
    if self.entity is None:
        return

    # Figure out if the schema types of the referenced entities
    # are of a type compatible with the range of this property.
    # For example, an asset can be owned by a legal entity, but
    # not by a bank payment or a ship.
    for entity in self.query.entities:
        if entity.name != self.entity:
            continue
        if not entity.schema.is_a(self.schema.range):
            # Report the offending entity, not the property name, so the
            # "entity" label agrees with the message raised further down.
            raise InvalidMapping(
                "The entity [%s] must be a %s (not %s)"
                % (entity.name, self.schema.range, entity.schema.name)
            )
        return

    raise InvalidMapping(
        "No entity [%s] for property [%s]" % (self.entity, self.name)
    )
def __init__(
    self,
    model: "Model",
    query: "QueryMapping",
    name: str,
    data: Dict[str, Any],
    key_prefix: Optional[str] = None,
) -> None:
    """Set up the mapping of one named entity from its configuration.

    The configuration must identify the entity either through key columns
    (``key`` / ``keys``) or through an ``id_column``, but not both. The
    schema named in ``schema`` is resolved through ``model`` and a
    ``PropertyMapping`` is built for each configured property; the column
    references and the names of referenced entities are collected.

    Raises:
        InvalidMapping: if neither keys nor an ID column are configured, if
            both are, if no schema is named or it cannot be resolved, or if
            a configured property is not part of the schema.
    """
    self.model = model
    self.name = name

    # The seed hash is fed with the key prefix first, then the key literal.
    self.seed = sha1(key_bytes(key_prefix))
    self.seed.update(key_bytes(data.get("key_literal")))

    self.keys = keys_values(data, "key", "keys")
    self.id_column = stringify(data.get("id_column"))
    has_keys = len(self.keys) > 0
    has_id_column = self.id_column is not None
    if not (has_keys or has_id_column):
        raise InvalidMapping("No keys or ID: %r" % name)
    if has_keys and has_id_column:
        raise InvalidMapping(
            "Please use only keys or id_column, not both: %r" % name
        )

    schema_name = stringify(data.get("schema"))
    if schema_name is None:
        raise InvalidMapping("No schema: %s" % name)
    resolved_schema = model.get(schema_name)
    if resolved_schema is None:
        raise InvalidMapping("Invalid schema: %s" % schema_name)
    self.schema = resolved_schema

    # Every column this entity reads: its keys, plus the ID column if set.
    self.refs = set(self.keys)
    if self.id_column:
        self.refs.add(self.id_column)

    self.dependencies: Set[str] = set()
    self.properties: List[PropertyMapping] = []
    configured = data.get("properties", {})
    for prop_name, prop_config in configured.items():
        prop = self.schema.get(prop_name)
        if prop is None:
            raise InvalidMapping("Invalid property: %s" % prop_name)
        prop_mapping = PropertyMapping(query, prop_config, prop)
        self.properties.append(prop_mapping)
        self.refs.update(prop_mapping.refs)
        # A property that points at another entity makes it a dependency.
        if prop_mapping.entity:
            self.dependencies.add(prop_mapping.entity)
def __init__(self, query, data):
    """Collect the CSV URLs configured under ``csv_url`` and ``csv_urls``.

    Environment variables in each URL are expanded and duplicates collapse
    because the URLs are kept in a set.

    Raises:
        InvalidMapping: if no URL is configured.
    """
    super(CSVSource, self).__init__(query, data)
    # Copy before extending: ensure_list() may hand back the very list
    # stored in ``data``, and extending that in place would silently
    # change the caller's mapping configuration.
    urls = list(ensure_list(data.get('csv_url')))
    urls.extend(ensure_list(data.get('csv_urls')))
    self.urls = {os.path.expandvars(url) for url in urls}
    if not self.urls:
        raise InvalidMapping("No CSV URLs are specified.")
def __init__(self, query: "QueryMapping", data: Dict[str, Any]) -> None:
    """Collect the configured CSV URLs and pre-parse the filters.

    URLs come from ``csv_url`` / ``csv_urls``; environment variables in
    them are expanded and duplicates collapse in the set. ``self.filters``
    and ``self.filters_not`` are read after the base initialiser has run
    (presumably set there) and passed through ``_parse_filters``.

    Raises:
        InvalidMapping: if no URL is configured.
    """
    super().__init__(query, data)
    expanded = (
        cast(str, os.path.expandvars(raw_url))
        for raw_url in keys_values(data, "csv_url", "csv_urls")
    )
    self.urls: Set[str] = set(expanded)
    if not self.urls:
        raise InvalidMapping("No CSV URLs are specified.")
    self.filters_set = self._parse_filters(self.filters)
    self.filters_not_set = self._parse_filters(self.filters_not)
def __init__(self, model, data, key_prefix=None):
    """Build all entity mappings of a query and order them by dependency.

    One ``EntityMapping`` is created per item under ``entities``; their
    column references are merged into ``self.refs``. Every mapping is then
    bound, and finally ``self.entities`` is re-ordered so that each entity
    comes after the entities it depends on.

    Raises:
        InvalidMapping: if no entities are configured, or if the remaining
            entities cannot be ordered because none of them has all of its
            dependencies resolved.
    """
    self.model = model
    self.data = data
    self.refs = set()
    self.entities = []

    entity_configs = data.get('entities', {})
    for entity_name, entity_config in entity_configs.items():
        mapping = EntityMapping(
            model, self, entity_name, entity_config, key_prefix=key_prefix
        )
        self.entities.append(mapping)
        self.refs.update(mapping.refs)

    if not self.entities:
        raise InvalidMapping("No entity mappings are defined.")

    # Check if the provided links satisfy the ranges of the given
    # properties (e.g. the owner of a company must be a legal person).
    for mapping in self.entities:
        mapping.bind()

    # Dependency resolution: repeatedly take the first pending entity whose
    # dependencies have all been placed already. This yields the order in
    # which entity IDs can be resolved for dependent entities.
    pending = self.entities
    self.entities = []
    resolved = set()
    while pending:
        ready = next(
            (cand for cand in pending if cand.dependencies.issubset(resolved)),
            None,
        )
        if ready is None:
            raise InvalidMapping("Circular entity dependency detected.")
        self.entities.append(ready)
        pending.remove(ready)
        resolved.add(ready.name)
def read_csv_url(self, url):
    """Yield the records parsed from the CSV document at ``url``.

    ``http``/``https`` URLs are streamed with ``requests``; any other URL
    is treated as a local file and its path component is opened directly.
    In both cases the line source is handed to ``self.read_csv``.

    Raises:
        InvalidMapping: if the HTTP request does not succeed.
    """
    parsed_url = requests.utils.urlparse(url)
    log.info("Loading: %s", url)
    if parsed_url.scheme in ('http', 'https'):
        # The context manager closes the streamed response, and with it
        # the connection, even when the request failed or the consumer
        # stops iterating before the body is exhausted.
        with requests.get(url, stream=True) as res:
            if not res.ok:
                raise InvalidMapping("Failed to open CSV: %s" % url)
            # Without a known encoding, iter_lines() yields raw bytes even
            # with decode_unicode=True, so fall back to UTF-8 to make sure
            # text lines are passed on.
            if res.encoding is None:
                res.encoding = 'utf-8'
            lines = res.iter_lines(decode_unicode=True)
            yield from self.read_csv(lines)
    else:
        with io.open(parsed_url.path, 'r') as fh:
            yield from self.read_csv(fh)
def read_csv_url(self, url: str) -> Generator[Record, None, None]:
    """Yield the records parsed from the CSV document at ``url``.

    ``http``/``https`` URLs are streamed with ``requests``; any other URL
    is treated as a local file and its path component is opened directly.
    In both cases the line source is handed to ``self.read_csv``.

    Raises:
        InvalidMapping: if the HTTP request does not succeed.
    """
    parsed_url = urlparse(url)
    log.info("Loading: %s", url)
    if parsed_url.scheme in ("http", "https"):
        # The context manager closes the streamed response, and with it
        # the connection, even when the request failed or the consumer
        # stops iterating before the body is exhausted.
        with requests.get(url, stream=True) as res:
            if not res.ok:
                raise InvalidMapping("Failed to open CSV: %s" % url)
            # Without a known encoding, iter_lines() yields raw bytes even
            # with decode_unicode=True, so fall back to UTF-8 to make sure
            # text lines are passed on.
            if res.encoding is None:
                res.encoding = "utf-8"
            lines = res.iter_lines(decode_unicode=True)
            yield from self.read_csv(lines)
    else:
        with io.open(parsed_url.path, "r") as fh:
            yield from self.read_csv(fh)
def __init__(self, meta: MetaData, data: Union[str, Dict[str, str]]) -> None:
    """Load a table and index its columns under uniquely labelled names.

    ``data`` is either the table name or a dict with ``table`` and an
    optional ``alias`` (the alias defaults to the table name). Each column
    of the aliased table gets a random ``col_…`` label and is registered in
    ``self.refs`` under both ``<alias>.<column>`` and the bare column name.

    Raises:
        InvalidMapping: if no table name is given.
    """
    config = {"table": data} if isinstance(data, str) else data
    table_ref = config.get("table")
    if table_ref is None:
        raise InvalidMapping("Query has no table!")
    alias_ref = config.get("alias", table_ref)

    self.table = Table(table_ref, meta, autoload=True)
    self.alias = self.table.alias(alias_ref)

    self.refs: Dict[str, Label[Any]] = {}
    for column in self.alias.columns:
        # A random label gives the column a unique name in the result set.
        label = "col_%s" % uuid4().hex[:10]
        labeled = column.label(label)
        qualified = "%s.%s" % (alias_ref, column.name)
        self.refs[qualified] = labeled
        self.refs[column.name] = labeled
def read_csv(self, url):
    """Yield one dict per row of the CSV document at ``url``.

    ``http``/``https`` URLs are streamed with ``requests``; any other URL
    is treated as a local file and its path component is opened directly.
    Rows are parsed with ``DictReader`` using ``skipinitialspace=True``.

    Raises:
        InvalidMapping: if the HTTP request does not succeed.
    """
    parsed_url = requests.utils.urlparse(url)
    log.info("Loading: %s", url)
    if parsed_url.scheme in ('http', 'https'):
        # The context manager closes the streamed response, and with it
        # the connection, even when the request failed or the consumer
        # stops iterating before the body is exhausted.
        with requests.get(url, stream=True) as res:
            if not res.ok:
                raise InvalidMapping("Failed to open CSV: %s" % url)
            # Without a known encoding, iter_lines() yields raw bytes even
            # with decode_unicode=True, and DictReader needs text lines;
            # fall back to UTF-8 in that case.
            if res.encoding is None:
                res.encoding = 'utf-8'
            lines = res.iter_lines(decode_unicode=True)
            for row in DictReader(lines, skipinitialspace=True):
                yield row
    else:
        with io.open(parsed_url.path, 'r') as fh:
            for row in DictReader(fh, skipinitialspace=True):
                yield row
def read_csv_url(self, url):
    """Yield the records parsed from the CSV document at ``url``.

    ``http``/``https`` URLs are streamed with ``requests``; any other URL
    is treated as a local file and its path component is opened directly.
    In both cases the line source is handed to ``self.read_csv``.

    Raises:
        InvalidMapping: if the HTTP request does not succeed.
    """
    parsed_url = requests.utils.urlparse(url)
    log.info("Loading: %s", url)
    if parsed_url.scheme in ('http', 'https'):
        # The context manager closes the streamed response, and with it
        # the connection, even when the request failed or the consumer
        # stops iterating before the body is exhausted.
        with requests.get(url, stream=True) as res:
            if not res.ok:
                raise InvalidMapping("Failed to open CSV: %s" % url)
            # Without a known encoding, iter_lines() yields raw bytes even
            # with decode_unicode=True, so fall back to UTF-8 to make sure
            # text lines are passed on.
            if res.encoding is None:
                res.encoding = 'utf-8'
            lines = res.iter_lines(decode_unicode=True)
            yield from self.read_csv(lines)
    else:
        # XXX: This is a security issue in conjunction with the
        # aleph mapping API because a user could map any file
        # from the server file system. Remove???
        with io.open(parsed_url.path, 'r') as fh:
            yield from self.read_csv(fh)
def __init__(self, query: "QueryMapping", data: Dict[str, Any]) -> None:
    """Connect to the configured database and prepare the queried tables.

    The ``database`` URI has environment variables expanded. An engine
    without connection pooling is created for it (with server-side cursors
    when the URI starts with ``postgres``), one ``QueryTable`` is built per
    entry in ``table`` / ``tables`` and the ``joins`` are stored as a list.

    Raises:
        InvalidMapping: if no ``database`` is configured.
    """
    super(SQLSource, self).__init__(query, data)
    configured_uri = data.get("database")
    if configured_uri is None:
        raise InvalidMapping("No database in SQL mapping!")
    self.database_uri = cast(str, os.path.expandvars(configured_uri))

    is_postgres = self.database_uri.lower().startswith("postgres")
    engine_options = {"server_side_cursors": True} if is_postgres else {}
    self.engine = create_engine(self.database_uri, poolclass=NullPool, **engine_options)  # type: ignore

    self.meta = MetaData()
    self.meta.bind = self.engine

    self.tables = [
        QueryTable(self.meta, table_config)
        for table_config in keys_values(data, "table", "tables")
    ]
    joins = ensure_list(data.get("joins"))
    self.joins = cast(List[Dict[str, str]], joins)
def source(self):
    """Create the source object that matches this mapping's configuration.

    A ``database`` entry selects ``SQLSource``; otherwise a ``csv_url`` or
    ``csv_urls`` entry selects ``CSVSource``.

    Raises:
        InvalidMapping: if none of these keys is present.
    """
    config = self.data
    if 'database' in config:
        return SQLSource(self, config)
    if any(key in config for key in ('csv_url', 'csv_urls')):
        return CSVSource(self, config)
    raise InvalidMapping("Cannot determine mapping type")
def _get_source(self, data: Dict[str, Any]) -> Source:
    """Create the source object that matches the given configuration.

    A ``database`` entry selects ``SQLSource``; otherwise a ``csv_url`` or
    ``csv_urls`` entry selects ``CSVSource``.

    Raises:
        InvalidMapping: if none of these keys is present.
    """
    if "database" in data:
        return SQLSource(self, data)
    wants_csv = "csv_url" in data or "csv_urls" in data
    if not wants_csv:
        raise InvalidMapping("Cannot determine mapping type: %r" % data)
    return CSVSource(self, data)
def get_column(self, ref: Optional[str]) -> Label[Any]:
    """Return the labelled column registered under ``ref``.

    The tables are searched in order and the first one that knows the
    reference wins.

    Raises:
        InvalidMapping: if no table has a column for ``ref``.
    """
    owner = next((table for table in self.tables if ref in table.refs), None)
    if owner is None:
        raise InvalidMapping("Missing reference: %s" % ref)
    return owner.refs[ref]
def source(self):
    """Create the source object that matches this mapping's configuration.

    A ``database`` entry selects ``SQLSource``; otherwise a ``csv_url`` or
    ``csv_urls`` entry selects ``CSVSource``.

    Raises:
        InvalidMapping: if none of these keys is present.
    """
    config = self.data
    if "database" in config:
        return SQLSource(self, config)
    has_csv = "csv_url" in config or "csv_urls" in config
    if not has_csv:
        raise InvalidMapping("Cannot determine mapping type")
    return CSVSource(self, config)
def get_column(self, ref):
    """Return the column registered under ``ref``.

    The tables are searched in order and the first one that knows the
    reference wins.

    Raises:
        InvalidMapping: if no table has a column for ``ref``.
    """
    owner = next((table for table in self.tables if ref in table.refs), None)
    if owner is None:
        raise InvalidMapping("Missing reference: %s" % ref)
    return owner.refs.get(ref)