Пример #1
0
def wrangle_columns(df, config):
    """Select, rename and optionally transform the columns of `df`.

    Each entry of `config.columns` is either a plain column name (kept
    under that name) or a single-item mapping `{target: source}`. `source`
    is either the name of the source column, or a mapping with the optional
    keys `column` (source column name, defaults to `target`) and `map` (an
    expression handed to `safe_eval`; the result is applied to the target
    column via `Series.map`).

    Returns the frame restricted to the selected columns, after renaming
    and mapping. Raises `ConfigError` for entries that are neither a string
    nor a mapping, and for mappings with more than one item.
    """
    selected = []
    renames = {}
    transforms = {}
    for spec in config.columns:
        if isinstance(spec, str):
            selected.append(spec)
            continue
        # Anything that is not a string must be a mapping with one entry.
        if not banal.is_mapping(spec) or len(spec) > 1:
            raise ConfigError(f'Column config `{spec}` has errors.')
        target, source = list(spec.items())[0]
        source_column = source
        if banal.is_mapping(source):
            source_column = source.get('column', target)
            expression = source.get('map')
            if expression:
                transforms[target] = safe_eval(expression)
        selected.append(source_column)
        renames[source_column] = target

    df = df[selected]
    if renames:
        df = df.rename(columns=renames)
    # Transforms are keyed by target name, i.e. the name after renaming.
    for name, transform in transforms.items():
        df[name] = df[name].map(transform)
    return df
Пример #2
0
def flatten_id(data, field, nested):
    """Set `data[field]` to a stringified ID, falling back to a nested object.

    If `data` is not a mapping it is returned unchanged. Otherwise the
    value stored under `field` is passed through `stringify`; when that
    yields `None` and `data[nested]` is a mapping, the nested mapping's
    `id` is stringified and used instead. The result (possibly `None`) is
    written back to `data[field]` in place and nothing is returned.
    """
    if not is_mapping(data):
        return data
    flat = stringify(data.get(field))
    if flat is None:
        child = data.get(nested)
        if is_mapping(child):
            flat = stringify(child.get('id'))
    data[field] = flat
Пример #3
0
def read_object(stream):
    """Read one line from `stream` and decode it as JSON.

    Returns `None` when the stream yields no more data. A decoded mapping
    with a `schema` key is turned into a proxy via `model.get_proxy`; one
    with an `enricher` key is turned into a `Result` for the enricher
    loaded by that name. Any other decoded value is returned as-is.
    """
    raw = stream.readline()
    if not raw:
        return None
    data = json.loads(raw)
    if not is_mapping(data):
        return data
    if 'schema' in data:
        return model.get_proxy(data)
    if 'enricher' in data:
        enricher = load_enricher(data.get('enricher'))
        return Result.from_dict(enricher, data)
    return data
Пример #4
0
def save_issue(conn: Conn, event: Dict[str, Any]) -> None:
    """Insert a log event as one row into `issue_table`.

    The event is copied, values offering `to_dict()` are expanded and sets
    become lists. The `logger`, `level`, `event` and `dataset` entries are
    moved into dedicated columns (`level` and `dataset` are required and
    raise `KeyError` when missing), an `entity` entry is reduced to its ID
    (and schema, when given as a mapping), and whatever remains is stored
    under `data`.
    """
    def _plain(value):
        # Expand objects exposing `to_dict()`, then turn sets into lists.
        if hasattr(value, "to_dict"):
            value = value.to_dict()
        return list(value) if isinstance(value, set) else value

    data = {key: _plain(value) for key, value in dict(event).items()}
    for dropped in ("_record", "timestamp"):
        data.pop(dropped, None)

    record = {
        "timestamp": settings.RUN_TIME,
        "module": data.pop("logger", None),
        "level": data.pop("level"),
        "message": data.pop("event", None),
        "dataset": data.pop("dataset"),
    }
    entity = data.pop("entity", None)
    if is_mapping(entity):
        record.update(
            entity_id=entity.get("id"),
            entity_schema=entity.get("schema"),
        )
    elif isinstance(entity, str):
        record["entity_id"] = entity
    # Everything not consumed above travels along as the free-form payload.
    record["data"] = data
    conn.execute(issue_table.insert().values([record]))
    return None
Пример #5
0
def to_jsonschema(obj):
    """Schema are stored in OpenAPI spec and might need some massaging
    to make for valid JSON Schema.

    Mappings and list-like values are converted recursively; any other
    value is returned unchanged. A mapping flagged `nullable` has its
    `type` and `format` replaced by a `oneOf` that also admits `null`, and
    the `nullable` keyword itself is always dropped. The input is never
    modified: a converted copy is returned.
    """
    if is_mapping(obj):
        # Work on a shallow copy so the caller's spec is not rewritten in
        # place; nested values get their own copy through the recursion.
        spec = dict(obj)
        # Re-write nullable fields:
        if spec.pop("nullable", False):
            type_ = spec.pop("type", None)
            format_ = spec.pop("format", None)
            spec["oneOf"] = [
                {
                    "type": "null"
                },
                {
                    "type": type_,
                    "format": format_
                },
            ]
        return {key: to_jsonschema(value) for key, value in spec.items()}
    if is_listish(obj):
        return [to_jsonschema(o) for o in obj]
    return obj
Пример #6
0
 def request(self,
             method,
             url,
             headers=None,
             auth=None,
             data=None,
             params=None,
             json=None,
             allow_redirects=True,
             lazy=False):
     """Build an HTTP request and wrap it in a `ContextHttpResponse`.

     `params` (a mapping or a list of pairs) is merged into `url` through
     `normalize_url`; `method` is upper-cased and stripped. The response
     is fetched immediately unless `lazy` is true.

     Returns the `ContextHttpResponse` for the request.
     """
     # A `{}` default is created once and shared by every call, so headers
     # added to one request could leak into later ones.
     if headers is None:
         headers = {}
     if is_mapping(params):
         params = list(params.items())
     url = normalize_url(url, extra_query_args=params)
     method = method.upper().strip()
     request = Request(method,
                       url,
                       data=data,
                       headers=headers,
                       json=json,
                       auth=auth)
     response = ContextHttpResponse(self,
                                    request=request,
                                    allow_redirects=allow_redirects)
     if not lazy:
         response.fetch()
     return response
Пример #7
0
def bulk_write(collection, items, merge=True):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.

    Entities sharing an ID within one batch are merged. A batch is sent to
    `index.index_bulk` as soon as it holds `BULK_PAGE` entities, and once
    more at the end for the remainder; the collection is refreshed last.
    Raises `InvalidData` for items that are not mappings or lack an ID.
    """
    namespace = Namespace(collection.foreign_id)
    batch = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)

        proxy = namespace.apply(model.get_proxy(item))
        proxy.context = {'bulk': True, 'collection_id': collection.id}
        if proxy.id is None:
            raise InvalidData("No ID for entity", errors=item)

        if proxy.id not in batch:
            batch[proxy.id] = proxy
        else:
            batch[proxy.id].merge(proxy)

        # Flush a full page and start collecting the next one.
        if len(batch) >= BULK_PAGE:
            index.index_bulk(collection.id, batch, merge=merge)
            batch = {}

    if batch:
        index.index_bulk(collection.id, batch, merge=merge)

    refresh_collection(collection)
Пример #8
0
def bulk_write(collection, entities, unsafe=False, role_id=None, index=True):
    """Write a set of entities - given as dicts - to the index.

    Every entity is validated, namespaced with `collection.ns`, stripped
    of checksums unless `unsafe` is set, stamped with `role_id` and the
    current UTC time, and put into the collection's aggregator with origin
    `bulk`. When `index` is true the aggregator is indexed afterwards and
    the collection refreshed. Raises `InvalidData` for items that are not
    mappings or lack an ID.
    """
    # This is called mainly by the /api/2/collections/X/_bulk API.
    timestamp = datetime.utcnow().isoformat()
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    seen_ids = set()
    for data in entities:
        if not is_mapping(data):
            raise InvalidData("Failed to read input data", errors=data)
        proxy = model.get_proxy(data)
        if proxy.id is None:
            raise InvalidData("No ID for entity", errors=proxy.to_dict())
        proxy = collection.ns.apply(proxy)
        if not unsafe:
            proxy = remove_checksums(proxy)
        proxy.context = {
            'role_id': role_id,
            'created_at': timestamp,
            'updated_at': timestamp,
        }
        writer.put(proxy, origin='bulk')
        # Stop tracking individual IDs once the page limit is reached.
        if index and len(seen_ids) < MAX_PAGE:
            seen_ids.add(proxy.id)
    writer.flush()
    if not index:
        return
    # With MAX_PAGE or more IDs no explicit ID list is passed on
    # (presumably meaning "index everything" - confirm in index_aggregator).
    if len(seen_ids) >= MAX_PAGE:
        seen_ids = None
    index_aggregator(collection, aggregator, entity_ids=seen_ids)
    refresh_collection(collection.id)
Пример #9
0
def apply_ops(df, ops):
    """apply any valid operation from `pd.DataFrame.<op>` with optional arguments in given order

    Each item of `ops` is either the name of a `DataFrame` method (called
    without arguments) or a single-item mapping `{name: kwargs}`. Within
    `kwargs`, the value of a `func` key is passed through `safe_eval`
    before the call; a `kwargs` of `None` means "no arguments". The result
    of each call is the input of the next one.

    Returns the result of the last operation (`df` itself when `ops` is
    empty). Raises `ConfigError` when a mapping item does not have exactly
    one entry, when its arguments are neither `None` nor a mapping, or when
    the name is not a callable attribute of `DataFrame`.
    """
    for op in ops:
        op_name = op
        op_args = None
        if banal.is_mapping(op):
            if len(op) != 1:
                raise ConfigError(
                    f'Operation not valid: {list(op.keys())} - should be only 1 item.')
            op_name, raw_args = next(iter(op.items()))
            if raw_args is not None:
                if not banal.is_mapping(raw_args):
                    raise ConfigError(
                        f'Operation arguments not valid: {list(op.values())} - should be only 1 mapping item.'
                    )
                # NOTE(review): `func` values are evaluated from config;
                # confirm that `safe_eval` is restricted enough for the
                # config sources in use.
                op_args = {
                    k: safe_eval(v) if k == 'func' else v
                    for k, v in raw_args.items()
                }
        # Only string names can be looked up as attributes.
        func = getattr(DataFrame, op_name, None) if isinstance(op_name, str) else None
        if func is None or not callable(func):
            raise ConfigError(
                f'{op} is not a valid operation for `pd.DataFrame`')
        if op_args:
            df = func(df, **op_args)
        else:
            df = func(df)
    return df
Пример #10
0
 def convert_entity(self, result, data):
     """Convert a server-side entity dict into an entity added to `result`.

     Returns `None` when `data` lacks `properties` or `schema`, or when
     `result.make_entity` rejects the schema with `InvalidData`. Otherwise
     the entity gets the ID from `data`, the `links.self` value as
     `alephUrl`, and every property value. Property values that are
     mappings are converted recursively and replaced by the ID of the
     resulting entity. Values rejected with `InvalidData` are logged and
     skipped.
     """
     data = ensure_dict(data)
     if 'properties' not in data or 'schema' not in data:
         return
     try:
         entity = result.make_entity(data.get('schema'))
     except InvalidData:
         log.error("Server model mismatch: %s", data.get('schema'))
         return
     entity.id = data.get('id')
     links = ensure_dict(data.get('links'))
     entity.add('alephUrl', links.get('self'))
     properties = ensure_dict(data.get('properties'))
     for prop, values in properties.items():
         for value in ensure_list(values):
             if is_mapping(value):
                 child = self.convert_entity(result, value)
                 # The nested conversion returns None for unusable data;
                 # skip those instead of failing on `child.id`.
                 if child is None or child.id is None:
                     continue
                 value = child.id
             try:
                 entity.add(prop, value, cleaned=True)
             except InvalidData:
                 msg = "Server property mismatch (%s): %s"
                 log.warning(msg, entity.schema.name, prop)
     result.add_entity(entity)
     return entity
Пример #11
0
def filter_text(spec, invert=False):
    """Try to convert a given filter to a lucene query string.

    Lists, tuples and sets are converted item by item and joined with
    spaces. Values that are not mappings are returned unchanged. For a
    mapping, the first `term`, `terms` or `exists` operator found decides
    the result; with `invert` the field names are prefixed with `-`.
    Mappings without any of these operators yield `None`.
    """
    # CAVEAT: This doesn't cover all filters used by aleph.
    if isinstance(spec, (list, tuple, set)):
        return " ".join(filter_text(item, invert=invert) for item in spec)
    if not is_mapping(spec):
        return spec

    def _signed(name):
        # Negated filters are expressed by prefixing the field name.
        return "-%s" % name if invert else name

    for op, props in spec.items():
        if op == "term":
            field, value = next(iter(props.items()))
            return '%s:"%s"' % (_signed(field), value)
        if op == "terms":
            field, values = next(iter(props.items()))
            # Expand into single-term filters and combine their texts.
            parts = [
                filter_text({"term": {field: v}}, invert=invert)
                for v in values
            ]
            joiner = " AND " if invert else " OR "
            text = joiner.join(parts)
            return "(%s)" % text if len(parts) > 1 else text
        if op == "exists":
            return "%s:*" % _signed(props.get("field"))
Пример #12
0
def get_entity_id(obj):
    """Given an entity-ish object, try to get the ID.

    Mappings contribute their `id` entry, other objects their `id`
    attribute when they have one; anything else is used directly. The
    value is passed through `sanitize_text` before being returned.
    """
    if is_mapping(obj):
        return sanitize_text(obj.get('id'))
    return sanitize_text(getattr(obj, 'id', obj))
Пример #13
0
def bulk_write(collection, items):
    """Write a set of entities - given as raw dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.

    Entities sharing an ID within one batch are merged. A batch is sent to
    `index.index_bulk` as soon as it holds `BULK_PAGE` entities, and once
    more at the end for the remainder. Raises `InvalidData` for items that
    are not mappings or lack an ID.
    """
    batch = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data")

        proxy = model.get_proxy(item)
        if proxy.id is None:
            raise InvalidData("No ID for entity")

        if proxy.id not in batch:
            batch[proxy.id] = proxy
        else:
            batch[proxy.id].merge(proxy)

        # Flush a full page and start collecting the next one.
        if len(batch) >= BULK_PAGE:
            index.index_bulk(collection.id, batch)
            batch = {}

    if batch:
        index.index_bulk(collection.id, batch)
Пример #14
0
    def request(
        self,
        method,
        url,
        headers=None,
        auth=None,
        data=None,
        params=None,
        json=None,
        allow_redirects=True,
        timeout=settings.HTTP_TIMEOUT,
        lazy=False,
    ):
        """Build an HTTP request and wrap it in a `ContextHttpResponse`.

        A mapping given as `params` is converted to a list of pairs before
        it is handed to `Request`; `method` is upper-cased and stripped.
        The response is fetched immediately unless `lazy` is true.

        Returns the `ContextHttpResponse` for the request.
        """
        # A `{}` default is created once and shared by every call, so
        # headers added to one request could leak into later ones.
        if headers is None:
            headers = {}
        if is_mapping(params):
            params = list(params.items())

        method = method.upper().strip()
        request = Request(method,
                          url,
                          data=data,
                          headers=headers,
                          json=json,
                          auth=auth,
                          params=params)
        response = ContextHttpResponse(self,
                                       request=request,
                                       allow_redirects=allow_redirects,
                                       timeout=timeout)
        if not lazy:
            response.fetch()
        return response
Пример #15
0
def read_entity(stream):
    """Read one line from `stream` and decode it as JSON.

    Returns `None` when the stream yields no more data. A decoded mapping
    with a `schema` key is turned into a proxy via `model.get_proxy`; any
    other decoded value is returned as-is.
    """
    raw = stream.readline()
    if not raw:
        return None
    data = json.loads(raw)
    looks_like_entity = is_mapping(data) and 'schema' in data
    return model.get_proxy(data) if looks_like_entity else data
Пример #16
0
 def convert_nested(self, data):
     """Yield the converted nested entities of `data`, then `data` itself.

     The top-level entity is converted first but yielded last; every
     property value that is a mapping is converted and yielded in between.
     """
     entity = self.convert_entity(data)
     properties = ensure_dict(data.get("properties"))
     for values in properties.values():
         yield from (
             self.convert_entity(nested)
             for nested in ensure_list(values)
             if is_mapping(nested)
         )
     yield entity
Пример #17
0
def convert_classification(entity, item, prop='classification'):
    """Add a classification `item` to `entity` under `prop`.

    A non-mapping item is added directly. For a mapping, a nested
    `classification` entry is converted recursively, the mapping is passed
    to `convert_description`, and its `deliveryAddress` entry is removed
    from the mapping and passed to `convert_address` (an empty dict when
    absent).
    """
    if is_mapping(item):
        if 'classification' in item:
            nested = item.get('classification')
            convert_classification(entity, nested, prop)
        convert_description(entity, item, prop)
        address = item.pop('deliveryAddress', {})
        convert_address(entity, address)
        return
    entity.add(prop, item)
Пример #18
0
def get_entity_id(obj):
    """Given an entity-ish object, try to get the ID.

    Strings are returned as-is, mappings contribute their `id` entry and
    other objects their `id` attribute. Returns `None` when no ID can be
    found.
    """
    if isinstance(obj, str):
        return obj
    if is_mapping(obj):
        return obj.get('id')
    return getattr(obj, 'id', None)
Пример #19
0
 def _generate():
     """Yield the processed form of every item in the enclosing `entities`.

     Each item must be a mapping (otherwise `InvalidData` is raised); it is
     turned into a proxy, stripped of checksums unless the enclosing
     `unsafe` flag is set, and passed through `_process_entity`.
     """
     for data in entities:
         if not is_mapping(data):
             raise InvalidData("Failed to read input data", errors=data)
         proxy = model.get_proxy(data)
         yield _process_entity(proxy if unsafe else remove_checksums(proxy))
Пример #20
0
def merge_data(old, new):
    """Extend the values of the new doc with extra values from the old.

    If either side is a sequence, both are turned into lists, the old
    values are appended to the new ones and the result is de-duplicated
    with `unique_list`. If either side is a mapping, the union of keys is
    merged recursively and keys whose merged value is `None` are dropped.
    Otherwise `new` wins unless it is falsy.
    """
    if is_sequence(old) or is_sequence(new):
        merged = ensure_list(new)
        merged.extend(ensure_list(old))
        return unique_list(merged)

    old_is_map = is_mapping(old)
    new_is_map = is_mapping(new)
    if not (old_is_map or new_is_map):
        return new or old

    # Treat a non-mapping side as empty so only the other side contributes.
    if not old_is_map:
        old = {}
    if not new_is_map:
        new = {}
    keys = set(new.keys())
    keys.update(old.keys())
    candidates = (
        (key, merge_data(old.get(key), new.get(key))) for key in keys
    )
    return {key: value for key, value in candidates if value is not None}
Пример #21
0
def object_id(obj, clazz=None):
    """Turn a given object into an ID that can be stored in with
    the notification.

    Instances of `clazz` contribute their `id` attribute, mappings their
    `id` entry; anything else is returned unchanged. Without `clazz` the
    object's own type is used, so its `id` attribute is always read.
    """
    expected = clazz or type(obj)
    if isinstance(obj, expected):
        return obj.id
    if is_mapping(obj):
        return obj.get('id')
    return obj
Пример #22
0
def read_result(stream):
    """Read one line from `stream` and decode it as JSON.

    Returns `None` when the stream yields no more data. A decoded mapping
    with an `enricher` key is turned into a `Result` for the enricher
    loaded by that name; any other decoded value is returned as-is.
    """
    raw = stream.readline()
    if not raw:
        return None
    data = json.loads(raw)
    if not (is_mapping(data) and 'enricher' in data):
        return data
    enricher = load_enricher(data.get('enricher'))
    return Result.from_dict(enricher, data)
Пример #23
0
def refresh_entity(entity, sync=False):
    """Delete the cache keys of an entity and of its collection.

    `entity` may be a mapping (read via `id` / `collection_id` entries) or
    an object exposing the same names as attributes. `sync` is accepted
    but not used here.
    """
    if is_mapping(entity):
        entity_id, collection_id = entity.get('id'), entity.get('collection_id')
    else:
        entity_id, collection_id = entity.id, entity.collection_id
    stale_keys = (
        cache.object_key(Entity, entity_id),
        cache.object_key(Collection, collection_id),
    )
    cache.kv.delete(*stale_keys)
Пример #24
0
def refresh_entity(entity, sync=False):
    """Delete the cache keys of an entity and of its collection.

    `entity` may be a mapping (read via `id` / `collection_id` entries) or
    an object exposing the same names as attributes. `sync` is accepted
    but not used here.
    """
    from_mapping = is_mapping(entity)
    entity_id = entity.get('id') if from_mapping else entity.id
    if from_mapping:
        collection_id = entity.get('collection_id')
    else:
        collection_id = entity.collection_id
    entity_key = cache.object_key(Entity, entity_id)
    collection_key = cache.object_key(Collection, collection_id)
    cache.kv.delete(entity_key, collection_key)
Пример #25
0
def get_entity_id(obj):
    """Given an entity-ish object, try to get the ID.

    Mappings contribute their `id` entry, other objects their `id`
    attribute; objects without such an attribute are returned unchanged.
    """
    if is_mapping(obj):
        return obj.get("id")
    return getattr(obj, "id", obj)
Пример #26
0
def get_entity_id(obj: Any) -> Optional[str]:
    """Given an entity-ish object, try to get the ID.

    Mappings contribute their `id` entry, other objects their `id`
    attribute; objects without such an attribute are used directly. The
    value is passed through `stringify` before being returned.
    """
    if is_mapping(obj):
        candidate = obj.get("id")
    else:
        candidate = getattr(obj, "id", obj)
    return stringify(candidate)
Пример #27
0
 def ref(self, value):
     """Generate a qualified form for storage in a triplestore.

     Returns `<prefix>:<value>`, where a mapping contributes its `id`
     entry and the value is passed through `stringify`. Returns `None`
     when the type has no prefix or the value stringifies to `None`.
     """
     if self.prefix is None:
         return None
     raw = value.get('id') if is_mapping(value) else value
     text = stringify(raw)
     return None if text is None else ':'.join((self.prefix, text))
Пример #28
0
    def __init__(self, schema, id, properties, key_prefix=None):
        """Set up the object and load the given property values.

        `id` and `key_prefix` are stored in stringified form. When
        `properties` is a mapping, every entry is handed to `self.add`
        with `cleaned=True` and `quiet=True`; any other value is ignored.
        """
        self.schema = schema
        self.id = stringify(id)
        self.key_prefix = stringify(key_prefix)
        self.countries, self.names = set(), set()
        self._properties = {}

        if not is_mapping(properties):
            return
        for name, values in properties.items():
            self.add(name, values, cleaned=True, quiet=True)
Пример #29
0
    def generate(self):
        """Register this property with the model and resolve its links.

        Adds `self` to `self.model.properties`. When no range is set and
        the type is `registry.entity`, the range is looked up in the model
        using the `range` entry of `self.data`. When no reverse is set but
        a range and a `reverse` entry exist, the reverse is created through
        the range's `_add_reverse`. Raises `InvalidModel` when the
        `reverse` entry is not a mapping.
        """
        self.model.properties.add(self)

        needs_range = self.range is None and self.type == registry.entity
        if needs_range:
            self.range = self.model.get(self.data.get('range'))

        reverse_ = self.data.get('reverse')
        # Nothing to link unless a reverse is declared, a range is known
        # and no reverse has been set up before.
        if self.reverse is not None or not self.range or not reverse_:
            return
        if not is_mapping(reverse_):
            raise InvalidModel("Invalid reverse: %s" % self)
        self.reverse = self.range._add_reverse(reverse_, self)
Пример #30
0
    def generate(self):
        """Resolve the range schema and the reverse of this property.

        The range name comes from the `schema` entry of `self.data`
        (default `Thing`) and is looked up in `self.schema.model`; a
        failed lookup raises `InvalidModel`. When a range and a `reverse`
        entry exist, the reverse is created through the range's
        `_add_reverse`. Raises `InvalidModel` when the `reverse` entry is
        not a mapping.
        """
        range_ = self.data.get('schema', 'Thing')
        if range_:
            self.range = self.schema.model.get(range_)
            if self.range is None:
                # NOTE(review): `self._range` is not set in this method;
                # confirm it exists, or whether `range_` was intended.
                raise InvalidModel("Cannot find range: %s" % self._range)

        reverse_ = self.data.get('reverse')
        if not (self.range and reverse_):
            return
        if not is_mapping(reverse_):
            raise InvalidModel("Invalid reverse: %s" % self)
        self.reverse = self.range._add_reverse(reverse_, self)
Пример #31
0
def convert_identifier(entity, identifier):
    """Add an identifier to `entity` under the property for its scheme.

    A non-mapping identifier is added under the default identifier
    property. For a mapping, `convert_name` is applied first, then the
    `scheme` and `id` entries are removed from the mapping and the ID is
    added under the property registered for the scheme in `IDENTIFIERS`.
    Unknown schemes are logged once and registered with the default
    property.
    """
    if not is_mapping(identifier):
        entity.add(DEFTAULT_IDENTIFIER, identifier)
        return
    convert_name(entity, identifier)
    scheme = identifier.pop('scheme', None)
    prop = IDENTIFIERS.get(scheme, None)
    if prop is None:
        log.info("Unknown identifier scheme: %s", scheme)
        # Remember the fallback so the scheme is only reported once.
        prop = IDENTIFIERS[scheme] = DEFTAULT_IDENTIFIER
    value = identifier.pop('id', None)
    entity.add(prop, value)
Пример #32
0
Файл: bulk.py Проект: pudo/aleph
def bulk_write(collection, items, merge=True, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.

    Unless `unsafe` is set, each entity is namespaced with the collection's
    foreign ID and stripped of checksums. Entities sharing an ID within one
    batch are merged. A batch is sent to `index.index_bulk` as soon as it
    holds `BULK_PAGE` entities, and once more at the end for the remainder;
    the collection is refreshed last. Raises `InvalidData` for items that
    are not mappings or lack an ID.
    """
    namespace = Namespace(collection.foreign_id)
    batch = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)

        proxy = model.get_proxy(item)
        if not unsafe:
            proxy = remove_checksums(namespace.apply(proxy))
        proxy.context = {'bulk': True, 'collection_id': collection.id}
        if proxy.id is None:
            raise InvalidData("No ID for entity", errors=item)

        if proxy.id not in batch:
            batch[proxy.id] = proxy
        else:
            batch[proxy.id].merge(proxy)

        # Flush a full page and start collecting the next one.
        if len(batch) >= BULK_PAGE:
            index.index_bulk(collection.id, batch, merge=merge)
            batch = {}

    if batch:
        index.index_bulk(collection.id, batch, merge=merge)

    refresh_collection(collection)