Example #1
def filter(data, where):
    """
    where  - a function that accepts (record, rownum, rows) and returns boolean
    """
    if len(data) == 0 or where == None or where == TRUE:
        return data

    if isinstance(data, Container):
        return data.filter(where)

    if is_container(data):
        temp = jx_expression_to_function(where)
        dd = wrap(data)
        return wrap([unwrap(d) for i, d in enumerate(data) if temp(wrap(d), i, dd)])
    else:
        Log.error(
            "Do not know how to handle type {{type}}", type=data.__class__.__name__
        )

    try:
        return drill_filter(where, data)
    except Exception as _:
        # WOW!  THIS IS INEFFICIENT!
        return wrap(
            [unwrap(d) for d in drill_filter(where, [DataObject(d) for d in data])]
        )
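A minimal usage sketch, not part of the library source: assuming jx_expression_to_function passes a plain callable through unchanged, `where` can be the (record, rownum, rows) function the docstring describes (note this `filter` shadows the Python builtin):

rows = [{"a": 1}, {"a": 5}, {"a": 9}]
big = filter(rows, lambda record, rownum, rows: record["a"] > 3)
# big -> wrapped [{"a": 5}, {"a": 9}]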
Example #2
    def _send_email(self):
        try:
            if not self.accumulation:
                return
            with Closer(connect_to_region(
                self.settings.region,
                aws_access_key_id=unwrap(self.settings.aws_access_key_id),
                aws_secret_access_key=unwrap(self.settings.aws_secret_access_key)
            )) as conn:

                # WHO ARE WE SENDING TO
                emails = Data()
                for template, params in self.accumulation:
                    content = expand_template(template, params)
                    emails[literal_field(self.settings.to_address)] += [content]
                    for c in self.cc:
                        if any(pattern in params.params.error for pattern in c.contains):
                            emails[literal_field(c.to_address)] += [content]

                # SEND TO EACH
                for to_address, content in emails.items():
                    conn.send_email(
                        source=self.settings.from_address,
                        to_addresses=listwrap(to_address),
                        subject=self.settings.subject,
                        body="\n\n".join(content),
                        format="text"
                    )

            self.next_send = Date.now() + self.settings.max_interval
            self.accumulation = []
        except Exception as e:
            self.next_send = Date.now() + self.settings.max_interval
            Log.warning("Could not send", e)
Example #3
    def __init__(self, instance_manager, disable_prices=False, kwargs=None):
        self.settings = kwargs
        self.instance_manager = instance_manager
        aws_args = dict(
            region_name=kwargs.aws.region,
            aws_access_key_id=unwrap(kwargs.aws.aws_access_key_id),
            aws_secret_access_key=unwrap(kwargs.aws.aws_secret_access_key)
        )
        self.ec2_conn = boto.ec2.connect_to_region(**aws_args)
        self.vpc_conn = boto.vpc.connect_to_region(**aws_args)
        self.price_locker = Lock()
        self.prices = None
        self.price_lookup = None
        self.done_spot_requests = Signal()
        self.net_new_locker = Lock()
        self.net_new_spot_requests = UniqueIndex(("id",))  # SPOT REQUESTS FOR THIS SESSION
        self.watcher = None
        self.active = None

        self.settings.uptime.bid_percentile = coalesce(self.settings.uptime.bid_percentile, self.settings.bid_percentile)
        self.settings.uptime.history = coalesce(Duration(self.settings.uptime.history), DAY)
        self.settings.uptime.duration = coalesce(Duration(self.settings.uptime.duration), Duration("5minute"))
        self.settings.max_percent_per_type = coalesce(self.settings.max_percent_per_type, 1)

        if ENABLE_SIDE_EFFECTS and instance_manager and instance_manager.setup_required():
            self._start_life_cycle_watcher()
        if not disable_prices:
            self.pricing()
Example #4
def aggs_iterator(aggs, decoders, coord=True):
    """
    DIG INTO ES'S RECURSIVE aggs DATA-STRUCTURE:
    RETURN AN ITERATOR OVER THE EFFECTIVE ROWS OF THE RESULTS

    :param aggs: ES AGGREGATE OBJECT
    :param decoders:
    :param coord: TURN ON LOCAL COORDINATE LOOKUP
    """
    depth = max(d.start + d.num_columns for d in decoders)

    def _aggs_iterator(agg, d):
        agg = drill(agg)

        if d > 0:
            for k, v in agg.items():
                if k == "_match":
                    v = drill(v)
                    for i, b in enumerate(v.get("buckets", EMPTY_LIST)):
                        b["_index"] = i
                        for a, parts in _aggs_iterator(b, d - 1):
                            yield a, parts + (b,)
                elif k == "_other":
                    for b in v.get("buckets", EMPTY_LIST):
                        for a, parts in _aggs_iterator(b, d - 1):
                            yield a, parts + (Null,)
                elif k == "_missing":
                    b = drill(v)
                    for a, parts in _aggs_iterator(b, d - 1):
                        yield a, parts + (b,)
                elif k.startswith("_join_"):
                    v["key"] = int(k[6:])
                    for a, parts in _aggs_iterator(v, d - 1):
                        yield a, parts + (v,)
        else:
            for k, v in agg.items():
                if k == "_match":
                    v = drill(v)
                    for i, b in enumerate(v.get("buckets", EMPTY_LIST)):
                        b["_index"] = i
                        yield b, (b,)
                elif k == "_other":
                    for b in v.get("buckets", EMPTY_LIST):
                        yield b, (Null,)
                elif k == "_missing":
                    b = drill(v)
                    yield b, (v,)
                elif k.startswith("_join_"):
                    v["_index"] = int(k[6:])
                    yield v, (v,)

    if coord:
        for a, parts in _aggs_iterator(unwrap(aggs), depth - 1):
            coord = tuple(d.get_index(parts) for d in decoders)
            if any(c is None for c in coord):
                continue
            yield parts, coord, a
    else:
        for a, parts in _aggs_iterator(unwrap(aggs), depth - 1):
            yield parts, None, a
Example #5
    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        self.type = "range"
        self.NULL = Null

        if self.partitions:
            # IGNORE THE min, max, interval
            if not self.key:
                Log.error("Must have a key value")

            parts = listwrap(self.partitions)
            for i, p in enumerate(parts):
                self.min = MIN([self.min, p.min])
                self.max = MAX([self.max, p.max])
                if p.dataIndex != None and p.dataIndex != i:
                    Log.error("Expecting `dataIndex` to agree with the order of the parts")
                if p[self.key] == None:
                    Log.error("Expecting all parts to have {{key}} as a property", key=self.key)
                p.dataIndex = i

            # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
            for p, q in itertools.product(parts, parts):
                if p.min <= q.min and q.min < p.max and unwrap(p) is not unwrap(q):
                    Log.error("partitions overlap!")

            self.partitions = wrap(parts)
            return
        elif any([self.min == None, self.max == None, self.interval == None]):
            Log.error("Can not handle missing parameter")

        self.key = "min"
        self.partitions = wrap([{"min": v, "max": v + self.interval, "dataIndex": i} for i, v in enumerate(frange(self.min, self.max, self.interval))])
Example #6
    def extend(self, records):
        """
        JUST SO WE MODEL A Queue
        """
        records = {
            v["id"]: v["value"] if "value" in v else mo_json.json2value(v['json'])
            for v in records
        }

        unwrap(self.data).update(records)
        self.refresh()
        Log.note("{{num}} documents added", num=len(records))
Example #7
    def extend(self, records):
        """
        JUST SO WE MODEL A Queue
        """
        records = {v["id"]: v["value"] for v in records}

        unwrap(self.data).update(records)

        data_as_json = convert.value2json(self.data, pretty=True)

        File(self.filename).write(data_as_json)
        Log.note("{{num}} documents added",  num= len(records))
Example #8
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception as e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERMS
            output = FlatList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    output.append({"terms": {k: v}})
                else:
                    if isinstance(edge, text_type):
                        # DIRECT FIELD REFERENCE
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception as e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if isinstance(fields, Mapping):
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif isinstance(fields, list) and len(fields) == 1 and is_variable_name(fields[0]):
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
            return {"and": output}
        elif where["or"]:
            return {"or": [unwrap(_where_terms(master, vv, schema)) for vv in where["or"]]}
        elif where["and"]:
            return {"and": [unwrap(_where_terms(master, vv, schema)) for vv in where["and"]]}
        elif where["not"]:
            return {"not": unwrap(_where_terms(master, where["not"], schema))}
    return where
Example #9
def es_fieldop(es, query):
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)
    FromES.query = {
        "filtered": {
            "query": {
                "match_all": {}
            },
            "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    }
    FromES.size = coalesce(query.limit, 200000)
    FromES.fields = FlatList()
    for s in select.value:
        if s == "*":
            FromES.fields = None
        elif isinstance(s, list):
            FromES.fields.extend(s)
        elif isinstance(s, Mapping):
            FromES.fields.extend(s.values())
        else:
            FromES.fields.append(s)
    FromES.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    data = es09.util.post(es, FromES, query.limit)

    T = data.hits.hits
    matrices = {}
    for s in select:
        if s.value == "*":
            matrices[s.name] = Matrix.wrap([t._source for t in T])
        elif isinstance(s.value, Mapping):
            # for k, v in s.value.items():
            #     matrices[join_field(split_field(s.name)+[k])] = Matrix.wrap([unwrap(t.fields)[v] for t in T])
            matrices[s.name] = Matrix.wrap([{k: unwrap(t.fields).get(v, None) for k, v in s.value.items()} for t in T])
        elif isinstance(s.value, list):
            matrices[s.name] = Matrix.wrap([tuple(unwrap(t.fields).get(ss, None) for ss in s.value) for t in T])
        elif not s.value:
            matrices[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
        else:
            try:
                matrices[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
            except Exception as e:
                Log.error("", e)

    cube = Cube(query.select, query.edges, matrices, frum=query)
    cube.frum = query
    return cube
Example #10
def make_log_from_settings(settings):
    assert settings["class"]

    # IMPORT MODULE FOR HANDLER
    path = settings["class"].split(".")
    class_name = path[-1]
    path = ".".join(path[:-1])
    constructor = None
    try:
        temp = __import__(path, globals(), locals(), [class_name], 0)
        constructor = object.__getattribute__(temp, class_name)
    except Exception as e:
        if settings.stream and not constructor:
            # PROVIDE A DEFAULT STREAM HANDLER
            constructor = StructuredLogger_usingThreadedStream
        else:
            Log.error("Can not find class {{class}}",  {"class": path}, cause=e)

    # IF WE NEED A FILE, MAKE SURE DIRECTORY EXISTS
    if settings.filename != None:
        from mo_files import File

        f = File(settings.filename)
        if not f.parent.exists:
            f.parent.create()

    settings['class'] = None
    params = unwrap(settings)
    log_instance = constructor(**params)
    return log_instance
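A usage sketch; the class path is hypothetical, standing in for any importable constructor whose keyword arguments match the remaining settings:

settings = wrap({
    "class": "my_package.loggers.FileLogger",  # hypothetical class path
    "filename": "logs/app.log",
})
log_instance = make_log_from_settings(settings)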
Example #11
    def __setitem__(self, key, value):
        if key == "":
            get_logger().error("key is empty string.  Probably a bad idea")
        if isinstance(key, str):
            key = key.decode("utf8")
        d = self
        try:
            value = unwrap(value)
            if key.find(".") == -1:
                if value is None:
                    dict.pop(d, key, None)
                else:
                    dict.__setitem__(d, key, value)
                return self

            seq = _split_field(key)
            for k in seq[:-1]:
                d = _getdefault(d, k)
            if value == None:
                dict.pop(d, seq[-1], None)
            else:
                dict.__setitem__(d, seq[-1], value)
            return self
        except Exception as e:
            raise e
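A sketch of the dotted-path behavior, assuming a mo_dots Data instance:

d = Data()
d["build.branch"] = "central"  # creates {"build": {"branch": "central"}}
d["build.branch"] = None       # assigning None deletes the leaf again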
Example #12
def tuple(data, field_name):
    """
    RETURN LIST  OF TUPLES
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")

    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if is_data(field_name) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        if len(split_field(field_name)) == 1:
            return [(d[field_name],) for d in data]
        else:
            path = split_field(field_name)
            output = []
            flat_list._tuple1(data, path, 0, output)
            return output
    elif is_list(field_name):
        paths = [_select_a_field(f) for f in field_name]
        output = FlatList()
        _tuple((), unwrap(data), paths, 0, output)
        return output
    else:
        paths = [_select_a_field(field_name)]
        output = FlatList()
        _tuple((), data, paths, 0, output)
        return output
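A usage sketch of the single-field branch (note this `tuple` shadows the builtin):

tuple([{"a": 1}, {"a": 2}], "a")
# -> [(1,), (2,)]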
Example #13
def dict2Multiset(dic):
    if dic == None:
        return None
    from mo_collections.multiset import Multiset
    output = Multiset()
    output.dic = unwrap(dic).copy()
    return output
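A usage sketch, assuming Multiset keeps its counts in the `dic` attribute as shown above:

m = dict2Multiset({"errors": 3, "warnings": 7})
# m.dic -> {"errors": 3, "warnings": 7}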
Example #14
    def __init__(self, name, params, cwd=None, env=None, debug=False, shell=False, bufsize=-1):
        self.name = name
        self.service_stopped = Signal("stopped signal for " + strings.quote(name))
        self.stdin = Queue("stdin for process " + strings.quote(name), silent=True)
        self.stdout = Queue("stdout for process " + strings.quote(name), silent=True)
        self.stderr = Queue("stderr for process " + strings.quote(name), silent=True)

        try:
            self.debug = debug or DEBUG
            self.service = service = subprocess.Popen(
                params,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                bufsize=bufsize,
                cwd=cwd if isinstance(cwd, (basestring, NullType, NoneType)) else cwd.abspath,
                env=unwrap(set_default(env, os.environ)),
                shell=shell
            )

            self.please_stop = Signal()
            self.please_stop.on_go(self._kill)
            self.thread_locker = Lock()
            self.children = [
                Thread.run(self.name + " stdin", self._writer, service.stdin, self.stdin, please_stop=self.service_stopped, parent_thread=self),
                Thread.run(self.name + " stdout", self._reader, "stdout", service.stdout, self.stdout, please_stop=self.service_stopped, parent_thread=self),
                Thread.run(self.name + " stderr", self._reader, "stderr", service.stderr, self.stderr, please_stop=self.service_stopped, parent_thread=self),
                Thread.run(self.name + " waiter", self._monitor, parent_thread=self),
            ]
        except Exception as e:
            Log.error("Can not call", e)

        if self.debug:
            Log.note("{{process}} START: {{command}}", process=self.name, command=" ".join(map(strings.quote, params)))
Example #15
    def unexpected(
        cls,
        template,
        default_params={},
        cause=None,
        stack_depth=0,
        log_context=None,
        **more_params
    ):
        """
        :param template: *string* human readable string with placeholders for parameters
        :param default_params: *dict* parameters to fill in template
        :param cause: *Exception* for chaining
        :param stack_depth:  *int* how many calls you want popped off the stack to report the *true* caller
        :param log_context: *dict* extra key:value pairs for your convenience
        :param more_params: *any* more parameters (which will overwrite default_params)
        :return:
        """
        if isinstance(default_params, BaseException):
            cause = default_params
            default_params = {}

        params = dict(unwrap(default_params), **more_params)

        if cause and not isinstance(cause, Except):
            cause = Except(exceptions.UNEXPECTED, text_type(cause), trace=exceptions._extract_traceback(0))

        trace = exceptions.extract_stack(1)
        e = Except(type=exceptions.UNEXPECTED, template=template, params=params, cause=cause, trace=trace)
        Log.note(
            "{{error}}",
            error=e,
            log_context=set_default({"context": exceptions.WARNING}, log_context),
            stack_depth=stack_depth + 1
        )
Example #16
        def post(sql):
            # FIND OUT THE default DOMAIN SIZES
            result = self.db.column_query(sql)
            num_edges = len(edges)
            for e, edge in enumerate(edges):
                domain = edge.domain
                if domain.type == "default":
                    domain.type = "set"
                    parts = set(result[e])
                    domain.partitions = [{"index": i, "value": p} for i, p in enumerate(parts)]
                    domain.map = {p: i for i, p in enumerate(parts)}
                else:
                    Log.error("Do not know what to do here, yet")

            # FILL THE DATA CUBE
            maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)]
            cubes = FlatList()
            for c, s in enumerate(select):
                data = Matrix(*[len(e.domain.partitions) + (1 if e.allow_nulls else 0) for e in edges])
                for rownum, value in enumerate(result[c + num_edges]):
                    coord = [m[r[rownum]] for m, r in maps]
                    data[coord] = value
                cubes.append(data)

            if isinstance(query.select, list):
                return cubes
            else:
                return cubes[0]
Example #17
    def get(self, key):
        """
        simple `select`
        """
        if not Log:
            _late_import()

        return FlatList(vals=[unwrap(coalesce(_datawrap(v), Null)[key]) for v in _get_list(self)])
Example #18
    def __setattr__(self, key, value):
        d = self._internal_dict
        value = unwrap(value)
        if value is None:
            d = self._internal_dict
            d.pop(key, None)
        else:
            d[key] = value
        return self
Example #19
    def call(self, proc_name, params):
        self._execute_backlog()
        params = [unwrap(v) for v in params]
        try:
            self.cursor.callproc(proc_name, params)
            self.cursor.close()
            self.cursor = self.db.cursor()
        except Exception as e:
            Log.error("Problem calling procedure " + proc_name, e)
Example #20
def _replace_ref(node, url):
    if url.path.endswith("/"):
        url.path = url.path[:-1]

    if isinstance(node, Mapping):
        ref = None
        output = {}
        for k, v in node.items():
            if k == "$ref":
                ref = URL(v)
            else:
                output[k] = _replace_ref(v, url)

        if not ref:
            return output

        node = output

        if not ref.scheme and not ref.path:
            # DO NOT TOUCH LOCAL REF YET
            output["$ref"] = ref
            return output

        if not ref.scheme:
            # SCHEME RELATIVE IMPLIES SAME PROTOCOL AS LAST TIME, WHICH
            # REQUIRES THE CURRENT DOCUMENT'S SCHEME
            ref.scheme = url.scheme

        # FIND THE SCHEME AND LOAD IT
        if ref.scheme in scheme_loaders:
            new_value = scheme_loaders[ref.scheme](ref, url)
        else:
            raise Log.error("unknown protocol {{scheme}}", scheme=ref.scheme)

        if ref.fragment:
            new_value = mo_dots.get_attr(new_value, ref.fragment)

        DEBUG and Log.note("Replace {{ref}} with {{new_value}}", ref=ref, new_value=new_value)

        if not output:
            output = new_value
        elif isinstance(output, text_type):
            Log.error("Can not handle set_default({{output}},{{new_value}})", output=output, new_value=new_value)
        else:
            output = unwrap(set_default(output, new_value))

        DEBUG and Log.note("Return {{output}}", output=output)

        return output
    elif isinstance(node, list):
        output = [_replace_ref(n, url) for n in node]
        # if all(p[0] is p[1] for p in zip(output, node)):
        #     return node
        return output

    return node
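A hedged illustration, assuming a registered "file" scheme loader:

# node = {"db": {"$ref": "file:///etc/app/db.json", "timeout": 30}}
# _replace_ref loads db.json, then set_default() layers the sibling keys
# ({"timeout": 30}) over the loaded value; a bare {"$ref": ...} with no
# siblings is replaced outright by the loaded value.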
Example #21
def reverse(vals):
    # TODO: Test how to do this fastest
    l = len(vals)
    output = [None] * l

    for v in unwrap(vals):
        l -= 1
        output[l] = v

    return wrap(output)
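A quick sketch:

reverse([1, 2, 3])
# -> wrapped [3, 2, 1]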
Example #22
def argparse(defs):
    parser = _argparse.ArgumentParser()
    for d in listwrap(defs):
        args = d.copy()
        name = args.name
        args.name = None
        parser.add_argument(*unwrap(listwrap(name)), **args)
    namespace = parser.parse_args()
    output = {k: getattr(namespace, k) for k in vars(namespace)}
    return wrap(output)
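A usage sketch with a hypothetical definition; each item's keys mirror ArgumentParser.add_argument arguments, plus `name` for the flag itself:

defs = wrap([{"name": "--debug", "action": "store_true", "help": "verbose output"}])
settings = argparse(defs)
# settings.debug -> True when --debug was passed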
Example #23
def assertAlmostEqual(test, expected, digits=None, places=None, msg=None, delta=None):
    show_detail = True
    test = unwrap(test)
    expected = unwrap(expected)
    try:
        if test is None and expected is None:
            return
        elif isinstance(test, UniqueIndex):
            if test ^ expected:
                Log.error("Sets do not match")
        elif isinstance(expected, Mapping) and isinstance(test, Mapping):
            for k, v2 in unwrap(expected).items():
                v1 = test.get(k)
                assertAlmostEqual(v1, v2, msg=msg, digits=digits, places=places, delta=delta)
        elif isinstance(expected, Mapping):
            for k, v2 in expected.items():
                if isinstance(k, basestring):
                    v1 = mo_dots.get_attr(test, literal_field(k))
                else:
                    v1 = test[k]
                assertAlmostEqual(v1, v2, msg=msg, digits=digits, places=places, delta=delta)
        elif isinstance(test, (set, list)) and isinstance(expected, set):
            test = set(test)
            if len(test) != len(expected):
                Log.error(
                    "Sets do not match, element count different:\n{{test|json|indent}}\nexpecting{{expectedtest|json|indent}}",
                    test=test,
                    expected=expected
                )

            for e in expected:
                for t in test:
                    try:
                        assertAlmostEqual(t, e, msg=msg, digits=digits, places=places, delta=delta)
                        break
                    except Exception as _:
                        pass
                else:
                    Log.error("Sets do not match. {{value|json}} not found in {{test|json}}", value=e, test=test)

        elif isinstance(expected, types.FunctionType):
            return expected(test)
Example #24
def select(data, field_name):
    """
    return list with values from field_name
    """
    if isinstance(data, Cube):
        return data._select(_normalize_selects(field_name))

    if isinstance(data, PartFlatList):
        return data.select(field_name)

    if isinstance(data, UniqueIndex):
        data = (
            data._data.values()
        )  # THE SELECT ROUTINE REQUIRES dicts, NOT Data WHILE ITERATING

    if is_data(data):
        return select_one(data, field_name)

    if is_data(field_name):
        field_name = wrap(field_name)
        if field_name.value in ["*", "."]:
            return data

        if field_name.value:
            # SIMPLIFY {"value":value} AS STRING
            field_name = field_name.value

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        path = split_field(field_name)
        if len(path) == 1:
            return FlatList([d[field_name] for d in data])
        else:
            output = FlatList()
            flat_list._select1(data, path, 0, output)
            return output
    elif is_list(field_name):
        keys = [_select_a_field(wrap(f)) for f in field_name]
        return _select(Data(), unwrap(data), keys, 0)
    else:
        keys = [_select_a_field(field_name)]
        return _select(Data(), unwrap(data), keys, 0)
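A usage sketch of the dotted-path branch:

select([{"a": {"b": 1}}, {"a": {"b": 2}}], "a.b")
# -> [1, 2]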
Example #25
def params_pack(params, *args):
    settings = {}
    for a in args:
        for k, v in a.items():
            k = text_type(k)
            if k in settings:
                continue
            settings[k] = v

    output = {str(k): unwrap(settings[k]) for k in params if k in settings}
    return output
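A semantics sketch: the first dict to supply a key wins, and only keys named in `params` survive:

params_pack(
    ["host", "port"],
    {"host": "db1"},
    {"host": "db2", "port": 5432},
)
# -> {"host": "db1", "port": 5432}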
Example #26
    def __init__(self, name, data, schema=None):
        # TODO: STORE THIS LIKE A CUBE FOR FASTER ACCESS AND TRANSFORMATION
        data = list(unwrap(data))
        Container.__init__(self)
        if schema == None:
            self._schema = get_schema_from_list(name, data)
        else:
            self._schema = schema
        self.name = name
        self.data = data
        self.locker = Lock()  # JUST IN CASE YOU WANT TO DO MORE THAN ONE THING
Example #27
    def __setitem__(self, i, y):
        try:
            _list = _get_list(self)
            if i >= len(_list):
                # PAD WITH Nones SO INDEX i EXISTS
                for _ in range(len(_list), i + 1):
                    _list.append(None)
            _list[i] = unwrap(y)
        except Exception as e:
            if not Log:
                _late_import()
            Log.error("problem", cause=e)
Example #28
def sort(data, fieldnames=None, already_normalized=False):
    """
    PASS A FIELD NAME, OR LIST OF FIELD NAMES, OR LIST OF STRUCTS WITH {"field":field_name, "sort":direction}
    """
    try:
        if data == None:
            return Null

        if not fieldnames:
            return wrap(sort_using_cmp(data, value_compare))

        if already_normalized:
            formal = fieldnames
        else:
            formal = query._normalize_sort(fieldnames)

        funcs = [(jx_expression_to_function(f.value), f.sort) for f in formal]

        def comparer(left, right):
            for func, sort_ in funcs:
                try:
                    result = value_compare(func(left), func(right), sort_)
                    if result != 0:
                        return result
                except Exception as e:
                    Log.error("problem with compare", e)
            return 0

        if is_list(data):
            output = FlatList([unwrap(d) for d in sort_using_cmp(data, cmp=comparer)])
        elif hasattr(data, "__iter__"):
            output = FlatList(
                [unwrap(d) for d in sort_using_cmp(list(data), cmp=comparer)]
            )
        else:
            Log.error("Do not know how to handle")
            output = None

        return output
    except Exception as e:
        Log.error("Problem sorting\n{{data}}", data=data, cause=e)
Example #29
def reverse(vals):
    # TODO: Test how to do this fastest
    if not hasattr(vals, "len"):
        vals = list(vals)
    l = len(vals)
    output = [None] * l

    for v in unwrap(vals):
        l -= 1
        output[l] = v

    return wrap(output)
Example #30
def argparse(defs):
    parser = _ArgParser()
    for d in listwrap(defs):
        args = d.copy()
        name = args.name
        args.name = None
        parser.add_argument(*unwrap(listwrap(name)), **args)
    namespace, unknown = parser.parse_known_args()
    if unknown:
        Log.warning("Ignoring arguments: {{unknown|json}}", unknown=unknown)
    output = {k: getattr(namespace, k) for k in vars(namespace)}
    return wrap(output)
Example #31
def _iadd(self, other):
    if _get(other, CLASS) not in data_types:
        get_logger().error("Expecting a Mapping")
    d = unwrap(self)
    for ok, ov in other.items():
        sv = d.get(ok)
        if sv == None:
            d[ok] = deepcopy(ov)
        elif isinstance(ov, (Decimal, float, long, int)):
            if _get(sv, CLASS) in data_types:
                get_logger().error(
                    "can not add {{stype}} with {{otype}",
                    stype=_get(sv, CLASS).__name__,
                    otype=_get(ov, CLASS).__name__
                )
            elif is_list(sv):
                d[ok].append(ov)
            else:
                d[ok] = sv + ov
        elif is_list(ov):
            d[ok] = listwrap(sv) + ov
        elif _get(ov, CLASS) in data_types:
            if _get(sv, CLASS) in data_types:
                _iadd(sv, ov)
            elif is_list(sv):
                d[ok].append(ov)
            else:
                get_logger().error(
                    "can not add {{stype}} with {{otype}",
                    stype=_get(sv, CLASS).__name__,
                    otype=_get(ov, CLASS).__name__
                )
        else:
            if _get(sv, CLASS) in data_types:
                get_logger().error(
                    "can not add {{stype}} with {{otype}",
                    stype=_get(sv, CLASS).__name__,
                    otype=_get(ov, CLASS).__name__
                )
            else:
                d[ok].append(ov)
    return self
Example #32
def _iadd(self, other):
    if not isinstance(other, Mapping):
        get_logger().error("Expecting a Mapping")
    d = unwrap(self)
    for ok, ov in other.items():
        sv = d.get(ok)
        if sv == None:
            d[ok] = deepcopy(ov)
        elif isinstance(ov, (Decimal, float, long, int)):
            if isinstance(sv, Mapping):
                get_logger().error(
                    "can not add {{stype}} with {{otype}",
                    stype=sv.__class__.__name__,
                    otype=ov.__class__.__name__
                )
            elif isinstance(sv, list):
                d[ok].append(ov)
            else:
                d[ok] = sv + ov
        elif isinstance(ov, list):
            d[ok] = listwrap(sv) + ov
        elif isinstance(ov, Mapping):
            if isinstance(sv, Mapping):
                _iadd(sv, ov)
            elif isinstance(sv, list):
                d[ok].append(ov)
            else:
                get_logger().error(
                    "can not add {{stype}} with {{otype}",
                    stype=sv.__class__.__name__,
                    otype=ov.__class__.__name__
                )
        else:
            if isinstance(sv, Mapping):
                get_logger().error(
                    "can not add {{stype}} with {{otype}",
                    stype=sv.__class__.__name__,
                    otype=ov.__class__.__name__
                )
            else:
                d[ok].append(ov)
    return self
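A merge-semantics sketch, assuming mo_dots wrap/unwrap: numbers add, lists concatenate, nested mappings recurse:

a = wrap({"n": 1, "tags": ["x"], "sub": {"m": 1}})
_iadd(a, {"n": 2, "tags": ["y"], "sub": {"m": 2}})
# a -> {"n": 3, "tags": ["x", "y"], "sub": {"m": 3}}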
Example #33
    def __init__(
        self,
        host,
        user=None,
        port=None,
        config=None,
        gateway=None,
        forward_agent=None,
        connect_timeout=None,
        connect_kwargs=None,
        inline_ssh_env=None,
        key_filename=None,  # part of connect_kwargs
        kwargs=None,
    ):
        connect_kwargs = set_default(
            {}, connect_kwargs, {"key_filename": File(key_filename).abspath}
        )

        self.stdout = LogStream(host, "stdout")
        self.stderr = LogStream(host, "stderr")
        config = Config(**unwrap(set_default(
            {},
            config,
            {"overrides": {"run": {
                # "hide": True,
                "out_stream": self.stdout,
                "err_stream": self.stderr,
            }}},
        )))

        self.warn = False
        self.conn = _Connection(
            host,
            user,
            port,
            config,
            gateway,
            forward_agent,
            connect_timeout,
            connect_kwargs,
            inline_ssh_env,
        )
Example #34
    def __init__(
            self,
            host,
            index,
            type=None,
            alias=None,
            name=None,
            port=9200,
            read_only=True,
            timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
            wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
            typed=None,
            kwargs=None):
        Container.__init__(self, None)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = coalesce(name, alias, index)
        if read_only:
            self._es = elasticsearch.Alias(alias=coalesce(alias, index),
                                           kwargs=kwargs)
        else:
            self._es = elasticsearch.Cluster(kwargs=kwargs).get_index(
                read_only=read_only, kwargs=kwargs)

        self.meta = FromESMetadata(kwargs=kwargs)
        self.settings.type = self._es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self.meta.get_columns(
            table_name=coalesce(name, alias, index))
        self._schema = Schema(coalesce(name, alias, index), columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = any(c.es_column.find(".$") != -1 for c in columns)
        else:
            self.typed = typed
Example #35
def drill_filter(esfilter, data):
    """
    PARTIAL EVALUATE THE FILTER BASED ON DATA GIVEN

    TODO: FIX THIS MONUMENTALLY BAD IDEA
    """
    esfilter = unwrap(esfilter)
    primary_nested = []  # track if nested, changes if not
    primary_column = []  # only one path allowed
    primary_branch = []  # CONTAINS LISTS OF RECORDS TO ITERATE: constantly changing as we dfs the tree

    def parse_field(fieldname, data, depth):
        """
        RETURN (first, rest) OF fieldname
        """
        col = split_field(fieldname)
        d = data
        for i, c in enumerate(col):
            try:
                d = d[c]
            except Exception as e:
                Log.error("{{name}} does not exist", name=fieldname)
            if isinstance(d, list) and len(col) > 1:
                if len(primary_column) <= depth + i:
                    primary_nested.append(True)
                    primary_column.append(c)
                    primary_branch.append(d)
                elif primary_nested[depth] and primary_column[depth + i] != c:
                    Log.error("only one branch of tree allowed")
                else:
                    primary_nested[depth + i] = True
                    primary_column[depth + i] = c
                    primary_branch[depth + i] = d

                return c, join_field(col[i + 1:])
            else:
                if len(primary_column) <= depth + i:
                    primary_nested.append(False)
                    primary_column.append(c)
                    primary_branch.append([d])
        return fieldname, None
Example #36
def leaves(value, prefix=None):
    """
    LIKE items() BUT RECURSIVE, AND ONLY FOR THE LEAVES (non dict) VALUES
    SEE wrap_leaves FOR THE INVERSE

    :param value: THE Mapping TO TRAVERSE
    :param prefix:  OPTIONAL PREFIX GIVEN TO EACH KEY
    :return: Data, WHICH EACH KEY BEING A PATH INTO value TREE
    """
    prefix = coalesce(prefix, "")
    output = []
    for k, v in value.items():
        try:
            if isinstance(v, Mapping):
                output.extend(leaves(v, prefix=prefix + literal_field(k) + "."))
            else:
                output.append((prefix + literal_field(k), unwrap(v)))
        except Exception as e:
            get_logger().error("Do not know how to handle", cause=e)
    return output
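A quick sketch:

leaves({"build": {"branch": "central", "rev": "abc"}})
# -> [("build.branch", "central"), ("build.rev", "abc")]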
Example #37
    def __init__(
        self,
        host,
        index,
        type=None,
        name=None,
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None
    ):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = name = coalesce(name, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
            self.typed = typed
Example #38
def es_aggsop(es, frum, query):
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    query_path = schema.query_path[0]
    selects = listwrap(query.select)

    acc, decoders, es_query = build_es_query(selects, query_path, schema,
                                             query)

    with Timer("ES query time", verbose=DEBUG) as es_duration:
        result = es.search(es_query)

    # Log.note("{{result}}", result=result)

    try:
        format_time = Timer("formatting", verbose=DEBUG)
        with format_time:
            # result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)
            # IT APPEARS THE OLD doc_count IS GONE
            aggs = unwrap(result.aggregations)

            edges_formatter, groupby_formatter, value_formatter, mime_type = agg_formatters[
                query.format]
            if query.edges:
                output = edges_formatter(aggs, acc, query, decoders, selects)
            elif query.groupby:
                output = groupby_formatter(aggs, acc, query, decoders, selects)
            else:
                output = value_formatter(aggs, acc, query, decoders, selects)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in agg_formatters:
            Log.error("Format {{format|quote}} not supported yet",
                      format=query.format,
                      cause=e)
        Log.error("Some problem", cause=e)
Example #39
    def __eq__(self, other):
        if self is other:
            return True

        d = _get(self, "_dict")
        if not isinstance(d, dict):
            return d == other

        if other == None:
            return False

        if not isinstance(other, Mapping):
            return False
        e = unwrap(other)
        for k, v in d.items():
            if e.get(k) != v:
                return False
        for k, v in e.items():
            if d.get(k) != v:
                return False
        return True
Example #40
    def __eq__(self, other):
        if self is other:
            return True

        d = self._internal_dict
        if _get(d, CLASS) is not dict:
            return d == other

        if not d and other == None:
            return False

        if _get(other, CLASS) not in data_types:
            return False
        e = unwrap(other)
        for k, v in d.items():
            if e.get(k) != v:
                return False
        for k, v in e.items():
            if d.get(k) != v:
                return False
        return True
Example #41
    def error(
            cls,
            template,  # human readable template
            default_params={},  # parameters for template
            cause=None,  # plausible cause
            stack_depth=0,
            **more_params):
        """
        raise an exception with a trace for the cause too

        :param template: *string* human readable string with placeholders for parameters
        :param default_params: *dict* parameters to fill in template
        :param cause: *Exception* for chaining
        :param stack_depth:  *int* how many calls you want popped off the stack to report the *true* caller
        :param log_context: *dict* extra key:value pairs for your convenience
        :param more_params: *any* more parameters (which will overwrite default_params)
        :return:
        """
        if not isinstance(template, unicode):
            sys.stderr.write("Log.error was expecting a unicode template")
            Log.error("Log.error was expecting a unicode template")

        if default_params and isinstance(
                listwrap(default_params)[0], BaseException):
            cause = default_params
            default_params = {}

        params = dict(unwrap(default_params), **more_params)

        add_to_trace = False
        cause = wrap(
            unwraplist(
                [Except.wrap(c, stack_depth=1) for c in listwrap(cause)]))
        trace = exceptions.extract_stack(stack_depth + 1)

        if add_to_trace:
            cause[0].trace.extend(trace[1:])

        e = Except(exceptions.ERROR, template, params, cause, trace)
        raise e
Example #42
    def warning(
        cls,
        template,
        default_params={},
        cause=None,
        stack_depth=0,
        log_context=None,
        **more_params
    ):
        """
        :param template: *string* human readable string with placeholders for parameters
        :param default_params: *dict* parameters to fill in template
        :param cause: *Exception* for chaining
        :param stack_depth:  *int* how many calls you want popped off the stack to report the *true* caller
        :param log_context: *dict* extra key:value pairs for your convenience
        :param more_params: *any* more parameters (which will overwrite default_params)
        :return:
        """
        if not isinstance(template, text_type):
            Log.error("Log.warning was expecting a unicode template")

        if isinstance(default_params, BaseException):
            cause = default_params
            default_params = {}

        if "values" in more_params.keys():
            Log.error("Can not handle a logging parameter by name `values`")
        params = dict(unwrap(default_params), **more_params)
        cause = unwraplist([Except.wrap(c) for c in listwrap(cause)])
        trace = exceptions.extract_stack(stack_depth + 1)

        e = Except(type=exceptions.WARNING, template=template, params=params, cause=cause, trace=trace)
        Log.note(
            "{{error|unicode}}",
            error=e,
            log_context=set_default({"context": exceptions.WARNING}, log_context),
            stack_depth=stack_depth + 1
        )
Example #43
    def add(self, val):
        val = datawrap(val)
        key = value2key(self._keys, val)
        if key == None:
            Log.error("Expecting key to be not None")

        d = self._data.get(key)
        if d is None:
            self._data[key] = unwrap(val)
            self.count += 1
        elif d is not val:
            if self.fail_on_dup:
                Log.error(
                    "{{new|json}} with key {{key|json}} already filled with {{old|json}}",
                    key=key,
                    new=val,
                    old=self[val])
            elif DEBUG:
                Log.warning(
                    "key {{key|json}} already filled\nExisting\n{{existing|json|indent}}\nValue\n{{value|json|indent}}",
                    key=key,
                    existing=d,
                    value=val)
Example #44
    def _insert(self, collection):
        for nested_path, details in collection.items():
            active_columns = wrap(list(details.active_columns))
            rows = details.rows
            num_rows = len(rows)
            table_name = concat_field(self.name, nested_path)

            if table_name == self.name:
                # DO NOT REQUIRE PARENT OR ORDER COLUMNS
                meta_columns = [GUID, UID]
            else:
                meta_columns = [UID, PARENT, ORDER]

            all_columns = meta_columns + active_columns.es_column  # ONLY THE PRIMITIVE VALUE COLUMNS
            command = ConcatSQL(
                SQL_INSERT, quote_column(table_name),
                sql_iso(sql_list(map(quote_column, all_columns))), SQL_VALUES,
                sql_list(
                    sql_iso(
                        sql_list(quote_value(row.get(c)) for c in all_columns))
                    for row in unwrap(rows)))

            with self.db.transaction() as t:
                t.execute(command)
Example #45
    def _insert(self, collection):
        for nested_path, details in collection.items():
            active_columns = wrap(list(details.active_columns))
            rows = details.rows
            table_name = concat_field(self.sf.fact, nested_path)

            if table_name == self.sf.fact:
                # DO NOT REQUIRE PARENT OR ORDER COLUMNS
                meta_columns = [GUID, UID]
            else:
                meta_columns = [UID, PARENT, ORDER]

            all_columns = meta_columns + active_columns.es_column

            prefix = "INSERT INTO " + quote_table(table_name) + \
                     "(" + ",".join(map(quote_table, all_columns)) + ")"

            # BUILD THE RECORDS
            records = " UNION ALL ".join(
                "\nSELECT " +
                ",".join(quote_value(row.get(c)) for c in all_columns)
                for row in unwrap(rows))

            self.db.execute(prefix + records)
Example #46
def tuple(data, field_name):
    """
    RETURN LIST  OF TUPLES
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")

    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if is_data(field_name) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        if len(split_field(field_name)) == 1:
            return [(d[field_name], ) for d in data]
        else:
            path = split_field(field_name)
            output = []
            for d in data:
                for p in path:
                    d = _getdefault(d, p)
                output.append((d, ))
            return output
    elif is_list(field_name):
        paths = [_select_a_field(f) for f in field_name]
        output = FlatList()
        _tuple((), unwrap(data), paths, 0, output)
        return output
    else:
        paths = [_select_a_field(field_name)]
        output = FlatList()
        _tuple((), data, paths, 0, output)
        return output
Example #47
    if retry == None:
        retry = Data(times=1, sleep=0)
    elif isinstance(retry, Number):
        retry = Data(times=retry, sleep=1)
    else:
        retry = wrap(retry)
        if isinstance(retry.sleep, Duration):
            retry.sleep = retry.sleep.seconds
        set_default(retry, {"times": 1, "sleep": 0})

    if b'json' in kwargs:
        kwargs[b'data'] = convert.value2json(kwargs[b'json']).encode("utf8")
        del kwargs[b'json']

    try:
        headers = kwargs[b"headers"] = unwrap(
            coalesce(wrap(kwargs)[b"headers"], {}))
        set_default(headers, {b"accept-encoding": b"compress, gzip"})

        if zip and len(coalesce(kwargs.get(b"data"))) > 1000:
            compressed = convert.bytes2zip(kwargs[b"data"])
            headers[b'content-encoding'] = b'gzip'
            kwargs[b"data"] = compressed

            _to_ascii_dict(headers)
        else:
            _to_ascii_dict(headers)
    except Exception as e:
        Log.error("Request setup failure on {{url}}", url=url, cause=e)

    errors = []
    for r in range(retry.times):
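A normalization sketch for the `retry` argument handled above:

# retry=None         -> Data(times=1, sleep=0)
# retry=3 (a Number) -> Data(times=3, sleep=1)
# retry={"times": 5} -> wrapped; missing keys filled by set_default,
#                       and Duration sleeps converted to seconds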
Example #48
def to_python(self, not_null=False, boolean=False, many=False):
    return text_type(repr(unwrap(json2value(self.json))))
Example #49
def get_selects(query):
    schema = query.frum.schema
    split_select = {".": ESSelect(".")}

    def get_select(path):
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var)
                )
                if c.jx_type == NESTED:
                    get_select(".").set_op = True
                    new_select.append(
                        {
                            "name": full_name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": literal_field(full_name),
                                "index": put_index,
                                "child": ".",
                            },
                            "pull": get_pull_source(c.es_column),
                        }
                    )
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append(
                        {
                            "name": full_name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": literal_field(full_name),
                                "index": put_index,
                                "child": ".",
                            },
                        }
                    )
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select(".").set_op = True
                new_select.append(
                    {
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": put_index, "child": "."},
                        "pull": get_pull_source("."),
                    }
                )
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select(".").set_op = True
                    for c in leaves:
                        if (
                            len(c.nested_path) == 1
                        ):  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(
                                decode_property(n) for n in split_field(c.name)
                            )
                            new_select.append(
                                {
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": untype_path(
                                            relative_field(pre_child, s_column)
                                        ),
                                    },
                                    "pull": get_pull_source(c.es_column),
                                }
                            )
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": ".",
                                        },
                                        "pull": lambda row: row._id,
                                    }
                                )
                            elif c.jx_type == NESTED:
                                get_select(".").set_op = True
                                pre_child = join_field(
                                    decode_property(n) for n in split_field(c.name)
                                )
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": untype_path(
                                                relative_field(pre_child, s_column)
                                            ),
                                        },
                                        "pull": get_pull_source(c.es_column),
                                    }
                                )
                            else:
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(
                                    decode_property(n) for n in split_field(c.name)
                                )
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": untype_path(
                                                relative_field(pre_child, s_column)
                                            ),
                                        },
                                    }
                                )
                        else:
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(
                                untype_path(
                                    relative_field(c.name, schema.query_path[0])
                                ),
                                s_column,
                            )
                            pull = accumulate_nested_doc(
                                c_nested_path,
                                Variable(
                                    relative_field(s_column, unnest_path(c_nested_path))
                                ),
                            )
                            new_select.append(
                                {
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child,
                                    },
                                    "pull": pull,
                                }
                            )
            else:
                new_select.append(
                    {
                        "name": select.name,
                        "value": Variable("$dummy"),
                        "put": {"name": select.name, "index": put_index, "child": "."},
                    }
                )
            put_index += 1
        else:
            split_scripts = split_expression_by_path(
                select.value, schema, lang=Painless
            )
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {
                    "script": text(
                        Painless[first(script)].partial_eval().to_es_script(schema)
                    )
                }
                new_select.append(
                    {
                        "name": select.name,
                        "pull": jx_expression_to_function(
                            "fields." + literal_field(select.name)
                        ),
                        "put": {"name": select.name, "index": put_index, "child": "."},
                    }
                )
                put_index += 1
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select(".").set_op:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var))
                )
        else:
            Log.error("Do not know what to do")
    return new_select, split_select
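The entries collected in new_select follow a small contract that is easy to miss in the noise: "pull" extracts a raw value from an ES hit, and "put" names the output column ("index") and the sub-path within it ("child") that the value fills. A minimal self-contained sketch of that contract, with illustrative names:

hit = {"fields": {"a.b": [42]}}
select = {
    "name": "a",
    "put": {"name": "a", "index": 0, "child": "b"},
    "pull": lambda row: row["fields"]["a.b"][0],
}
output = [{}]
output[select["put"]["index"]][select["put"]["child"]] = select["pull"](hit)
print(output)  # [{'b': 42}]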
Example #50
0
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term,
                                                schema.edges)
                return output
            except Exception as e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = FlatList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    output.append({"terms": {k: v}})
                else:
                    if isinstance(edge, basestring):
                        # DIRECT FIELD REFERENCE
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception as e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if isinstance(fields, Mapping):
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif isinstance(
                            fields,
                            list) and len(fields) == 1 and is_variable_name(
                                fields[0]):
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        output.append({
                            "or":
                            [domain.getPartByKey(vv).esfilter for vv in v]
                        })
            return {"and": output}
        elif where["or"]:
            return {
                "or": [
                    unwrap(_where_terms(master, vv, schema))
                    for vv in where["or"]
                ]
            }
        elif where["and"]:
            return {
                "and": [
                    unwrap(_where_terms(master, vv, schema))
                    for vv in where["and"]
                ]
            }
        elif where["not"]:
            return {"not": unwrap(_where_terms(master, where["not"], schema))}
    return where
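For orientation, a self-contained sketch (bypassing the real schema machinery) of the multi-field dimension expansion in the terms branch above: each requested value becomes an AND over its ES fields, and the values are OR'ed together. The field names are hypothetical.

fields = {"product": "product.raw", "component": "component.raw"}
values = [{"product": "Firefox", "component": "General"}]

or_agg = []
for vv in values:
    and_agg = [
        {"term": {es_field: vv[local_field]}}
        for local_field, es_field in fields.items()
        if vv[local_field] != None
    ]
    or_agg.append({"and": and_agg})
print({"or": or_agg})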
Example #51
0
    def extend(self, values):
        for v in values:
            _get(self, "list").append(unwrap(v))
        return self
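A short usage sketch, assuming this is FlatList.extend from mo_dots (the import path is an assumption): values are stored unwrapped, so the backing list holds plain Python objects.

from mo_dots import FlatList, wrap  # ASSUMED IMPORT PATH

fl = FlatList()
fl.extend([wrap({"a": 1}), {"b": 2}])  # WRAPPED OR PLAIN, BOTH STORED UNWRAPPED
print(len(fl))  # 2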
Example #52
0
def pretty_json(value):
    try:
        if value is False:
            return "false"
        elif value is True:
            return "true"
        elif is_data(value):
            try:
                value = unwrap(value)
                items = sort_using_key(value.items(), lambda r: r[0])
                values = [
                    encode_basestring(k) + PRETTY_COLON + pretty_json(v)
                    for k, v in items if v != None
                ]
                if not values:
                    return "{}"
                elif len(values) == 1:
                    return "{" + values[0] + "}"
                else:
                    return "{\n" + ",\n".join(indent(v)
                                              for v in values) + "\n}"
            except Exception as e:
                from mo_logs import Log
                from mo_math import OR

                if OR(not is_text(k) for k in value.keys()):
                    Log.error("JSON must have string keys: {{keys}}:",
                              keys=[k for k in value.keys()],
                              cause=e)

                Log.error("problem making dict pretty: keys={{keys}}:",
                          keys=[k for k in value.keys()],
                          cause=e)
        elif value in (None, Null):
            return "null"
        elif value.__class__ in (binary_type, text):
            if is_binary(value):
                value = value.decode('utf8')
            try:
                if "\n" in value and value.strip():
                    return pretty_json({
                        "$concat": value.split("\n"),
                        "separator": "\n"
                    })
                else:
                    return quote(value)
            except Exception as e:
                from mo_logs import Log

                try:
                    Log.note(
                        "try explicit convert of string with length {{length}}",
                        length=len(value))
                    acc = [QUOTE]
                    for c in value:
                        try:
                            try:
                                c2 = ESCAPE_DCT[c]
                            except Exception:
                                c2 = c
                            c3 = text(c2)
                            acc.append(c3)
                        except BaseException:
                            pass
                            # Log.warning("odd character {{ord}} found in string.  Ignored.",  ord= ord(c)}, cause=g)
                    acc.append(QUOTE)
                    output = u"".join(acc)
                    Log.note("return value of length {{length}}",
                             length=len(output))
                    return output
                except BaseException as f:
                    Log.warning("can not convert {{type}} to json",
                                type=f.__class__.__name__,
                                cause=f)
                    return "null"
        elif is_list(value):
            if not value:
                return "[]"

            if ARRAY_MAX_COLUMNS == 1:
                return "[\n" + ",\n".join(
                    [indent(pretty_json(v)) for v in value]) + "\n]"

            if len(value) == 1:
                j = pretty_json(value[0])
                if j.find("\n") >= 0:
                    return "[\n" + indent(j) + "\n]"
                else:
                    return "[" + j + "]"

            js = [pretty_json(v) for v in value]
            max_len = max(len(j) for j in js)
            if max_len <= ARRAY_ITEM_MAX_LENGTH and max(
                    j.find("\n") for j in js) == -1:
                # ALL TINY VALUES
                num_columns = max(
                    1,
                    min(
                        ARRAY_MAX_COLUMNS,
                        int(
                            floor((ARRAY_ROW_LENGTH + 2.0) /
                                  float(max_len +
                                        2)))))  # +2 TO COMPENSATE FOR COMMAS
                if len(js) <= num_columns:  # DO NOT ADD \n IF ONLY ONE ROW
                    return "[" + PRETTY_COMMA.join(js) + "]"
                if num_columns == 1:  # DO NOT rjust IF THERE IS ONLY ONE COLUMN
                    return "[\n" + ",\n".join(
                        [indent(pretty_json(v)) for v in value]) + "\n]"

                content = ",\n".join(
                    PRETTY_COMMA.join(
                        j.rjust(max_len) for j in js[r:r + num_columns])
                    for r in range(0, len(js), num_columns))
                return "[\n" + indent(content) + "\n]"

            pretty_list = js

            output = ["[\n"]
            for i, p in enumerate(pretty_list):
                try:
                    if i > 0:
                        output.append(",\n")
                    output.append(indent(p))
                except Exception:
                    from mo_logs import Log

                    Log.warning(
                        "problem concatenating string of length {{len1}} and {{len2}}",
                        len1=len("".join(output)),
                        len2=len(p))
            output.append("\n]")
            try:
                return "".join(output)
            except Exception as e:
                from mo_logs import Log

                Log.error("not expected", cause=e)
        elif hasattr(value, '__data__'):
            d = value.__data__()
            return pretty_json(d)
        elif hasattr(value, '__json__'):
            j = value.__json__()
            if j == None:
                return "   null   "  # TODO: FIND OUT WHAT CAUSES THIS
            return pretty_json(json_decoder(j))
        elif scrub(value) is None:
            return "null"
        elif hasattr(value, '__iter__'):
            return pretty_json(list(value))
        elif hasattr(value, '__call__'):
            return "null"
        else:
            try:
                if int(value) == value:
                    return text(int(value))
            except Exception:
                pass

            try:
                if float(value) == value:
                    return text(float(value))
            except Exception:
                pass

            return pypy_json_encode(value)

    except Exception as e:
        problem_serializing(value, e)
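A usage sketch; the import is an assumption (pretty_json lives in mo_json.encoder in some releases). Keys are emitted sorted, and short arrays are packed onto one line.

from mo_json import pretty_json  # ASSUMED IMPORT PATH

print(pretty_json({"b": [1, 2, 3], "a": 1}))
# ROUGHLY:
# {
#     "a": 1,
#     "b": [1, 2, 3]
# }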
Example #53
0
def _normalize_select(select, frum, schema=None):
    """
    :param select: ONE SELECT COLUMN
    :param frum: TABLE TO get_columns()
    :param schema: SCHEMA TO LOOKUP NAMES FOR DEFINITIONS
    :return: AN ARRAY OF SELECT COLUMNS
    """
    if not _Column:
        _late_import()

    if isinstance(select, basestring):
        canonical = select = Data(value=select)
    else:
        select = wrap(select)
        canonical = select.copy()

    canonical.aggregate = coalesce(canonical_aggregates[select.aggregate].name,
                                   select.aggregate, "none")
    canonical.default = coalesce(
        select.default, canonical_aggregates[canonical.aggregate].default)

    if hasattr(unwrap(frum), "_normalize_select"):
        return frum._normalize_select(canonical)

    output = []
    if not select.value or select.value == ".":
        output.extend([
            set_default({
                "name": c.name,
                "value": jx_expression(c.name)
            }, canonical) for c in frum.get_leaves()
        ])
    elif isinstance(select.value, basestring):
        if select.value.endswith(".*"):
            base_name = select.value[:-2]
            canonical.name = coalesce(select.name, base_name, select.aggregate)
            value = jx_expression(base_name)
            if not isinstance(value, Variable):
                Log.error("`*` over general expression not supported yet")
                output.append([
                    set_default(
                        {
                            "name": base_name,
                            "value": LeavesOp("leaves", value),
                            "format": "dict"  # MARKUP FOR DECODING
                        },
                        canonical) for c in frum.get_columns()
                    if c.type not in STRUCT
                ])
            else:
                output.extend([
                    set_default(
                        {
                            "name":
                            base_name + "." +
                            literal_field(c.name[len(base_name) + 1:]),
                            "value":
                            jx_expression(c.name)
                        }, canonical) for c in frum.get_leaves()
                    if c.name.startswith(base_name + ".")
                ])
        else:
            canonical.name = coalesce(select.name, select.value,
                                      select.aggregate)
            canonical.value = jx_expression(select.value)
            output.append(canonical)

    output = wrap(output)
    if any(n == None for n in output.name):
        Log.error("expecting select to have a name: {{select}}", select=select)
    return output
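For orientation, a sketch of the output contract: every select normalizes to a record with name, value, aggregate, and default filled in. The field values below are illustrative; a real call needs a frum exposing get_leaves().

normalized = {
    "name": "a.b",        # coalesce(select.name, select.value, select.aggregate)
    "value": "a.b",       # BECOMES jx_expression("a.b"), I.E. A Variable
    "aggregate": "none",  # canonical_aggregates DEFAULT WHEN NONE GIVEN
    "default": None,      # canonical_aggregates[aggregate].default
}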
Example #54
0
    def query(self, query):
        """
        :param query:  JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
        :return:
        """
        if not startswith_field(query['from'], self.sf.fact):
            Log.error("Expecting table, or some nested table")
        frum, query['from'] = query['from'], self
        table = self.sf.tables[relative_field(frum, self.sf.fact)]
        schema = table.schema
        query = QueryOp.wrap(query, table=table, schema=schema)
        new_table = "temp_" + unique_name()

        if query.format == "container":
            create_table = "CREATE TABLE " + quote_column(new_table) + " AS "
        else:
            create_table = ""

        if query.groupby and query.format != "cube":
            op, index_to_columns = self._groupby_op(query, frum)
            command = create_table + op
        elif query.groupby:
            query.edges, query.groupby = query.groupby, query.edges
            op, index_to_columns = self._edges_op(query, frum)
            command = create_table + op
            query.edges, query.groupby = query.groupby, query.edges
        elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate):
            op, index_to_columns = self._edges_op(query, frum)
            command = create_table + op
        else:
            op = self._set_op(query, frum)
            return op

        result = self.db.query(command)

        if query.format == "container":
            output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True)
        elif query.format == "cube" or (not query.format and query.edges):
            column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
            for c in index_to_columns.values():
                column_names[c.push_column] = c.push_column_name

            if len(query.edges) == 0 and len(query.groupby) == 0:
                data = {n: Data() for n in column_names}
                for s in index_to_columns.values():
                    data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0]))
                if isinstance(query.select, list):
                    select = [{"name": s.name} for s in query.select]
                else:
                    select = {"name": query.select.name}

                return Data(
                    data=unwrap(data),
                    select=select,
                    meta={"format": "cube"}
                )

            if not result.data:
                edges = []
                dims = []
                for i, e in enumerate(query.edges + query.groupby):
                    allowNulls = coalesce(e.allowNulls, True)

                    if e.domain.type == "set" and e.domain.partitions:
                        domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                    elif e.domain.type == "range":
                        domain = e.domain
                    elif isinstance(e.value, TupleOp):
                        pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name],
                                        "push_child").pull
                        parts = [tuple(p(d) for p in pulls) for d in result.data]
                        domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                    else:
                        domain = SimpleSetDomain(partitions=[])

                    dims.append(1 if allowNulls else 0)
                    edges.append(Data(
                        name=e.name,
                        allowNulls=allowNulls,
                        domain=domain
                    ))

                data = {}
                for si, s in enumerate(listwrap(query.select)):
                    if s.aggregate == "count":
                        data[s.name] = Matrix(dims=dims, zeros=0)
                    else:
                        data[s.name] = Matrix(dims=dims)

                if isinstance(query.select, list):
                    select = [{"name": s.name} for s in query.select]
                else:
                    select = {"name": query.select.name}

                return Data(
                    meta={"format": "cube"},
                    edges=edges,
                    select=select,
                    data={k: v.cube for k, v in data.items()}
                )

            columns = None

            edges = []
            dims = []
            for g in query.groupby:
                g.is_groupby = True

            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif e.domain.type == "time":
                    domain = wrap(mo_json.scrub(e.domain))
                elif e.domain.type == "duration":
                    domain = wrap(mo_json.scrub(e.domain))
                elif isinstance(e.value, TupleOp):
                    pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    if not columns:
                        columns = list(zip(*result.data))  # MATERIALIZE: py3 zip IS AN ITERATOR
                    parts = set(columns[i])
                    if e.is_groupby and None in parts:
                        allowNulls = True
                    parts -= {None}

                    if query.sort[i].sort == -1:
                        domain = SimpleSetDomain(partitions=wrap(sorted(parts, reverse=True)))
                    else:
                        domain = SimpleSetDomain(partitions=jx.sort(parts))

                dims.append(len(domain.partitions) + (1 if allowNulls else 0))
                edges.append(Data(
                    name=e.name,
                    allowNulls=allowNulls,
                    domain=domain
                ))

            data_cubes = {}
            for si, s in enumerate(listwrap(query.select)):
                if s.aggregate == "count":
                    data_cubes[s.name] = Matrix(dims=dims, zeros=0)
                else:
                    data_cubes[s.name] = Matrix(dims=dims)

            r2c = index_to_coordinate(dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
            for rownum, row in enumerate(result.data):
                coord = r2c(rownum)

                for i, s in enumerate(index_to_columns.values()):
                    if s.is_edge:
                        continue
                    if s.push_child == ".":
                        data_cubes[s.push_name][coord] = s.pull(row)
                    else:
                        data_cubes[s.push_name][coord][s.push_child] = s.pull(row)

            if query.select == None:
                select = Null
            elif isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(
                meta={"format": "cube"},
                edges=edges,
                select=select,
                data={k: v.cube for k, v in data_cubes.items()}
            )
        elif query.format == "table" or (not query.format and query.groupby):
            column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
            for c in index_to_columns.values():
                column_names[c.push_column] = c.push_column_name
            data = []
            for d in result.data:
                row = [None for _ in column_names]
                for s in index_to_columns.values():
                    if s.push_child == ".":
                        row[s.push_column] = s.pull(d)
                    elif s.num_push_columns:
                        tuple_value = row[s.push_column]
                        if tuple_value == None:
                            tuple_value = row[s.push_column] = [None] * s.num_push_columns
                        tuple_value[s.push_child] = s.pull(d)
                    elif row[s.push_column] == None:
                        row[s.push_column] = Data()
                        row[s.push_column][s.push_child] = s.pull(d)
                    else:
                        row[s.push_column][s.push_child] = s.pull(d)
                data.append(tuple(unwrap(r) for r in row))

            output = Data(
                meta={"format": "table"},
                header=column_names,
                data=data
            )
        elif query.format == "list" or (not query.edges and not query.groupby):
            if not query.edges and not query.groupby and any(listwrap(query.select).aggregate):
                if isinstance(query.select, list):
                    data = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            if data[c.push_name] == None:
                                data[c.push_name] = c.pull(result.data[0])
                            elif isinstance(data[c.push_name], list):
                                data[c.push_name].append(c.pull(result.data[0]))
                            else:
                                data[c.push_name] = [data[c.push_name], c.pull(result.data[0])]
                        else:
                            data[c.push_name][c.push_child] = c.pull(result.data[0])

                    output = Data(
                        meta={"format": "value"},
                        data=data
                    )
                else:
                    data = Data()
                    for s in index_to_columns.values():
                        if not data[s.push_child]:
                            data[s.push_child] = s.pull(result.data[0])
                        else:
                            data[s.push_child] += [s.pull(result.data[0])]
                    output = Data(
                        meta={"format": "value"},
                        data=unwrap(data)
                    )
            else:
                data = []
                for d in result.data:
                    row = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            row[c.push_name] = c.pull(d)
                        elif c.num_push_columns:
                            tuple_value = row[c.push_name]
                            if not tuple_value:
                                tuple_value = row[c.push_name] = [None] * c.num_push_columns
                            tuple_value[c.push_child] = c.pull(d)
                        else:
                            row[c.push_name][c.push_child] = c.pull(d)

                    data.append(row)

                output = Data(
                    meta={"format": "list"},
                    data=data
                )
        else:
            Log.error("unknown format {{format}}", format=query.format)

        return output
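The cube branch leans on index_to_coordinate, which works because the database sorted the rows by the edges in order. A self-contained sketch of the idea (the real implementation may differ): row numbers are mixed-radix numbers over the edge sizes, with the last edge varying fastest.

def index_to_coordinate(dims):
    def r2c(rownum):
        coord = []
        for d in reversed(dims):
            rownum, digit = divmod(rownum, d)
            coord.append(digit)
        return tuple(reversed(coord))
    return r2c

r2c = index_to_coordinate([2, 3])
print([r2c(i) for i in range(6)])
# [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]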
Example #55
0
def _normalize_select(select, frum, schema=None):
    """
    :param select: ONE SELECT COLUMN
    :param frum: TABLE TO get_columns()
    :param schema: SCHEMA TO LOOKUP NAMES FOR DEFINITIONS
    :return: AN ARRAY OF SELECT COLUMNS
    """
    if not _Column:
        _late_import()

    if is_text(select):
        canonical = select = Data(value=select)
    else:
        select = wrap(select)
        canonical = select.copy()

    canonical.aggregate = coalesce(canonical_aggregates[select.aggregate].name,
                                   select.aggregate, "none")
    canonical.default = coalesce(
        select.default, canonical_aggregates[canonical.aggregate].default)

    if hasattr(unwrap(frum), "_normalize_select"):
        return frum._normalize_select(canonical)

    output = []

    if len(select) and not select.value:
        Log.error(BAD_SELECT, select=select)
    elif not select.value or select.value == ".":
        output.extend([
            set_default(
                {
                    "name": c.name,
                    "value": jx_expression(c.name, schema=schema)
                }, canonical) for c in frum.get_leaves()
        ])
    elif is_text(select.value):
        if select.value.endswith(".*"):
            canonical.name = coalesce(select.name, ".")
            value = jx_expression(select.value[:-2], schema=schema)
            if not is_op(value, Variable):
                Log.error("`*` over general expression not supported yet")
                output.append([
                    set_default(
                        {
                            "value": LeavesOp(value, prefix=select.prefix),
                            "format": "dict"  # MARKUP FOR DECODING
                        },
                        canonical) for c in frum.get_columns()
                    if c.jx_type not in STRUCT
                ])
            else:
                Log.error("do not know what to do")
        else:
            canonical.name = coalesce(select.name, select.value,
                                      select.aggregate)
            canonical.value = jx_expression(select.value, schema=schema)
            output.append(canonical)

    output = wrap(output)
    if any(n == None for n in output.name):
        Log.error("expecting select to have a name: {{select}}", select=select)
    return output
Example #56
0
def assertAlmostEqual(test, expected, digits=None, places=None, msg=None, delta=None):
    show_detail = True
    test = unwrap(test)
    expected = unwrap(expected)
    try:
        if test is None and expected is None:
            return
        elif test is expected:
            return
        elif is_text(expected):
            assertAlmostEqualValue(test, expected, msg=msg, digits=digits, places=places, delta=delta)
        elif isinstance(test, UniqueIndex):
            if test ^ expected:
                Log.error("Sets do not match")
        elif is_data(expected) and is_data(test):
            for k, v2 in unwrap(expected).items():
                v1 = test.get(k)
                assertAlmostEqual(v1, v2, msg=msg, digits=digits, places=places, delta=delta)
        elif is_data(expected):
            for k, v2 in expected.items():
                if is_text(k):
                    v1 = mo_dots.get_attr(test, literal_field(k))
                else:
                    v1 = test[k]
                assertAlmostEqual(v1, v2, msg=msg, digits=digits, places=places, delta=delta)
        elif is_container(test) and isinstance(expected, set):
            test = set(wrap(t) for t in test)
            if len(test) != len(expected):
                Log.error(
                    "Sets do not match, element count different:\n{{test|json|indent}}\nexpecting{{expectedtest|json|indent}}",
                    test=test,
                    expected=expected
                )

            for e in expected:
                for t in test:
                    try:
                        assertAlmostEqual(t, e, msg=msg, digits=digits, places=places, delta=delta)
                        break
                    except Exception as _:
                        pass
                else:
                    Log.error("Sets do not match. {{value|json}} not found in {{test|json}}", value=e, test=test)

        elif isinstance(expected, types.FunctionType):
            return expected(test)
        elif hasattr(test, "__iter__") and hasattr(expected, "__iter__"):
            if test.__class__.__name__ == "ndarray":  # numpy
                test = test.tolist()
            elif test.__class__.__name__ == "DataFrame":  # pandas
                test = test[test.columns[0]].values.tolist()
            elif test.__class__.__name__ == "Series":  # pandas
                test = test.values.tolist()

            if not expected and test == None:
                return
            if expected == None:
                expected = []  # REPRESENT NOTHING
            for a, b in zip_longest(test, expected):
                assertAlmostEqual(a, b, msg=msg, digits=digits, places=places, delta=delta)
        else:
            assertAlmostEqualValue(test, expected, msg=msg, digits=digits, places=places, delta=delta)
    except Exception as e:
        Log.error(
            "{{test|json|limit(10000)}} does not match expected {{expected|json|limit(10000)}}",
            test=test if show_detail else "[can not show]",
            expected=expected if show_detail else "[can not show]",
            cause=e
        )
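Two usage sketches, given the definition above (its helpers come from mo_testing and mo_dots in the original):

assertAlmostEqual({"a": [1, 2]}, {"a": [1, 2]})  # DEEP COMPARE; RETURNS QUIETLY
assertAlmostEqual(5, lambda t: t > 0)            # CALLABLE expected IS SIMPLY INVOKED ON test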
Example #57
0
        def _flatten(data,
                     uid,
                     parent_id,
                     order,
                     full_path,
                     nested_path,
                     row=None,
                     guid=None):
            """
            :param data: the data we are pulling apart
            :param uid: the uid we are giving this doc
            :param parent_id: the parent id of this (sub)doc
            :param order: the number of siblings before this one
            :param full_path: path to this (sub)doc
            :param nested_path: list of paths, deepest first
            :param row: we will be filling this
            :return:
            """
            table = concat_field(self.sf.fact, nested_path[0])
            insertion = doc_collection[nested_path[0]]
            if not row:
                row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order}
                insertion.rows.append(row)

            if not isinstance(data, Mapping):
                data = {".": data}
            for k, v in data.items():
                insertion = doc_collection[nested_path[0]]
                cname = concat_field(full_path, literal_field(k))
                value_type = get_type(v)
                if value_type is None:
                    continue

                if value_type in STRUCT:
                    c = unwraplist(
                        [cc for cc in abs_schema[cname] if cc.type in STRUCT])
                else:
                    c = unwraplist([
                        cc for cc in abs_schema[cname] if cc.type == value_type
                    ])

                if not c:
                    # WHAT IS THE NESTING LEVEL FOR THIS PATH?
                    deeper_nested_path = "."
                    for path, _ in nested_tables.items():
                        if startswith_field(
                                cname,
                                path) and len(deeper_nested_path) < len(path):
                            deeper_nested_path = path

                    c = Column(names={".": cname},
                               type=value_type,
                               es_column=typed_column(cname, value_type),
                               es_index=table,
                               nested_path=nested_path)
                    abs_schema.add(cname, c)
                    if value_type == "nested":
                        nested_tables[cname] = "fake table"

                    required_changes.append({"add": c})

                    # INSIDE IF BLOCK BECAUSE WE DO NOT WANT IT TO ADD WHAT WE columns.get() ALREADY
                    insertion.active_columns.add(c)
                elif c.type == "nested" and value_type == "object":
                    value_type = "nested"
                    v = [v]
                elif len(c.nested_path) < len(nested_path):
                    from_doc = doc_collection.get(c.nested_path[0], None)
                    column = c.es_column
                    from_doc.active_columns.remove(c)
                    abs_schema.remove(cname, c)
                    required_changes.append({"nest": (c, nested_path[0])})
                    deep_c = Column(names={".": cname},
                                    type=value_type,
                                    es_column=typed_column(cname, value_type),
                                    es_index=table,
                                    nested_path=nested_path)
                    abs_schema.add(cname, deep_c)
                    insertion.active_columns.add(deep_c)

                    for r in from_doc.rows:
                        r1 = unwrap(r)
                        if column in r1:
                            row1 = {
                                UID: self.next_uid(),
                                PARENT: r1["__id__"],
                                ORDER: 0,
                                column: r1[column]
                            }
                            insertion.rows.append(row1)

                elif len(c.nested_path) > len(nested_path):
                    insertion = doc_collection[c.nested_path[0]]
                    row = {UID: self.next_uid(), PARENT: uid, ORDER: order}
                    insertion.rows.append(row)

                # BE SURE TO NEST VALUES, IF NEEDED
                if value_type == "nested":
                    row[c.es_column] = "."
                    deeper_nested_path = [cname] + nested_path
                    insertion = doc_collection.get(cname, None)
                    if not insertion:
                        insertion = doc_collection[cname] = Data(
                            active_columns=set(), rows=[])
                    for i, r in enumerate(v):
                        child_uid = self.next_uid()
                        _flatten(r, child_uid, uid, i, cname,
                                 deeper_nested_path)
                elif value_type == "object":
                    row[c.es_column] = "."
                    _flatten(v,
                             uid,
                             parent_id,
                             order,
                             cname,
                             nested_path,
                             row=row)
                elif c.type:
                    row[c.es_column] = v
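To make the bookkeeping concrete, a sketch of what one nested document flattens into; the column and marker names are illustrative (real columns are typed via typed_column). Each nested array becomes child rows that point back at their parent through PARENT, with ORDER recording sibling position.

doc = {"a": 1, "b": [{"c": 2}, {"c": 3}]}
fact_rows = [{"__id__": 0, "a": 1, "b": "."}]  # "." MARKS "HAS NESTED ROWS"
nested_rows = [
    {"__id__": 1, "__parent__": 0, "__order__": 0, "b.c": 2},
    {"__id__": 2, "__parent__": 0, "__order__": 1, "b.c": 3},
]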
Example #58
0
def drill_filter(esfilter, data):
    """
    PARTIAL EVALUATE THE FILTER BASED ON DATA GIVEN

    TODO:  FIX THIS MONUMENTALLY BAD IDEA
    """
    esfilter = unwrap(esfilter)
    primary_nested = []  # track if nested, changes if not
    primary_column = []  # only one path allowed
    primary_branch = (
        []
    )  # CONTAINS LISTS OF RECORDS TO ITERATE: constantly changing as we dfs the tree

    def parse_field(fieldname, data, depth):
        """
        RETURN (first, rest) OF fieldname
        """
        col = split_field(fieldname)
        d = data
        for i, c in enumerate(col):
            try:
                d = d[c]
            except Exception as e:
                Log.error("{{name}} does not exist", name=fieldname)
            if is_list(d) and len(col) > 1:
                if len(primary_column) <= depth + i:
                    primary_nested.append(True)
                    primary_column.append(c)
                    primary_branch.append(d)
                elif primary_nested[depth] and primary_column[depth + i] != c:
                    Log.error("only one branch of tree allowed")
                else:
                    primary_nested[depth + i] = True
                    primary_column[depth + i] = c
                    primary_branch[depth + i] = d

                return c, join_field(col[i + 1:])
            else:
                if len(primary_column) <= depth + i:
                    primary_nested.append(False)
                    primary_column.append(c)
                    primary_branch.append([d])
        return fieldname, None

    def pe_filter(filter, data, depth):
        """
        PARTIAL EVALUATE THE filter BASED ON data GIVEN
        """
        if filter is TRUE:
            return True
        if filter is FALSE:
            return False

        filter = wrap(filter)

        if filter["and"]:
            result = True
            output = FlatList()
            for a in filter["and"]:
                f = pe_filter(a, data, depth)
                if f is False:
                    result = False
                elif f is not True:
                    output.append(f)
            if result and output:
                return {"and": output}
            else:
                return result
        elif filter["or"]:
            output = FlatList()
            for o in filter["or"]:
                f = pe_filter(o, data, depth)
                if f is True:
                    return True
                elif f is not False:
                    output.append(f)
            if output:
                return {"or": output}
            else:
                return False
        elif filter["not"]:
            f = pe_filter(filter["not"], data, depth)
            if f is True:
                return False
            elif f is False:
                return True
            else:
                return {"not": f}
        elif filter.term or filter.eq:
            eq = coalesce(filter.term, filter.eq)
            result = True
            output = {}
            for col, val in eq.items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d != val:
                        result = False
                else:
                    output[rest] = val

            if result and output:
                return {"term": output}
            else:
                return result
        elif filter.equal:
            a, b = filter["equal"]
            first_a, rest_a = parse_field(a, data, depth)
            first_b, rest_b = parse_field(b, data, depth)
            val_a = data[first_a]
            val_b = data[first_b]
            if not rest_a:
                if not rest_b:
                    if val_a != val_b:
                        return False
                    else:
                        return True
                else:
                    return {"term": {rest_b: val_a}}
            else:
                if not rest_b:
                    return {"term": {rest_a: val_b}}
                else:
                    return {"equal": [rest_a, rest_b]}

        elif filter.terms:
            result = True
            output = {}
            for col, vals in filter["terms"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d not in vals:
                        result = False
                else:
                    output[rest] = vals
            if result and output:
                return {"terms": output}
            else:
                return result

        elif filter.range:
            result = True
            output = {}
            for col, ranges in filter["range"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    for sign, val in ranges.items():
                        if sign in ("gt", ">") and d <= val:
                            result = False
                        if sign == "gte" and d < val:
                            result = False
                        if sign == "lte" and d > val:
                            result = False
                        if sign == "lt" and d >= val:
                            result = False
                else:
                    output[rest] = ranges
            if result and output:
                return {"range": output}
            else:
                return result
        elif filter.missing:
            if is_text(filter.missing):
                field = filter["missing"]
            else:
                field = filter["missing"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d == None:
                    return True
                return False
            else:
                return {"missing": rest}
        elif filter.prefix:
            result = True
            output = {}
            for col, val in filter["prefix"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d == None or not d.startswith(val):
                        result = False
                else:
                    output[rest] = val
            if result and output:
                return {"prefix": output}
            else:
                return result

        elif filter.exists:
            if is_text(filter["exists"]):
                field = filter["exists"]
            else:
                field = filter["exists"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d != None:
                    return True
                return False
            else:
                return {"exists": rest}
        else:
            Log.error("Can not interpret esfilter: {{esfilter}}",
                      {"esfilter": filter})

    output = []  # A LIST OF OBJECTS MAKING THROUGH THE FILTER

    def main(sequence, esfilter, row, depth):
        """
        RETURN A SEQUENCE OF REFERENCES OF OBJECTS DOWN THE TREE
        SHORT SEQUENCES MEANS ALL NESTED OBJECTS ARE INCLUDED
        """
        new_filter = pe_filter(esfilter, row, depth)
        if new_filter is True:
            seq = list(sequence)
            seq.append(row)
            output.append(seq)
            return
        elif new_filter is False:
            return

        seq = list(sequence)
        seq.append(row)
        for d in primary_branch[depth]:
            main(seq, new_filter, d, depth + 1)

    # OUTPUT
    for i, d in enumerate(data):
        if is_data(d):
            main([], esfilter, wrap(d), 0)
        else:
            Log.error("filter is expecting a dict, not {{type}}",
                      type=d.__class__)

    # AT THIS POINT THE primary_column[] IS DETERMINED
    # USE IT TO EXPAND output TO ALL NESTED OBJECTS
    max = 0  # EVEN THOUGH A ROW CAN HAVE MANY VALUES, WE ONLY NEED UP TO max
    for i, n in enumerate(primary_nested):
        if n:
            max = i + 1

    # OUTPUT IS A LIST OF ROWS,
    # WHERE EACH ROW IS A LIST OF VALUES SEEN DURING A WALK DOWN A PATH IN THE HIERARCHY
    uniform_output = FlatList()

    def recurse(row, depth):
        if depth == max:
            uniform_output.append(row)
        else:
            nested = row[-1][primary_column[depth]]
            if not nested:
                # PASSED FILTER, BUT NO CHILDREN, SO ADD NULL CHILDREN
                for i in range(depth, max):
                    row.append(None)
                uniform_output.append(row)
            else:
                for d in nested:
                    r = list(row)
                    r.append(d)
                    recurse(r, depth + 1)

    for o in output:
        recurse(o, 0)

    if not max:
        # SIMPLE LIST AS RESULT
        return wrap([unwrap(u[0]) for u in uniform_output])

    return PartFlatList(primary_column[0:max], uniform_output)
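A rough usage sketch for drill_filter: one nested branch is walked, and matching nested records are expanded into flat rows.

data = [{"a": 1, "b": [{"c": 2}, {"c": 5}]}]
result = drill_filter({"range": {"b.c": {"gte": 3}}}, data)
# ROUGHLY: THE ROWS FOR WHICH SOME b.c >= 3, AS A PartFlatList OVER ["b"]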
Example #59
0
def extractor(
    guid,
    num_partitions,
    esq,
    query,
    selects,
    query_path,
    schema,
    chunk_size,
    cardinality,
    abs_limit,
    formatter,
    please_stop,
):
    total = 0
    # WE MESS WITH THE QUERY LIMITS FOR CHUNKING
    query.limit = first(query.groupby).domain.limit = chunk_size * 2
    start_time = Date.now()

    try:
        write_status(
            guid,
            {
                "status": "starting",
                "chunks": num_partitions,
                "rows": min(abs_limit, cardinality),
                "start_time": start_time,
                "timestamp": Date.now(),
            },
        )

        with TempFile() as temp_file:
            with open(temp_file.abspath, "wb") as output:
                for i in range(0, num_partitions):
                    if please_stop:
                        Log.error("request to shutdown!")
                    is_last = i == num_partitions - 1
                    first(query.groupby).allowNulls = is_last
                    acc, decoders, es_query = aggop_to_es_queries(
                        selects, query_path, schema, query)
                    # REACH INTO THE QUERY TO SET THE partitions
                    terms = es_query.aggs._filter.aggs._match.terms
                    terms.include.partition = i
                    terms.include.num_partitions = num_partitions

                    result = esq.es.search(deepcopy(es_query), query.limit)
                    aggs = unwrap(result.aggregations)

                    formatter.add(aggs, acc, query, decoders, selects)
                    for b in formatter.bytes():
                        if b is DONE:
                            break
                        output.write(b)
                    else:
                        write_status(
                            guid,
                            {
                                "status": "working",
                                "chunk": i,
                                "chunks": num_partitions,
                                "row": total,
                                "rows": min(abs_limit, cardinality),
                                "start_time": start_time,
                                "timestamp": Date.now(),
                            },
                        )
                        continue
                    break
                for b in formatter.footer():
                    output.write(b)

            upload(guid + ".json", temp_file)
        write_status(
            guid,
            {
                "ok": True,
                "status": "done",
                "chunks": num_partitions,
                "rows": min(abs_limit, cardinality),
                "start_time": start_time,
                "end_time": Date.now(),
                "timestamp": Date.now(),
            },
        )
    except Exception as e:
        e = Except.wrap(e)
        write_status(
            guid,
            {
                "ok": False,
                "status": "error",
                "error": e,
                "start_time": start_time,
                "end_time": Date.now(),
                "timestamp": Date.now(),
            },
        )
        Log.warning("Could not extract", cause=e)
Example #60
0
    def add(self, val):
        key = value2key(self._keys, val)
        e = self._data.get(key, [])
        self._data[key] = e
        e.append(unwrap(val))
        self.count += 1
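A self-contained sketch of the keying idea (the real value2key also handles wrapped values and composite keys): a record reduces to a hashable tuple over the index's key fields, so equal-keyed records accumulate in one bucket.

def value2key(keys, val):
    # REDUCE A RECORD TO A HASHABLE TUPLE OVER THE INDEX'S KEY FIELDS
    return tuple(val.get(k) for k in keys)

buckets = {}
record = {"id": 1, "name": "x"}
buckets.setdefault(value2key(("id",), record), []).append(record)
print(buckets)  # {(1,): [{'id': 1, 'name': 'x'}]}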