Example #1
File: datas.py Project: rv404674/TUID
    def __getitem__(self, key):
        if key == None:
            return Null
        if key == ".":
            output = self._internal_dict
            if isinstance(output, Mapping):
                return self
            else:
                return output

        key = text_type(key)
        d = self._internal_dict

        if key.find(".") >= 0:
            seq = _split_field(key)
            for n in seq:
                if isinstance(d, NullType):
                    d = NullType(d, n)  # OH DEAR, Null TREATS n AS PATH, NOT LITERAL
                elif isinstance(d, list):
                    d = [_getdefault(dd, n) for dd in d]
                else:
                    d = _getdefault(d, n)  # EVERYTHING ELSE TREATS n AS LITERAL

            return wrap(d)
        else:
            o = d.get(key)

        if o == None:
            return NullType(d, key)
        return wrap(o)
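
For orientation: a minimal, dependency-free sketch of the same dotted-path lookup over plain dicts. get_path is a hypothetical stand-in; it drops the NullType chaining and wrap() calls of the real __getitem__ and keeps only the path walking.

def get_path(d, key):
    # WALK A DOTTED PATH INTO NESTED dicts; LISTS FAN OUT OVER THEIR ITEMS
    if key is None:
        return None
    if key == ".":
        return d
    for step in key.split("."):
        if isinstance(d, list):
            d = [dd.get(step) if isinstance(dd, dict) else None for dd in d]
        elif isinstance(d, dict):
            d = d.get(step)
        else:
            return None
    return d

doc = {"build": {"branch": "mozilla-inbound", "revision": "abc123"}}
assert get_path(doc, "build.branch") == "mozilla-inbound"
assert get_path(doc, "build.missing") is None
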
Example #2
File: rename.py Project: rv404674/TUID
 def convert(self, expr):
     """
     EXPAND INSTANCES OF name TO value
     """
     if expr is True or expr == None or expr is False:
         return expr
     elif Math.is_number(expr):
         return expr
     elif expr == ".":
         return "."
     elif is_variable_name(expr):
         return coalesce(self.dimensions[expr], expr)
     elif isinstance(expr, text_type):
         Log.error("{{name|quote}} is not a valid variable name", name=expr)
     elif isinstance(expr, Date):
         return expr
     elif isinstance(expr, QueryOp):
         return self._convert_query(expr)
     elif isinstance(expr, Mapping):
         if expr["from"]:
             return self._convert_query(expr)
         elif len(expr) >= 2:
             # ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
             return wrap({name: self.convert(value) for name, value in expr.leaves()})
         else:
             # ASSUME SINGLE-CLAUSE EXPRESSION
             k, v = expr.items()[0]
             return converter_map.get(k, self._convert_bop)(self, k, v)
     elif isinstance(expr, (list, set, tuple)):
         return wrap([self.convert(value) for value in expr])
     else:
         return expr
Example #3
    def query(self, sql, param=None, stream=False, row_tuples=False):
        """
        RETURN LIST OF dicts
        """
        if not self.cursor:  # REQUIRE ALL QUERIES TO RUN INSIDE A TRANSACTION
            Log.error("must perform all queries inside a transaction")
        self._execute_backlog()

        try:
            if param:
                sql = expand_template(sql, quote_param(param))
            sql = self.preamble + outdent(sql)
            self.debug and Log.note("Execute SQL:\n{{sql}}", sql=indent(sql))

            self.cursor.execute(sql)
            if row_tuples:
                if stream:
                    result = self.cursor
                else:
                    result = wrap(list(self.cursor))
            else:
                columns = [utf8_to_unicode(d[0]) for d in coalesce(self.cursor.description, [])]
                if stream:
                    result = (wrap({c: utf8_to_unicode(v) for c, v in zip(columns, row)}) for row in self.cursor)
                else:
                    result = wrap([{c: utf8_to_unicode(v) for c, v in zip(columns, row)} for row in self.cursor])

            return result
        except Exception as e:
            e = Except.wrap(e)
            if "InterfaceError" in e:
                Log.error("Did you close the db connection?", e)
            Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
Example #4
    def __init__(self, value):
        try:
            self.scheme = None
            self.host = None
            self.port = None
            self.path = ""
            self.query = ""
            self.fragment = ""

            if value == None:
                return

            if value.startswith("file://") or value.startswith("//"):
                # urlparse DOES NOT WORK IN THESE CASES
                scheme, suffix = value.split("//", 1)
                self.scheme = scheme.rstrip(":")
                parse(self, suffix, 0, 1)
                self.query = wrap(url_param2value(self.query))
            else:
                output = urlparse(value)
                self.scheme = output.scheme
                self.port = output.port
                self.host = output.netloc.split(":")[0]
                self.path = output.path
                self.query = wrap(url_param2value(output.query))
                self.fragment = output.fragment
        except Exception as e:
            Log.error("problem parsing {{value}} to URL", value=value, cause=e)
Example #5
    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        self.type = "range"
        self.NULL = Null

        if self.partitions:
            # IGNORE THE min, max, interval
            if not self.key:
                Log.error("Must have a key value")

            parts = listwrap(self.partitions)
            for i, p in enumerate(parts):
                self.min = MIN([self.min, p.min])
                self.max = MAX([self.max, p.max])
                if p.dataIndex != None and p.dataIndex != i:
                    Log.error("Expecting `dataIndex` to agree with the order of the parts")
                if p[self.key] == None:
                    Log.error("Expecting all parts to have {{key}} as a property", key=self.key)
                p.dataIndex = i

            # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
            for p, q in itertools.product(parts, parts):
                if p.min <= q.min and q.min < p.max and unwrap(p) is not unwrap(q):
                    Log.error("partitions overlap!")

            self.partitions = wrap(parts)
            return
        elif any([self.min == None, self.max == None, self.interval == None]):
            Log.error("Can not handle missing parameter")

        self.key = "min"
        self.partitions = wrap([{"min": v, "max": v + self.interval, "dataIndex": i} for i, v in enumerate(frange(self.min, self.max, self.interval))])
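
A sketch of the partitions that final line builds, with a plain-Python frange substitute (the real one is imported by the project); each part is half-open, covering [min, min + interval), and carries its dataIndex.

def frange(start, stop, step):
    # YIELD start, start+step, ... WHILE STRICTLY BELOW stop
    v = start
    while v < stop:
        yield v
        v += step

_min, _max, interval = 0, 10, 2.5
partitions = [
    {"min": v, "max": v + interval, "dataIndex": i}
    for i, v in enumerate(frange(_min, _max, interval))
]
print(partitions[0])    # {'min': 0, 'max': 2.5, 'dataIndex': 0}
print(len(partitions))  # 4
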
Example #6
def filter(data, where):
    """
    where  - a function that accepts (record, rownum, rows) and returns boolean
    """
    if len(data) == 0 or where == None or where == TRUE:
        return data

    if isinstance(data, Container):
        return data.filter(where)

    if is_container(data):
        temp = jx_expression_to_function(where)
        dd = wrap(data)
        return wrap([unwrap(d) for i, d in enumerate(data) if temp(wrap(d), i, dd)])
    else:
        Log.error(
            "Do not know how to handle type {{type}}", type=data.__class__.__name__
        )

    # NOTE: Log.error() ABOVE RAISES, SO THIS FALLBACK IS NOT REACHED
    try:
        return drill_filter(where, data)
    except Exception as _:
        # WOW!  THIS IS INEFFICIENT!
        return wrap(
            [unwrap(d) for d in drill_filter(where, [DataObject(d) for d in data])]
        )
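
Usage sketch for the plain-list path above, with where supplied as a Python callable over (record, rownum, rows); a jx expression (a dict such as {"eq": ...}) would instead be compiled by jx_expression_to_function.

rows = [{"ok": True, "n": 1}, {"ok": False, "n": 2}, {"ok": True, "n": 3}]

def where(record, rownum, rows):
    return record["ok"]

kept = [d for i, d in enumerate(rows) if where(d, i, rows)]
print(kept)  # [{'ok': True, 'n': 1}, {'ok': True, 'n': 3}]
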
Example #7
    def __getitem__(self, key):
        if key == None:
            return Null
        if key == ".":
            output = _get(self, "_dict")
            if isinstance(output, Mapping):
                return self
            else:
                return output

        if isinstance(key, str):
            key = key.decode("utf8")
        elif not isinstance(key, unicode):
            get_logger().error("only string keys are supported")

        d = _get(self, "_dict")

        if key.find(".") >= 0:
            seq = _split_field(key)
            for n in seq:
                if isinstance(d, NullType):
                    d = NullType(d, n)  # OH DEAR, Null TREATS n AS PATH, NOT LITERAL
                elif isinstance(d, list):
                    d = [_getdefault(dd, n) for dd in d]
                else:
                    d = _getdefault(d, n)  # EVERYTHING ELSE TREATS n AS LITERAL

            return wrap(d)
        else:
            o = d.get(key)

        if o == None:
            return NullType(d, key)
        return wrap(o)
Example #8
def _select_a_field(field):
    if isinstance(field, basestring):
        return wrap({"name": field, "value": split_field(field)})
    elif isinstance(wrap(field).value, basestring):
        field = wrap(field)
        return wrap({"name": field.name, "value": split_field(field.value)})
    else:
        return wrap({"name": field.name, "value": field.value})
Example #9
def _select_a_field(field):
    if is_text(field):
        return wrap({"name": field, "value": split_field(field)})
    elif is_text(wrap(field).value):
        field = wrap(field)
        return wrap({"name": field.name, "value": split_field(field.value)})
    else:
        return wrap({"name": field.name, "value": field.value})
Example #10
 def search(self, query):
     query = wrap(query)
     f = jx.get(query.query.filtered.filter)
     filtered = wrap([{"_id": i, "_source": d} for i, d in self.data.items() if f(d)])
     if query.fields:
         return wrap({"hits": {"total": len(filtered), "hits": [
             {"_id": d._id, "fields": unwrap(jx.select([unwrap(d._source)], query.fields)[0])}
             for d in filtered
         ]}})
     else:
         return wrap({"hits": {"total": len(filtered), "hits": filtered}})
Example #11
File: jx.py Project: rv404674/TUID
def run(query, container=Null):
    """
    THIS FUNCTION IS SIMPLY SWITCHING BASED ON THE query["from"] CONTAINER,
    BUT IT IS ALSO PROCESSING A list CONTAINER; SEPARATE TO A ListContainer
    """
    if container == None:
        container = wrap(query)['from']
        query_op = QueryOp.wrap(query, container=container, namespace=container.schema)
    else:
        query_op = QueryOp.wrap(query, container, container.namespace)

    if container == None:
        from jx_python.containers.list_usingPythonList import DUAL
        return DUAL.query(query_op)
    elif isinstance(container, Container):
        return container.query(query_op)
    elif isinstance(container, (list, set) + generator_types):
        container = wrap(list(container))
    elif isinstance(container, Cube):
        if is_aggs(query_op):
            return cube_aggs(container, query_op)
    elif isinstance(container, QueryOp):
        container = run(container)
    else:
        Log.error("Do not know how to handle {{type}}", type=container.__class__.__name__)

    if is_aggs(query_op):
        container = list_aggs(container, query_op)
    else:  # SETOP
        if query_op.where is not TRUE:
            container = filter(container, query_op.where)

        if query_op.sort:
            container = sort(container, query_op.sort, already_normalized=True)

        if query_op.select:
            container = select(container, query_op.select)

    if query_op.window:
        if isinstance(container, Cube):
            container = list(container.values())

        for param in query_op.window:
            window(container, param)

    # AT THIS POINT frum IS IN LIST FORMAT, NOW PACKAGE RESULT
    if query_op.format == "cube":
        container = convert.list2cube(container)
    elif query_op.format == "table":
        container = convert.list2table(container)
        container.meta.format = "table"
    else:
        container = wrap({
            "meta": {"format": "list"},
            "data": container
        })

    return container
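
A sketch of the final packaging step, showing the same rows in the "list" and "table" formats; convert.list2table is approximated with plain Python, so the exact column ordering is an assumption.

rows = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]

as_list = {"meta": {"format": "list"}, "data": rows}

header = sorted({k for r in rows for k in r})
as_table = {
    "meta": {"format": "table"},
    "header": header,
    "data": [[r.get(k) for k in header] for r in rows]
}
print(as_table)  # {'meta': {'format': 'table'}, 'header': ['a', 'b'], 'data': [[1, 2], [3, 4]]}
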
Example #12
        def iter(data, depth):
            if depth == 0:
                for v in data:
                    yield wrap(v)
                return

            for v in data.values():
                for v1 in iter(v, depth - 1):
                    yield wrap(v1)
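
The same depth-limited traversal over plain dicts and lists: depth counts how many Mapping levels to descend before yielding items from the leaf collections.

def iter_depth(data, depth):
    # AT depth == 0, data IS A LEAF COLLECTION; YIELD ITS ITEMS DIRECTLY
    if depth == 0:
        for v in data:
            yield v
        return
    for v in data.values():
        for v1 in iter_depth(v, depth - 1):
            yield v1

tree = {"x": {"a": [1, 2]}, "y": {"b": [3]}}
print(list(iter_depth(tree, 2)))  # [1, 2, 3]
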
Example #13
def simplify_esfilter(esfilter):
    try:
        output = wrap(_normalize(wrap(esfilter)))
        output.isNormal = None
        return output
    except Exception as e:
        from mo_logs import Log

        Log.unexpected("programmer error", cause=e)
Example #14
def _normalize_group(edge, dim_index, schema=None):
    """
    :param edge: Not normalized groupby 
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if isinstance(edge, basestring):
        if edge.endswith(".*"):
            prefix = edge[:-1]
            if schema:
                output = wrap([
                    {
                        "name": literal_field(k),
                        "value": jx_expression(k),
                        "allowNulls": True,
                        "domain": {"type": "default"}
                    }
                    for k, cs in schema.items()
                    if k.startswith(prefix)
                    for c in cs
                    if c.type not in STRUCT
                ])
                return output
            else:
                return wrap([{
                    "name": edge[:-2],
                    "value": jx_expression(edge[:-2]),
                    "allowNulls": True,
                    "dim":dim_index,
                    "domain": {"type": "default"}
                }])

        return wrap([{
            "name": edge,
            "value": jx_expression(edge),
            "allowNulls": True,
            "dim":dim_index,
            "domain": {"type": "default"}
        }])
    else:
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}",  edge= edge)

        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value),
            "allowNulls": True,
            "dim":dim_index,
            "domain": {"type": "default"}
        }])
Example #15
 def __data__(self):
     if first(self.schema.columns).name == '.':
         return wrap({
             "meta": {"format": "list"},
             "data": self.data
         })
     else:
         return wrap({
             "meta": {"format": "list"},
             "data": [{k: unwraplist(v) for k, v in row.items()} for row in self.data]
         })
Example #16
    def __init__(self, select, edges, data, frum=None):
        """
        data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER COLLECTIONS ARE
        ALLOWED, USING THE select AND edges TO DESCRIBE THE data
        """

        self.is_value = not is_list(select)
        self.select = select
        self.meta = Data(format="cube")       # PUT EXTRA MARKUP HERE
        self.is_none = False

        if not all(data.values()):
            self.is_none = True

        # ENSURE frum IS PROPER FORM
        if is_list(select):
            if edges and OR(not isinstance(v, Matrix) for v in data.values()):
                Log.error("Expecting data to be a dict with Matrix values")

        if not edges:
            if not data:
                if is_list(select):
                    Log.error("not expecting a list of records")

                data = {select.name: Matrix.ZERO}
                self.edges = FlatList.EMPTY
            elif is_data(data):
                # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
                length = MAX([len(v) for v in data.values()])
                if length >= 1:
                    self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum"}}])
                else:
                    self.edges = FlatList.EMPTY
            elif is_list(data):
                if is_list(select):
                    Log.error("not expecting a list of records")

                data = {select.name: Matrix.wrap(data)}
                self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": len(data), "interval": 1}}])
            elif isinstance(data, Matrix):
                if is_list(select):
                    Log.error("not expecting a list of records")

                data = {select.name: data}
            else:
                if is_list(select):
                    Log.error("not expecting a list of records")

                data = {select.name: Matrix(value=data)}
                self.edges = FlatList.EMPTY
        else:
            self.edges = wrap(edges)

        self.data = data
Example #17
File: meta.py Project: rv404674/TUID
    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error("")
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated >= Date.now()-TOO_OLD:
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)
Example #18
def get_columns(data, leaves=False):
    # TODO Split this into two functions
    if not leaves:
        return wrap([{"name": n} for n in UNION(set(d.keys()) for d in data)])
    else:
        return wrap(
            [
                {"name": leaf}
                for leaf in set(leaf for row in data for leaf, _ in row.leaves())
            ]
        )
Example #19
File: query.py Project: rv404674/TUID
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if isinstance(edge, text_type):
        if edge.endswith(".*"):
            prefix = edge[:-2]
            if schema:
                output = wrap([
                    {
                        "name": concat_field(prefix, literal_field(relative_field(untype_path(c.names["."]), prefix))),
                        "put": {"name": literal_field(untype_path(c.names["."]))},
                        "value": jx_expression(c.es_column, schema=schema),
                        "allowNulls": True,
                        "domain": {"type": "default"}
                    }
                    for c in schema.leaves(prefix)
                ])
                return output
            else:
                return wrap([{
                    "name": untype_path(prefix),
                    "put": {"name": literal_field(untype_path(prefix))},
                    "value": jx_expression(prefix, schema=schema),
                    "allowNulls": True,
                    "dim":dim_index,
                    "domain": {"type": "default"}
                }])

        return wrap([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit)
        }])
    else:
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not isinstance(edge.value, text_type):
            Log.error("You must name compound edges: {{edge}}",  edge= edge)

        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim":dim_index,
            "domain": {"type": "default"}
        }])
Example #20
def list2tab(rows):
    columns = set()
    for r in wrap(rows):
        columns |= set(k for k, v in r.leaves())
    keys = list(columns)

    output = []
    for r in wrap(rows):
        output.append("\t".join(value2json(r[k]) for k in keys))

    return "\t".join(keys) + "\n" + "\n".join(output)
Example #21
    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            DEBUG and Log.note("{{column.es_column}} does not exist", column=column)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                            DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                            # DO NOT UPDATE FRESH COLUMN METADATA
                            DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)
Example #22
File: decoders.py Project: rv404674/TUID
    def append_query(self, es_query, start):
        self.start = start

        if not isinstance(self.edge.value, Variable):
            if self.exists is TRUE:
                # IF True THEN WE DO NOT NEED THE _filter OR THE _missing (THIS RARELY HAPPENS THOUGH)
                output = wrap({"aggs": {
                    "_match": set_default(
                        {"terms": {
                            "script": self.script.expr,
                            "size": self.domain.limit,
                            "order": self.es_order
                        }},
                        es_query
                    )
                }})
            else:
                output = wrap({"aggs": {
                    "_match": {  # _match AND _filter REVERSED SO _match LINES UP WITH _missing
                        "filter": self.exists.to_esfilter(self.schema),
                        "aggs": {
                            "_filter": set_default(
                                {"terms": {
                                    "script": self.script.expr,
                                    "size": self.domain.limit,
                                    "order": self.es_order
                                }},
                                es_query
                            )
                        }
                    },
                    "_missing": set_default(
                        {"filter": self.missing.to_esfilter(self.schema)},
                        es_query
                    )
                }})
            return output
        else:
            output = wrap({"aggs": {
                "_match": set_default(
                    {"terms": {
                        "field": self.schema.leaves(self.edge.value.var)[0].es_column,
                        "size": self.domain.limit,
                        "order": self.es_order
                    }},
                    es_query
                ),
                "_missing": set_default(
                    {"filter": self.missing.to_esfilter(self.schema)},
                    es_query
                )
            }})
            return output
Example #23
    def __getitem__(self, item):
        # TODO: SOLVE FUNDAMENTAL QUESTION OF IF SELECTING A PART OF AN
        # EDGE REMOVES THAT EDGE FROM THIS RESULT, OR ADDS THE PART
        # AS A select {"name":edge.name, "value":edge.domain.partitions[coord]}
        # PROBABLY NOT, THE value IS IDENTICAL OVER THE REMAINING
        if is_data(item):
            coordinates = [None] * len(self.edges)

            # MAP DICT TO NUMERIC INDICES
            for name, v in item.items():
                ei, parts = wrap([(i, e.domain.partitions) for i, e in enumerate(self.edges) if e.name == name])[0]
                if not parts:
                    Log.error("Can not find {{name}}=={{value|quote}} in list of edges, maybe this feature is not implemented yet",
                        name= name,
                        value= v)
                part = wrap([p for p in parts if p.value == v])[0]
                if not part:
                    return Null
                else:
                    coordinates[ei] = part.dataIndex

            edges = [e for e, v in zip(self.edges, coordinates) if v is None]
            if not edges:
                # ZERO DIMENSIONAL VALUE
                return wrap({k: v.__getitem__(coordinates) for k, v in self.data.items()})
            else:
                output = Cube(
                    select=self.select,
                    edges=wrap([e for e, v in zip(self.edges, coordinates) if v is None]),
                    data={k: Matrix(values=c.__getitem__(coordinates)) for k, c in self.data.items()}
                )
                return output
        elif is_text(item):
            # RETURN A VALUE CUBE
            if self.is_value:
                if item != self.select.name:
                    Log.error("{{name}} not found in cube", name=item)
                return self

            if item not in self.select.name:
                Log.error("{{name}} not found in cube", name=item)

            output = Cube(
                select=[s for s in self.select if s.name == item][0],
                edges=self.edges,
                data={item: self.data[item]}
            )
            return output
        else:
            Log.error("not implemented yet")
Example #24
 def __getitem__(self, key):
     try:
         _key = value2key(self._keys, key)
         if len(self._keys) == 1 or len(_key) == len(self._keys):
             d = self._data.get(_key)
             return wrap(d)
         else:
             output = wrap([
                 d
                 for d in self._data.values()
                 if all(wrap(d)[k] == v for k, v in _key.items())
             ])
             return output
     except Exception as e:
         Log.error("something went wrong", e)
Example #25
    def _get_queue(self, row):
        row = wrap(row)
        if row.json:
            row.value, row.json = json2value(row.json), None
        timestamp = Date(self.rollover_field(row.value))
        if timestamp == None:
            return Null
        elif timestamp < Date.today() - self.rollover_max:
            return DATA_TOO_OLD

        rounded_timestamp = timestamp.floor(self.rollover_interval)
        with self.locker:
            queue = self.known_queues.get(rounded_timestamp.unix)
        if queue == None:
            candidates = jx.run({
                "from": ListContainer(".", self.cluster.get_aliases()),
                "where": {"regex": {"index": self.settings.index + "\d\d\d\d\d\d\d\d_\d\d\d\d\d\d"}},
                "sort": "index"
            })
            best = None
            for c in candidates:
                c = wrap(c)
                c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
                if timestamp > c.date:
                    best = c
            if not best or rounded_timestamp > best.date:
                if rounded_timestamp < wrap(candidates[-1]).date:
                    es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
                else:
                    try:
                        es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                        es.add_alias(self.settings.index)
                    except Exception as e:
                        e = Except.wrap(e)
                        if "IndexAlreadyExistsException" not in e:
                            Log.error("Problem creating index", cause=e)
                        return self._get_queue(row)  # TRY AGAIN
            else:
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

            with suppress_exception:
                es.set_refresh_interval(seconds=60 * 5, timeout=5)

            self._delete_old_indexes(candidates)
            threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
            with self.locker:
                queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
        return queue
Example #26
    def __getattribute__(self, key):
        if key == b"__class__":
            return NullType
        key = key.decode('utf8')

        d = _get(self, "__dict__")
        o = wrap(d["_obj"])
        k = d["__key__"]
        if o is None:
            return Null
        elif isinstance(o, NullType):
            return NullType(self, key)
        v = o.get(k)
        if v == None:
            return NullType(self, key)
        return wrap(v.get(key))
Example #27
def find_holes(db_module, db, table_name, column_name, _range, filter=None):
    """
    FIND HOLES IN A DENSE COLUMN OF INTEGERS
    RETURNS A LIST OF {"min": min, "max": max} OBJECTS
    """
    if not filter:
        filter = {"match_all": {}}

    _range = wrap(_range)
    params = {
        "min": _range.min,
        "max": _range.max - 1,
        "column_name": db_module.quote_column(column_name),
        "table_name": db_module.quote_column(table_name),
        "filter": esfilter2sqlwhere(filter)
    }

    min_max = db.query("""
        SELECT
            min({{column_name}}) `min`,
            max({{column_name}})+1 `max`
        FROM
            {{table_name}} a
        WHERE
            a.{{column_name}} BETWEEN {{min}} AND {{max}} AND
            {{filter}}
    """, params)[0]

    db.execute("SET @last={{min}}-1", {"min": _range.min})
    ranges = db.query("""
        SELECT
            prev_rev+1 `min`,
            curr_rev `max`
        FROM (
            SELECT
                a.{{column_name}}-@last diff,
                @last prev_rev,
                @last:=a.{{column_name}} curr_rev
            FROM
                {{table_name}} a
            WHERE
                a.{{column_name}} BETWEEN {{min}} AND {{max}} AND
                {{filter}}
            ORDER BY
                a.{{column_name}}
        ) a
        WHERE
            diff>1
    """, params)

    if ranges:
        ranges.append({"min": min_max.max, "max": _range.max})
    else:
        if min_max.min:
            ranges.append({"min": _range.min, "max": min_max.min})
            ranges.append({"min": min_max.max, "max": _range.max})
        else:
            ranges.append(_range)

    return ranges
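
The SQL above is a gaps-and-islands query; here is the same hole-finding logic in plain Python, for integer values expected to be dense over [lo, hi).

def find_holes_py(values, lo, hi):
    # RETURN HALF-OPEN {"min", "max"} RANGES NOT COVERED BY values
    holes = []
    last = lo - 1
    for v in sorted(values):
        if v - last > 1:
            holes.append({"min": last + 1, "max": v})
        last = v
    if last + 1 < hi:
        holes.append({"min": last + 1, "max": hi})
    return holes

print(find_holes_py([0, 1, 2, 5, 6, 9], 0, 12))
# [{'min': 3, 'max': 5}, {'min': 7, 'max': 9}, {'min': 10, 'max': 12}]
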
Example #28
def _select_deep(v, field, depth, record):
    """
    field = {"name":name, "value":["attribute", "path"]}
    r[field.name]=v[field.value], BUT WE MUST DEAL WITH POSSIBLE LIST IN field.value PATH
    """
    if hasattr(field.value, "__call__"):
        try:
            record[field.name] = field.value(wrap(v))
        except Exception as e:
            record[field.name] = None
        return 0, None

    for i, f in enumerate(field.value[depth:len(field.value) - 1]):
        v = v.get(f)
        if v is None:
            return 0, None
        if is_list(v):
            return depth + i + 1, v

    f = field.value.last()
    try:
        if not f:  # NO NAME FIELD INDICATES SELECT VALUE
            record[field.name] = v
        else:
            record[field.name] = v.get(f)
    except Exception as e:
        Log.error(
            "{{value}} does not have {{field}} property", value=v, field=f, cause=e
        )
    return 0, None
Example #29
    def _insert_loop(self, please_stop=None):
        bad_count = 0
        while not please_stop:
            try:
                Till(seconds=1).wait()
                messages = wrap(self.queue.pop_all())
                if not messages:
                    continue

                for g, mm in jx.groupby(messages, size=self.batch_size):
                    scrubbed = []
                    try:
                        for i, message in enumerate(mm):
                            if message is THREAD_STOP:
                                please_stop.go()
                                return
                            scrubbed.append(_deep_json_to_string(message, depth=3))
                    finally:
                        self.es.extend(scrubbed)
                    bad_count = 0
            except Exception as e:
                Log.warning("Problem inserting logs into ES", cause=e)
                bad_count += 1
                if bad_count > MAX_BAD_COUNT:
                    Log.warning("Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index)
                Till(seconds=30).wait()

        # CONTINUE TO DRAIN THIS QUEUE
        while not please_stop:
            try:
                Till(seconds=1).wait()
                self.queue.pop_all()
            except Exception as e:
                Log.warning("Should not happen", cause=e)
Example #30
def _expand(template, seq):
    """
    seq IS TUPLE OF OBJECTS IN PATH ORDER INTO THE DATA TREE
    """
    if is_text(template):
        return _simple_expand(template, seq)
    elif is_data(template):
        # EXPAND LISTS OF ITEMS USING THIS FORM
        # {"from":from, "template":template, "separator":separator}
        template = wrap(template)
        assert template["from"], "Expecting template to have 'from' attribute"
        assert template.template, "Expecting template to have 'template' attribute"

        data = seq[-1][template["from"]]
        output = []
        for d in data:
            s = seq + (d,)
            output.append(_expand(template.template, s))
        return coalesce(template.separator, "").join(output)
    elif is_list(template):
        return "".join(_expand(t, seq) for t in template)
    else:
        if not _Log:
            _late_import()

        _Log.error("can not handle")
Example #31
    def getDomain(self, **kwargs):
        # kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS
        kwargs = wrap(kwargs)
        kwargs.depth = coalesce(
            kwargs.depth,
            len(self.fields) - 1 if isinstance(self.fields, list) else None)

        if not self.partitions and self.edges:
            # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
            partitions = [
                {
                    "name": v.name,
                    "value": v.name,
                    "where": v.where,
                    "style": v.style,
                    "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
                } for i, v in enumerate(self.edges)
                if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.where
            ]
            self.isFacet = True
        elif kwargs.depth == None:  # ASSUME self.fields IS A dict
            partitions = FlatList()
            for i, part in enumerate(self.partitions):
                if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                    break
                partitions.append({
                    "name": part.name,
                    "value": part.value,
                    "where": part.where,
                    "style": coalesce(part.style, part.parent.style),
                    "weight": part.weight  # YO!  WHAT DO WE *NOT* COPY?
                })
        elif kwargs.depth == 0:
            partitions = [
                {
                    "name": v.name,
                    "value": v.value,
                    "where": v.where,
                    "style": v.style,
                    "weight": v.weight  # YO!  WHAT DO WE *NOT* COPY?
                } for i, v in enumerate(self.partitions)
                if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT)
            ]
        elif kwargs.depth == 1:
            partitions = FlatList()
            rownum = 0
            for i, part in enumerate(self.partitions):
                if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                    continue
                rownum += 1
                try:
                    for j, subpart in enumerate(part.partitions):
                        partitions.append({
                            "name": join_field(split_field(subpart.parent.name) + [subpart.name]),
                            "value": subpart.value,
                            "where": subpart.where,
                            "style": coalesce(subpart.style, subpart.parent.style),
                            "weight": subpart.weight  # YO!  WHAT DO WE *NOT* COPY?
                        })
                except Exception as e:
                    Log.error("", e)
        else:
            Log.error("deeper than 2 is not supported yet")

        return Domain(
            type=self.type,
            name=self.name,
            partitions=wrap(partitions),
            min=self.min,
            max=self.max,
            interval=self.interval,
            # THE COMPLICATION IS THAT SOMETIMES WE WANT SIMPLE PARTITIONS, LIKE
            # STRINGS, DATES, OR NUMBERS.  OTHER TIMES WE WANT PARTITION OBJECTS
            # WITH NAME, VALUE, AND OTHER MARKUP.
            # USUALLY A "set" IS MEANT TO BE SIMPLE, BUT THE end() FUNCTION
            # OVERRIDES EVERYTHING AND IS EXPLICIT.  - NOT A GOOD SOLUTION BECAUSE
            # end() IS USED BOTH TO INDICATE THE QUERY PARTITIONS *AND* DISPLAY
            # COORDINATES ON CHARTS

            # PLEASE SPLIT end() INTO value() (replacing the string value) AND
            # label() (for presentation)
            value="name" if not self.value and self.partitions else self.value,
            key="value",
            label=coalesce(self.label, (self.type == "set" and self.name)),
            end=coalesce(self.end, (self.type == "set" and self.name)),
            isFacet=self.isFacet,
            dimension=self)
Example #32
    def update(self, command):
        """
        EXPECTING command == {"set":term, "where":where}
        THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
        THE where CLAUSE IS AN ES FILTER
        """
        command = wrap(command)
        schema = self._es.get_properties()

        # GET IDS OF DOCUMENTS
        results = self._es.search({
            "stored_fields": listwrap(schema._routing.path),
            "query": {
                "bool": {
                    "filter": jx_expression(command.where).to_esfilter(Null)
                }
            },
            "size": 10000
        })

        # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
        scripts = FlatList()
        for k, v in command.set.items():
            if not is_variable_name(k):
                Log.error("Only support simple paths for now")
            if isinstance(v, Mapping) and v.doc:
                scripts.append({"doc": v.doc})
            else:
                v = scrub(v)
                scripts.append({
                    "script": "ctx._source." + k + " = " + jx_expression(v).to_painless(schema).script(schema)
                })

        if results.hits.hits:
            updates = []
            for h in results.hits.hits:
                for s in scripts:
                    updates.append({
                        "update": {
                            "_id": h._id,
                            "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])
                        }
                    })
                    updates.append(s)
            content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode('utf-8')
            response = self._es.cluster.post(
                self._es.path + "/_bulk",
                data=content,
                headers={"Content-Type": "application/json"},
                timeout=self.settings.timeout,
                params={"wait_for_active_shards": self.settings.wait_for_active_shards}
            )
            if response.errors:
                Log.error("could not update: {{error}}",
                          error=[
                              e.error for i in response["items"]
                              for e in i.values() if e.status not in (200, 201)
                          ])
Example #33
def find_holes(db_module, db, table_name, column_name, _range, filter=None):
    """
    FIND HOLES IN A DENSE COLUMN OF INTEGERS
    RETURNS A LIST OF {"min": min, "max": max} OBJECTS
    """
    if not filter:
        filter = {"match_all": {}}

    _range = wrap(_range)
    params = {
        "min": _range.min,
        "max": _range.max - 1,
        "column_name": db_module.quote_column(column_name),
        "table_name": db_module.quote_column(table_name),
        "filter": esfilter2sqlwhere(filter)
    }

    min_max = db.query(
        """
        SELECT
            min({{column_name}}) `min`,
            max({{column_name}})+1 `max`
        FROM
            {{table_name}} a
        WHERE
            a.{{column_name}} BETWEEN {{min}} AND {{max}} AND
            {{filter}}
    """, params)[0]

    db.execute("SET @last={{min}}-1", {"min": _range.min})
    ranges = db.query(
        """
        SELECT
            prev_rev+1 `min`,
            curr_rev `max`
        FROM (
            SELECT
                a.{{column_name}}-@last diff,
                @last prev_rev,
                @last:=a.{{column_name}} curr_rev
            FROM
                {{table_name}} a
            WHERE
                a.{{column_name}} BETWEEN {{min}} AND {{max}} AND
                {{filter}}
            ORDER BY
                a.{{column_name}}
        ) a
        WHERE
            diff>1
    """, params)

    if ranges:
        ranges.append({"min": min_max.max, "max": _range.max})
    else:
        if min_max.min:
            ranges.append({"min": _range.min, "max": min_max.min})
            ranges.append({"min": min_max.max, "max": _range.max})
        else:
            ranges.append(_range)

    return ranges
Example #34
        "file": (f[0] if f[0] != "~" else "").replace("\\", "/"),
        "line": f[1],
        "method": f[2].lstrip("<").rstrip(">")
    } for f, d in acc.stats.iteritems()]
    stats_file = File(profile_settings.filename,
                      suffix=convert.datetime2string(datetime.now(),
                                                     "_%Y%m%d_%H%M%S"))
    stats_file.write(convert.list2tab(stats))


# GET THE MACHINE METADATA
machine_metadata = wrap({
    "pid": os.getpid(),
    "python": text_type(platform.python_implementation()),
    "os": text_type(platform.system() + platform.release()).strip(),
    "name": text_type(platform.node())
})


def raise_from_none(e):
    raise e


if PY3:
    exec("def raise_from_none(e):\n    raise e from None\n", globals(),
         locals())

from mo_logs.log_usingFile import StructuredLogger_usingFile
Example #35
def es_script(term):
    return wrap({"script": {"lang": "painless", "source": term}})
Example #36
    def append_query(self, es_query, start):
        self.start = start

        if not isinstance(self.edge.value, Variable):
            script_field = self.edge.value.to_ruby()
            missing = self.edge.value.missing()

            output = wrap({
                "aggs": {
                    "_match":
                    set_default(
                        {
                            "terms": {
                                "script_field": script_field,
                                "size": self.domain.limit,
                                "order": {
                                    "_term": self.sorted
                                } if self.sorted else None
                            }
                        }, es_query),
                    "_missing":
                    set_default({"filter": missing.to_esfilter()}, es_query)
                    if missing else None
                }
            })
            return output
        elif self.edge.value.var in [s.value.var for s in self.query.sort]:
            sort_dir = [
                s.sort for s in self.query.sort
                if s.value.var == self.edge.value.var
            ][0]
            output = wrap({
                "aggs": {
                    "_match":
                    set_default(
                        {
                            "terms": {
                                "field": self.edge.value.var,
                                "size": self.domain.limit,
                                "order": {
                                    "_term": "asc" if sort_dir == 1 else "desc"
                                }
                            }
                        }, es_query),
                    "_missing":
                    set_default(
                        {"missing": {
                            "field": self.edge.value
                        }}, es_query
                    )  # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
                }
            })
            return output
        else:
            output = wrap({
                "aggs": {
                    "_match":
                    set_default(
                        {
                            "terms": {
                                "field": self.edge.value.var,
                                "size": self.domain.limit
                            }
                        }, es_query),
                    "_missing":
                    set_default(
                        {"missing": {
                            "field": self.edge.value
                        }}, es_query
                    )  # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
                }
            })
            return output
Example #37
 def _output():
     for g, v in itertools.groupby(data, get_key):
         group = Data()
         for k, gg in zip(keys, g):
             group[k] = gg
         yield (group, wrap(list(v)))
Example #38
 def __iter__(self):
     return (wrap(d) for d in self.data)
Example #39
File: http.py Project: mars-f/ActiveData
def request(method, url, headers=None, zip=None, retry=None, **kwargs):
    """
    JUST LIKE requests.request() BUT WITH DEFAULT HEADERS AND FIXES
    DEMANDS data IS ONE OF:
    * A JSON-SERIALIZABLE STRUCTURE, OR
    * LIST OF JSON-SERIALIZABLE STRUCTURES, OR
    * None

    Parameters
     * zip - ZIP THE REQUEST BODY, IF BIG ENOUGH
     * json - JSON-SERIALIZABLE STRUCTURE
     * retry - {"times": x, "sleep": y} STRUCTURE

    THE BYTE_STRINGS (b"") ARE NECESSARY TO PREVENT httplib.py FROM **FREAKING OUT**
    IT APPEARS requests AND httplib.py SIMPLY CONCATENATE STRINGS BLINDLY, WHICH
    INCLUDES url AND headers
    """
    global _warning_sent
    global request_count

    if not _warning_sent and not default_headers:
        Log.warning(
            text_type(
                "The pyLibrary.env.http module was meant to add extra " +
                "default headers to all requests, specifically the 'Referer' "
                +
                "header with a URL to the project. Use the `pyLibrary.debug.constants.set()` "
                + "function to set `pyLibrary.env.http.default_headers`"))
    _warning_sent = True

    if is_list(url):
        # TRY MANY URLS
        failures = []
        for remaining, u in jx.countdown(url):
            try:
                response = request(method, u, retry=retry, **kwargs)
                if mo_math.round(response.status_code,
                                 decimal=-2) not in [400, 500]:
                    return response
                if not remaining:
                    return response
            except Exception as e:
                e = Except.wrap(e)
                failures.append(e)
        Log.error(u"Tried {{num}} urls", num=len(url), cause=failures)

    if 'session' in kwargs:
        session = kwargs['session']
        del kwargs['session']
        sess = Null
    else:
        sess = session = sessions.Session()

    with closing(sess):
        if PY2 and is_text(url):
            # httplib.py WILL **FREAK OUT** IF IT SEES ANY UNICODE
            url = url.encode('ascii')

        try:
            set_default(kwargs, {"zip": zip, "retry": retry}, DEFAULTS)
            _to_ascii_dict(kwargs)

            # HEADERS
            headers = kwargs['headers'] = unwrap(
                set_default(headers, session.headers, default_headers))
            _to_ascii_dict(headers)
            del kwargs['headers']

            # RETRY
            retry = wrap(kwargs['retry'])
            if isinstance(retry, Number):
                retry = set_default({"times": retry}, DEFAULTS['retry'])
            if isinstance(retry.sleep, Duration):
                retry.sleep = retry.sleep.seconds
            del kwargs['retry']

            # JSON
            if 'json' in kwargs:
                kwargs['data'] = value2json(kwargs['json']).encode('utf8')
                del kwargs['json']

            # ZIP
            set_default(headers, {'Accept-Encoding': 'compress, gzip'})

            if kwargs['zip'] and len(coalesce(kwargs.get('data'))) > 1000:
                compressed = convert.bytes2zip(kwargs['data'])
                headers['content-encoding'] = 'gzip'
                kwargs['data'] = compressed
            del kwargs['zip']
        except Exception as e:
            Log.error(u"Request setup failure on {{url}}", url=url, cause=e)

        errors = []
        for r in range(retry.times):
            if r:
                Till(seconds=retry.sleep).wait()

            try:
                DEBUG and Log.note(u"http {{method|upper}} to {{url}}",
                                   method=method,
                                   url=text_type(url))
                request_count += 1
                return session.request(method=method,
                                       headers=headers,
                                       url=str(url),
                                       **kwargs)
            except Exception as e:
                e = Except.wrap(e)
                if retry['http'] and str(url).startswith(
                        "https://"
                ) and "EOF occurred in violation of protocol" in e:
                    url = URL("http://" + str(url)[8:])
                    Log.note(
                        "Changed {{url}} to http due to SSL EOF violation.",
                        url=str(url))
                errors.append(e)

        if " Read timed out." in errors[0]:
            Log.error(
                u"Tried {{times}} times: Timeout failure (timeout was {{timeout}}",
                timeout=kwargs['timeout'],
                times=retry.times,
                cause=errors[0])
        else:
            Log.error(u"Tried {{times}} times: Request failure of {{url}}",
                      url=url,
                      times=retry.times,
                      cause=errors[0])
Example #40
def es_setop(es, mvel, query):
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(
        query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([
        s.value == None and s.aggregate not in ("count", "none")
        for s in select
    ])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex:
        if (len(select) == 1 and not select[0].value) or select[0].value == "*":
            FromES = wrap({
                "query": {
                    "filtered": {
                        "query": {
                            "match_all": {}
                        },
                        "filter":
                        simplify_esfilter(
                            jx_expression(query.where).to_esfilter())
                    }
                },
                "sort": query.sort,
                "size": 1
            })
        elif all(isinstance(v, Variable) for v in select.value):
            FromES = wrap({
                "query": {
                    "filtered": {
                        "query": {
                            "match_all": {}
                        },
                        "filter": simplify_esfilter(query.where.to_esfilter())
                    }
                },
                "fields": select.value,
                "sort": query.sort,
                "size": coalesce(query.limit, 200000)
            })
    elif not isDeep:
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter":
            simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    else:
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter":
            simplify_esfilter(jx_expression(query.where).to_esfilter())
        }

    data = es09.util.post(es, FromES, query.limit)

    if (len(select) == 1 and not select[0].value) or select[0].value == "*":
        # SPECIAL CASE FOR SINGLE COUNT
        cube = wrap(data).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            output = list(zip(*data_list))  # MATERIALIZE FOR INDEXING UNDER PYTHON 3
            cube = Cube(
                select, [],
                {s.name: Matrix(list=output[i])
                 for i, s in enumerate(select)})

    return Data(meta={"esquery": FromES}, data=cube)
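
The last branch pivots the row-oriented facet output into one column per select with zip(*data_list); the same pivot in isolation, as a minimal sketch with illustrative names:

rows = [("a", 1), ("b", 2), ("c", 3)]  # ONE TUPLE PER DOCUMENT
columns = list(zip(*rows))             # ONE TUPLE PER SELECT
named = {s: columns[i] for i, s in enumerate(["letter", "number"])}
assert named["number"] == (1, 2, 3)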
Example #41
 def iteritems(self):
     # LOW LEVEL ITERATION, NO WRAPPING
     d = self._internal_dict
     return ((k, wrap(v)) for k, v in iteritems(d))
Example #42
 def items(self):
     d = self._internal_dict
     return [(k, wrap(v)) for k, v in d.items() if v != None or isinstance(v, Mapping)]
Example #43
File: jx.py Project: maggienj/ActiveData
    def pe_filter(filter, data, depth):
        """
        PARTIAL EVALUATE THE filter BASED ON data GIVEN
        """
        if filter is TRUE_FILTER:
            return True
        if filter is FALSE_FILTER:
            return False

        filter = wrap(filter)

        if filter["and"]:
            result = True
            output = FlatList()
            for a in filter[u"and"]:
                f = pe_filter(a, data, depth)
                if f is False:
                    result = False
                elif f is not True:
                    output.append(f)
            if result and output:
                return {"and": output}
            else:
                return result
        elif filter["or"]:
            output = FlatList()
            for o in filter[u"or"]:
                f = pe_filter(o, data, depth)
                if f is True:
                    return True
                elif f is not False:
                    output.append(f)
            if output:
                return {"or": output}
            else:
                return False
        elif filter["not"]:
            f = pe_filter(filter["not"], data, depth)
            if f is True:
                return False
            elif f is False:
                return True
            else:
                return {"not": f}
        elif filter.term or filter.eq:
            eq = coalesce(filter.term, filter.eq)
            result = True
            output = {}
            for col, val in eq.items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d != val:
                        result = False
                else:
                    output[rest] = val

            if result and output:
                return {"term": output}
            else:
                return result
        elif filter.equal:
            a, b = filter["equal"]
            first_a, rest_a = parse_field(a, data, depth)
            first_b, rest_b = parse_field(b, data, depth)
            val_a = data[first_a]
            val_b = data[first_b]
            if not rest_a:
                if not rest_b:
                    if val_a != val_b:
                        return False
                    else:
                        return True
                else:
                    return {"term": {rest_b: val_a}}
            else:
                if not rest_b:
                    return {"term": {rest_a: val_b}}
                else:
                    return {"equal": [rest_a, rest_b]}

        elif filter.terms:
            result = True
            output = {}
            for col, vals in filter["terms"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d not in vals:
                        result = False
                else:
                    output[rest] = vals
            if result and output:
                return {"terms": output}
            else:
                return result

        elif filter.range:
            result = True
            output = {}
            for col, ranges in filter["range"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    for sign, val in ranges.items():
                        if sign in ("gt", ">") and d <= val:
                            result = False
                        if sign == "gte" and d < val:
                            result = False
                        if sign == "lte" and d > val:
                            result = False
                        if sign == "lt" and d >= val:
                            result = False
                else:
                    output[rest] = ranges
            if result and output:
                return {"range": output}
            else:
                return result
        elif filter.missing:
            if isinstance(filter.missing, text_type):
                field = filter["missing"]
            else:
                field = filter["missing"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d == None:
                    return True
                return False
            else:
                return {"missing": rest}
        elif filter.prefix:
            result = True
            output = {}
            for col, val in filter["prefix"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d == None or not d.startswith(val):
                        result = False
                else:
                    output[rest] = val
            if result and output:
                return {"prefix": output}
            else:
                return result

        elif filter.exists:
            if isinstance(filter["exists"], basestring):
                field = filter["exists"]
            else:
                field = filter["exists"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d != None:
                    return True
                return False
            else:
                return {"exists": rest}
        else:
            Log.error(u"Can not interpret esfilter: {{esfilter}}",
                      {u"esfilter": filter})
Example #44
 def __iter__(self):
     temp = [wrap(v) for v in _get_list(self)]
     return iter(temp)
Example #45
 def filter(self, _filter):
     return FlatList(vals=[
         unwrap(u) for u in (wrap(v) for v in _get_list(self)) if _filter(u)
     ])
Example #46
 def selector(d):
     # APPLY EACH (name, pull_function) PAIR IN push_and_pull TO THE WRAPPED ROW
     output = Data()
     for n, p in push_and_pull:
         output[n] = unwraplist(p(wrap(d)))
     return unwrap(output)
Example #47
    def append_query(self, es_query, start):
        self.start = start
        domain = self.domain
        field = self.edge.value

        if isinstance(field, Variable):
            key = domain.key
            if isinstance(key, (tuple, list)) and len(key) == 1:
                key = key[0]
            include = [p[key] for p in domain.partitions]

            if self.edge.allowNulls:

                return wrap({
                    "aggs": {
                        "_match":
                        set_default(
                            {
                                "terms": {
                                    "field": field.var,
                                    "size": self.limit,
                                    "include": include,
                                    "order": {
                                        "_term": self.sorted
                                    } if self.sorted else None
                                }
                            }, es_query),
                        "_missing":
                        set_default(
                            {
                                "filter": {
                                    "or": [
                                        field.missing().to_esfilter(), {
                                            "not": {
                                                "terms": {
                                                    field.var: include
                                                }
                                            }
                                        }
                                    ]
                                }
                            }, es_query),
                    }
                })
            else:
                return wrap({
                    "aggs": {
                        "_match":
                        set_default(
                            {
                                "terms": {
                                    "field": field.var,
                                    "size": self.limit,
                                    "include": include,
                                    "order": {
                                        "_term": self.sorted
                                    } if self.sorted else None
                                }
                            }, es_query)
                    }
                })
        else:
            include = [p[domain.key] for p in domain.partitions]
            if self.edge.allowNulls:

                return wrap({
                    "aggs": {
                        "_match":
                        set_default(
                            {
                                "terms": {
                                    "script_field": field.to_ruby(),
                                    "size": self.limit,
                                    "include": include
                                }
                            }, es_query),
                        "_missing":
                        set_default(
                            {
                                "filter": {
                                    "or": [
                                        field.missing().to_esfilter(),
                                        NotOp(
                                            "not",
                                            InOp("in", [
                                                field,
                                                Literal("literal", include)
                                            ])).to_esfilter()
                                    ]
                                }
                            }, es_query),
                    }
                })
            else:
                return wrap({
                    "aggs": {
                        "_match":
                        set_default(
                            {
                                "terms": {
                                    "script_field": field.to_ruby(),
                                    "size": self.limit,
                                    "include": include
                                }
                            }, es_query)
                    }
                })
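
set_default layers the terms clause over the child aggregations already in es_query, so for a variable edge with allowNulls the payload comes out roughly like the dict below (field name, size, include values, and the child count agg are all illustrative assumptions):

{
    "aggs": {
        "_match": {
            "terms": {"field": "user", "size": 10, "include": ["alice", "bob"]},
            "aggs": {"count": {"value_count": {"field": "user"}}}
        },
        "_missing": {
            "filter": {"or": [
                {"missing": {"field": "user"}},
                {"not": {"terms": {"user": ["alice", "bob"]}}}
            ]},
            "aggs": {"count": {"value_count": {"field": "user"}}}
        }
    }
}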
Example #48
from mo_dots import wrap, Data, listwrap, is_data, FlatList
from mo_future import first
from mo_kwargs import override
from mo_logs import Log
from pyLibrary.sql import SQL_UPDATE, SQL_SET
from pyLibrary.sql.sqlite import sql_query, sql_create, sql_insert, quote_column, sql_eq, Sqlite

ROOT_USER = wrap({"_id": 1})
VERSION_TABLE = "security.version"
GROUP_TABLE = "security.groups"
PERMISSION_TABLE = "security.permissions"
RESOURCE_TABLE = "security.resources"
TABLE_OPERATIONS = ["insert", "update", "from"]
CREATE_TABLE = {"_id": 100, "table": ".", "operation": "insert", "owner": 1}


class Permissions:
    @override
    def __init__(self, db, kwargs):
        if is_data(db):
            self.db = Sqlite(db)
        elif isinstance(db, Sqlite):
            self.db = db
        else:
            Log.error("Bad db parameter")

        if not self.db.about(PERMISSION_TABLE):
            self.setup()
        self.next_id = id_generator(self.db)

    def setup(self):
Example #49
    def start(cls, settings=None):
        """
        RUN ME FIRST TO SETUP THE THREADED LOGGING
        http://victorlin.me/2012/08/good-logging-practice-in-python/

        log       - LIST OF PARAMETERS FOR LOGGER(S)
        trace     - SHOW MORE DETAILS IN EVERY LOG LINE (default False)
        cprofile  - True==ENABLE THE C-PROFILER THAT COMES WITH PYTHON (default False)
                    USE THE LONG FORM TO SET THE FILENAME {"enabled": True, "filename": "cprofile.tab"}
        profile   - True==ENABLE pyLibrary SIMPLE PROFILING (default False) (eg with Profiler("some description"):)
                    USE THE LONG FORM TO SET FILENAME {"enabled": True, "filename": "profile.tab"}
        constants - UPDATE MODULE CONSTANTS AT STARTUP (PRIMARILY INTENDED TO CHANGE DEBUG STATE)
        """
        global _Thread

        if not settings:
            return
        settings = wrap(settings)

        Log.stop()

        cls.settings = settings
        cls.trace = coalesce(settings.trace, False)
        if cls.trace:
            from mo_threads import Thread as _Thread
            _ = _Thread

        if settings.cprofile is False:
            settings.cprofile = {"enabled": False}
        elif settings.cprofile is True or (isinstance(
                settings.cprofile, Mapping) and settings.cprofile.enabled):
            if isinstance(settings.cprofile, bool):
                settings.cprofile = {
                    "enabled": True,
                    "filename": "cprofile.tab"
                }

            import cProfile

            cls.cprofiler = cProfile.Profile()
            cls.cprofiler.enable()

        if settings.profile is True or (isinstance(settings.profile, Mapping)
                                        and settings.profile.enabled):
            from mo_logs import profiles

            if isinstance(settings.profile, bool):
                profiles.ON = True
                settings.profile = {"enabled": True, "filename": "profile.tab"}

            if settings.profile.enabled:
                profiles.ON = True

        if settings.constants:
            constants.set(settings.constants)

        if settings.log:
            cls.logging_multi = StructuredLogger_usingMulti()
            from mo_logs.log_usingThread import StructuredLogger_usingThread
            cls.main_log = StructuredLogger_usingThread(cls.logging_multi)

            for log in listwrap(settings.log):
                Log.add_log(Log.new_instance(log))

        if settings.cprofile.enabled == True:
            Log.alert("cprofiling is enabled, writing to {{filename}}",
                      filename=os.path.abspath(settings.cprofile.filename))
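
A minimal settings sketch for the startup call above, following the docstring's keys (the console log_type and the constants path are illustrative assumptions, not library defaults):

settings = {
    "trace": True,                                           # SHOW MORE DETAIL PER LOG LINE
    "cprofile": {"enabled": True, "filename": "cprofile.tab"},
    "constants": {"some_module.DEBUG": True},                # HYPOTHETICAL MODULE CONSTANT
    "log": [{"log_type": "console"}]
}
Log.start(settings)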
Example #50
    def get_table(self, name):
        if name == "meta.columns":
            return self.meta.columns

        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])
Example #51
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski ([email protected])
#

from __future__ import absolute_import, division, unicode_literals

from jx_base.expressions import NULL
from mo_dots import wrap
from tests.test_jx import BaseTestCase, TEST_TABLE

lots_of_data = wrap([{"a": i} for i in range(30)])


class TestSetOps(BaseTestCase):

    def test_length(self):
        test = {
            "data": [
                {"v": "1"},
                {"v": "22"},
                {"v": "333"},
                {"v": "4444"},
                {"v": "55555"}
            ],
            "query": {
                "from": TEST_TABLE,
Example #52
    def query(self, query):
        """
        :param query:  JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
        :return:
        """
        if not startswith_field(query['from'], self.name):
            Log.error("Expecting table, or some nested table")
        frum, query['from'] = query['from'], self
        query = QueryOp.wrap(query, self.columns)

        # TYPE CONFLICTS MUST NOW BE RESOLVED DURING
        # TYPE-SPECIFIC QUERY NORMALIZATION
        # vars_ = query.vars(exclude_select=True)
        # type_map = {
        #     v: c.es_column
        #     for v in vars_
        #     if v in self.columns and len([c for c in self.columns[v] if c.type != "nested"]) == 1
        #     for c in self.columns[v]
        #     if c.type != "nested"
        # }
        #
        # sql_query = query.map(type_map)

        new_table = "temp_" + unique_name()

        if query.format == "container":
            create_table = "CREATE TABLE " + quote_table(new_table) + " AS "
        else:
            create_table = ""

        if query.groupby:
            op, index_to_columns = self._groupby_op(query, frum)
            command = create_table + op
        elif query.edges or any(a != "none"
                                for a in listwrap(query.select).aggregate):
            op, index_to_columns = self._edges_op(query, frum)
            command = create_table + op
        else:
            op = self._set_op(query, frum)
            return op

        if query.sort:
            command += "\nORDER BY " + ",\n".join(
                "(" + sql[t] + ") IS NULL" +
                (" DESC" if s.sort == -1 else "") + ",\n" + sql[t] +
                (" DESC" if s.sort == -1 else "")
                for s, sql in [(s, s.value.to_sql(self)[0].sql)
                               for s in query.sort] for t in "bns" if sql[t])

        result = self.db.query(command)

        column_names = query.edges.name + query.groupby.name + listwrap(
            query.select).name
        if query.format == "container":
            output = QueryTable(new_table,
                                db=self.db,
                                uid=self.uid,
                                exists=True)
        elif query.format == "cube" or (not query.format and query.edges):
            if len(query.edges) == 0 and len(query.groupby) == 0:
                data = {n: Data() for n in column_names}
                for s in index_to_columns.values():
                    data[s.push_name][s.push_child] = unwrap(
                        s.pull(result.data[0]))
                return Data(data=unwrap(data), meta={"format": "cube"})

            if not result.data:
                edges = []
                dims = []
                for i, e in enumerate(query.edges + query.groupby):
                    allowNulls = coalesce(e.allowNulls, True)

                    if e.domain.type == "set" and e.domain.partitions:
                        domain = SimpleSetDomain(
                            partitions=e.domain.partitions.name)
                    elif e.domain.type == "range":
                        domain = e.domain
                    elif isinstance(e.value, TupleOp):
                        pulls = jx.sort([
                            c for c in index_to_columns.values()
                            if c.push_name == e.name
                        ], "push_child").pull
                        parts = [
                            tuple(p(d) for p in pulls) for d in result.data
                        ]
                        domain = SimpleSetDomain(
                            partitions=jx.sort(set(parts)))
                    else:
                        domain = SimpleSetDomain(partitions=[])

                    dims.append(1 if allowNulls else 0)
                    edges.append(
                        Data(name=e.name, allowNulls=allowNulls,
                             domain=domain))

                zeros = [
                    0 if s.aggregate == "count"
                    and index_to_columns[si].push_child == "." else Data
                    for si, s in enumerate(listwrap(query.select))
                ]
                data = {
                    s.name: Matrix(dims=dims, zeros=zeros[si])
                    for si, s in enumerate(listwrap(query.select))
                }

                if isinstance(query.select, list):
                    select = [{"name": s.name} for s in query.select]
                else:
                    select = {"name": query.select.name}

                return Data(meta={"format": "cube"},
                            edges=edges,
                            select=select,
                            data={k: v.cube
                                  for k, v in data.items()})

            columns = None

            edges = []
            dims = []
            for g in query.groupby:
                g.is_groupby = True

            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(
                        partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif e.domain.type == "time":
                    domain = wrap(mo_json.scrub(e.domain))
                elif e.domain.type == "duration":
                    domain = wrap(mo_json.scrub(e.domain))
                elif isinstance(e.value, TupleOp):
                    pulls = jx.sort([
                        c for c in index_to_columns.values()
                        if c.push_name == e.name
                    ], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    if not columns:
                        columns = list(zip(*result.data))  # MATERIALIZE FOR INDEXING UNDER PYTHON 3
                    parts = set(columns[i])
                    if e.is_groupby and None in parts:
                        allowNulls = True
                    parts -= {None}
                    domain = SimpleSetDomain(partitions=jx.sort(parts))

                dims.append(len(domain.partitions) + (1 if allowNulls else 0))
                edges.append(
                    Data(name=e.name, allowNulls=allowNulls, domain=domain))

            zeros = [
                0 if s.aggregate == "count"
                and index_to_columns[si].push_child == "." else Data
                for si, s in enumerate(listwrap(query.select))
            ]
            data_cubes = {
                s.name: Matrix(dims=dims, zeros=zeros[si])
                for si, s in enumerate(listwrap(query.select))
            }
            r2c = index_to_coordinate(
                dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
            for rownum, row in enumerate(result.data):
                coord = r2c(rownum)

                for i, s in enumerate(index_to_columns.values()):
                    if s.is_edge:
                        continue
                    if s.push_child == ".":
                        data_cubes[s.push_name][coord] = s.pull(row)
                    else:
                        data_cubes[s.push_name][coord][s.push_child] = s.pull(
                            row)

            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(meta={"format": "cube"},
                        edges=edges,
                        select=select,
                        data={k: v.cube
                              for k, v in data_cubes.items()})
        elif query.format == "table" or (not query.format and query.groupby):
            data = []
            for d in result.data:
                row = [None for _ in column_names]
                for s in index_to_columns.values():
                    if s.push_child == ".":
                        row[s.push_column] = s.pull(d)
                    elif s.num_push_columns:
                        tuple_value = row[s.push_column]
                        if tuple_value == None:
                            tuple_value = row[
                                s.push_column] = [None] * s.num_push_columns
                        tuple_value[s.push_child] = s.pull(d)
                    elif row[s.push_column] == None:
                        row[s.push_column] = Data()
                        row[s.push_column][s.push_child] = s.pull(d)
                    else:
                        row[s.push_column][s.push_child] = s.pull(d)
                data.append(tuple(unwrap(r) for r in row))

            output = Data(meta={"format": "table"},
                          header=column_names,
                          data=data)
        elif query.format == "list" or (not query.edges and not query.groupby):

            if not query.edges and not query.groupby and any(
                    listwrap(query.select).aggregate):
                if isinstance(query.select, list):
                    data = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            data[c.push_name] = c.pull(result.data[0])
                        else:
                            data[c.push_name][c.push_child] = c.pull(
                                result.data[0])

                    output = Data(meta={"format": "value"}, data=data)
                else:
                    data = Data()
                    for s in index_to_columns.values():
                        data[s.push_child] = s.pull(result.data[0])

                    output = Data(meta={"format": "value"}, data=unwrap(data))
            else:
                data = []
                for rownum in result.data:
                    row = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            row[c.push_name] = c.pull(rownum)
                        elif c.num_push_columns:
                            tuple_value = row[c.push_name]
                            if not tuple_value:
                                tuple_value = row[
                                    c.push_name] = [None] * c.num_push_columns
                            tuple_value[c.push_child] = c.pull(rownum)
                        else:
                            row[c.push_name][c.push_child] = c.pull(rownum)

                    data.append(row)

                output = Data(meta={"format": "list"}, data=data)
        else:
            Log.error("unknown format {{format}}", format=query.format)

        return output
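
index_to_coordinate relies on the database having sorted rows by the edges, so row numbers decode to cube coordinates in row-major order; a standalone sketch of that arithmetic under the same assumption:

def index_to_coordinate(dims):
    # ROW-MAJOR DECODE: RIGHTMOST DIMENSION VARIES FASTEST
    def to_coord(rownum):
        coord = []
        for d in reversed(dims):
            rownum, i = divmod(rownum, d)
            coord.append(i)
        return tuple(reversed(coord))
    return to_coord

r2c = index_to_coordinate([2, 3])
assert [r2c(n) for n in range(6)] == [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]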
Example #53
    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c for c in self.meta.columns
                        if ((c.last_updated < Date.now() -
                             MAX_COLUMN_METADATA_AGE) or c.cardinality == None)
                        and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[
                                Date(t).format()
                                for t in wrap(old_columns).last_updated
                            ])
                        self.todo.extend(old_columns)
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}",
                               param={
                                   "table": column.es_index,
                                   "column": column.es_column
                               },
                               silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            DEBUG and Log.note(
                                "{{column.es_column}} does not exist",
                                column=column)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {
                                    "eq": {
                                        "es_index": column.es_index
                                    }
                                }
                            })
                            continue
                        if column.jx_type in STRUCT or split_field(
                                column.es_column)[-1] == EXISTS_TYPE:
                            DEBUG and Log.note(
                                "{{column.es_column}} is a struct",
                                column=column)
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated > Date.now(
                        ) - TOO_OLD and column.cardinality is not None:
                            # DO NOT UPDATE FRESH COLUMN METADATA
                            DEBUG and Log.note(
                                "{{column.es_column}} is still fresh ({{ago}} ago)",
                                column=column,
                                ago=(Date.now() -
                                     Date(column.last_updated)).seconds)
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and
                             not column.es_index.startswith(TEST_TABLE_PREFIX)
                             ) and Log.note("updated {{column.name}}",
                                            column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {
                                        "eq": {
                                            "es_index": column.es_index,
                                            "es_column": column.es_column
                                        }
                                    }
                                })
                            else:
                                Log.warning(
                                    "problem getting cardinality for {{column.name}}",
                                    column=column,
                                    cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)
Example #54
    def __init__(self, dim, parent, jx):
        dim = wrap(dim)

        self.name = dim.name
        self.parent = coalesce(parent)
        self.full_name = join_field(
            split_field(self.parent.full_name) + [self.name])
        self.edges = None  # FOR NOW
        dot.set_default(self, dim)
        self.where = dim.where
        self.type = coalesce(dim.type, "set")
        self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
        self.index = coalesce(dim.index,
                              coalesce(parent, Null).index, jx.settings.index)

        if not self.index:
            Log.error("Expecting an index name")

        # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
        self.edges = Data()
        for e in listwrap(dim.edges):
            new_e = Dimension(e, self, jx)
            self.edges[new_e.full_name] = new_e

        self.partitions = wrap(coalesce(dim.partitions, []))
        parse_partition(self)

        fields = coalesce(dim.field, dim.fields)
        if not fields:
            return  # NO FIELDS TO SEARCH
        elif isinstance(fields, Mapping):
            self.fields = wrap(fields)
            edges = wrap([{
                "name": k,
                "value": v,
                "allowNulls": False
            } for k, v in self.fields.items()])
        else:
            self.fields = listwrap(fields)
            edges = wrap([{
                "name": f,
                "value": f,
                "index": i,
                "allowNulls": False
            } for i, f in enumerate(self.fields)])

        if dim.partitions:
            return  # ALREADY HAVE PARTS
        if self.type not in KNOWN - ALGEBRAIC:
            return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

        jx.get_columns()
        with Timer("Get parts of {{name}}", {"name": self.name}):
            parts = jx.query({
                "from": self.index,
                "select": {
                    "name": "count",
                    "aggregate": "count"
                },
                "edges": edges,
                "where": self.where,
                "limit": self.limit
            })
            Log.note("{{name}} has {{num}} parts",
                     name=self.name,
                     num=len(parts))

        d = parts.edges[0].domain

        if dim.path:
            if len(edges) > 1:
                Log.error("Not supported yet")
            # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
            temp = Data(partitions=[])
            for i, count in enumerate(parts):
                a = dim.path(d.getEnd(d.partitions[i]))
                if not isinstance(a, list):
                    Log.error("The path function on " + dim.name +
                              " must return an ARRAY of parts")
                addParts(temp, dim.path(d.getEnd(d.partitions[i])), count, 0)
            self.value = coalesce(dim.value, "name")
            self.partitions = temp.partitions
        elif isinstance(fields, Mapping):
            self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

            partitions = FlatList()
            for g, p in parts.groupby(edges):
                if p:
                    partitions.append({
                        "value": g,
                        "where": {
                            "and": [{
                                "term": {
                                    e.value: g[e.name]
                                }
                            } for e in edges]
                        },
                        "count": int(p)
                    })
            self.partitions = partitions
        elif len(edges) == 1:
            self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

            # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
            self.partitions = wrap([
                {
                    "name": str(d.partitions[i].name),  # CONVERT TO STRING
                    "value": d.getEnd(d.partitions[i]),
                    "where": {
                        "term": {
                            edges[0].value: d.partitions[i].value
                        }
                    },
                    "count": count
                } for i, count in enumerate(parts)
            ])
            self.order = {p.value: i for i, p in enumerate(self.partitions)}
        elif len(edges) == 2:
            self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
            d2 = parts.edges[1].domain

            # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
            array = list(parts.data.values())[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

            def edges2value(*values):
                if isinstance(fields, Mapping):
                    output = Data()
                    for e, v in transpose(edges, values):
                        output[e.name] = v
                    return output
                else:
                    return tuple(values)

            self.partitions = wrap([
                {
                    "name":
                    str(d.partitions[i].name),  # CONVERT TO STRING
                    "value":
                    d.getEnd(d.partitions[i]),
                    "where": {
                        "term": {
                            edges[0].value: d.partitions[i].value
                        }
                    },
                    "count":
                    SUM(subcube),
                    "partitions": [
                        {
                            "name":
                            str(d2.partitions[j].name),  # CONVERT TO STRING
                            "value":
                            edges2value(d.getEnd(d.partitions[i]),
                                        d2.getEnd(d2.partitions[j])),
                            "where": {
                                "and": [{
                                    "term": {
                                        edges[0].value: d.partitions[i].value
                                    }
                                }, {
                                    "term": {
                                        edges[1].value: d2.partitions[j].value
                                    }
                                }]
                            },
                            "count":
                            count2
                        } for j, count2 in enumerate(subcube)
                        if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                    ]
                } for i, subcube in enumerate(array)
            ])
        else:
            Log.error("Not supported")

        parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
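
addParts (referenced above but not shown) folds each path returned by dim.path into a partition tree, accumulating counts per node; a minimal sketch under that assumption, using plain dicts instead of Data:

def addParts(node, path, count, depth):
    # WALK/EXTEND THE TREE ONE LEVEL PER PATH ELEMENT, SUMMING COUNTS
    if depth == len(path):
        return
    name = path[depth]
    for child in node["partitions"]:
        if child["name"] == name:
            break
    else:
        child = {"name": name, "count": 0, "partitions": []}
        node["partitions"].append(child)
    child["count"] += count
    addParts(child, path, count, depth + 1)

tree = {"partitions": []}
addParts(tree, ["2018", "Q1"], 7, 0)
addParts(tree, ["2018", "Q2"], 3, 0)
assert tree["partitions"][0]["count"] == 10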
Example #55
def json2value(json_string, params=Null, flexible=False, leaves=False):
    """
    :param json_string: THE JSON
    :param params: STANDARD JSON PARAMS
    :param flexible: REMOVE COMMENTS
    :param leaves: ASSUME JSON KEYS ARE DOT-DELIMITED
    :return: Python value
    """
    if not isinstance(json_string, text_type):
        Log.error("only unicode json accepted")

    try:
        if flexible:
            # REMOVE """COMMENTS""", # COMMENTS, //COMMENTS, AND \n \r
            # DERIVED FROM https://github.com/jeads/datasource/blob/master/datasource/bases/BaseHub.py#L58
            json_string = re.sub(r"\"\"\".*?\"\"\"",
                                 r"\n",
                                 json_string,
                                 flags=re.MULTILINE)
            json_string = "\n".join(
                remove_line_comment(l) for l in json_string.split("\n"))
            # ALLOW DICTIONARY'S NAME:VALUE LIST TO END WITH COMMA
            json_string = re.sub(r",\s*\}", r"}", json_string)
            # ALLOW LISTS TO END WITH COMMA
            json_string = re.sub(r",\s*\]", r"]", json_string)

        if params:
            # LOOKUP REFERENCES
            json_string = expand_template(json_string, params)

        try:
            value = wrap(json_decoder(text_type(json_string)))
        except Exception as e:
            Log.error("can not decode\n{{content}}",
                      content=json_string,
                      cause=e)

        if leaves:
            value = wrap_leaves(value)

        return value

    except Exception as e:
        e = Except.wrap(e)

        if not json_string.strip():
            Log.error("JSON string is only whitespace")

        c = e
        while "Expecting '" in c.cause and "' delimiter: line" in c.cause:
            c = c.cause

        if "Expecting '" in c and "' delimiter: line" in c:
            line_index = int(strings.between(c.message, " line ",
                                             " column ")) - 1
            column = int(strings.between(c.message, " column ", " ")) - 1
            line = json_string.split("\n")[line_index].replace("\t", " ")
            if column > 20:
                sample = "..." + line[column - 20:]
                pointer = "   " + (" " * 20) + "^"
            else:
                sample = line
                pointer = (" " * column) + "^"

            if len(sample) > 43:
                sample = sample[:43] + "..."

            Log.error(CAN_NOT_DECODE_JSON +
                      " at:\n\t{{sample}}\n\t{{pointer}}\n",
                      sample=sample,
                      pointer=pointer)

        base_str = strings.limit(json_string, 1000).encode('utf8')
        hexx_str = bytes2hex(base_str, " ")
        try:
            char_str = " " + "  ".join(
                (c.decode("latin1") if ord(c) >= 32 else ".")
                for c in base_str)
        except Exception:
            char_str = " "
        Log.error(CAN_NOT_DECODE_JSON + ":\n{{char_str}}\n{{hexx_str}}\n",
                  char_str=char_str,
                  hexx_str=hexx_str,
                  cause=e)
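
A short usage sketch of the flexible mode (the comment and trailing-comma stripping described above; assumes json2value is importable as shown):

raw = u'''
{
    "name": "demo",   // LINE COMMENT, STRIPPED IN FLEXIBLE MODE
    "values": [1, 2, 3,],
}
'''
value = json2value(raw, flexible=True)
assert value.name == "demo"
assert list(value["values"]) == [1, 2, 3]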
Example #56
 def pop(self, index=None):
     if index is None:
         return wrap(_get_list(self).pop())
     else:
         return wrap(_get_list(self).pop(index))
Example #57
 def to_sql(self, schema, not_null=False, boolean=False):
     return wrap([{
         "name": ".",
         "sql": SQLang[t].to_sql(schema)[0].sql
     } for t in self.terms])
Example #58
 def __deepcopy__(self, memo):
     d = _get_list(self)
     return wrap(deepcopy(d, memo))
Example #59
    def copy(self, keys, source, sample_only_filter=None, sample_size=None, done_copy=None):
        """
        :param keys: THE KEYS TO LOAD FROM source
        :param source: THE SOURCE (USUALLY S3 BUCKET)
        :param sample_only_filter: SOME FILTER, IN CASE YOU DO NOT WANT TO SEND EVERYTHING
        :param sample_size: FOR RANDOM SAMPLE OF THE source DATA
        :param done_copy: CALLBACK, ADDED TO queue, TO FINISH THE TRANSACTION
        :return: LIST OF SUB-keys PUSHED INTO ES
        """
        num_keys = 0
        queue = None
        pending = []  # FOR WHEN WE DO NOT HAVE QUEUE YET
        for key in keys:
            timer = Timer("Process {{key}}", param={"key": key}, silent=not DEBUG)
            try:
                with timer:
                    for rownum, line in enumerate(source.read_lines(strip_extension(key))):
                        if not line:
                            continue

                        if rownum > 0 and rownum % 1000 == 0:
                            Log.note("Ingested {{num}} records from {{key}} in bucket {{bucket}}", num=rownum, key=key, bucket=source.name)

                        insert_me, please_stop = fix(key, rownum, line, source, sample_only_filter, sample_size)
                        if insert_me == None:
                            continue
                        value = insert_me['value']

                        if '_id' not in value:
                            Log.warning("expecting an _id in all S3 records. If missing, there can be duplicates")

                        if queue == None:
                            queue = self._get_queue(insert_me)
                            if queue == None:
                                pending.append(insert_me)
                                if len(pending) > 1000:
                                    if done_copy:
                                        done_copy()
                                    Log.error("first 1000 (key={{key}}) records for {{alias}} have no indication what index to put data", key=tuple(keys)[0], alias=self.settings.index)
                                continue
                            elif queue is DATA_TOO_OLD:
                                break
                            if pending:
                                queue.extend(pending)
                                pending = []

                        num_keys += 1
                        queue.add(insert_me)

                        if please_stop:
                            break
            except Exception as e:
                if KEY_IS_WRONG_FORMAT in e:
                    Log.warning("Could not process {{key}} because bad format. Never trying again.", key=key, cause=e)
                elif CAN_NOT_DECODE_JSON in e:
                    Log.warning("Could not process {{key}} because of bad JSON. Never trying again.", key=key, cause=e)
                else:
                    Log.warning("Could not process {{key}} after {{duration|round(places=2)}}seconds", key=key, duration=timer.duration.seconds, cause=e)
                    done_copy = None

        if done_copy:
            if queue == None:
                done_copy()
            elif queue is DATA_TOO_OLD:
                done_copy()
            else:
                queue.add(done_copy)

        if [p for p in pending if wrap(p).value.task.state not in ('failed', 'exception')]:
            Log.error("Did not find an index for {{alias}} to place the data for key={{key}}", key=tuple(keys)[0], alias=self.settings.index)

        Log.note("{{num}} keys from {{key|json}} added", num=num_keys, key=keys)
        return num_keys
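
The queue-discovery pattern above buffers records until the first one reveals a destination, then flushes the buffer and streams the rest; a stripped-down sketch of that control flow (names hypothetical):

def route(records, get_queue, limit=1000):
    # BUFFER UNTIL A DESTINATION IS KNOWN, THEN FLUSH AND STREAM
    queue, pending = None, []
    for rec in records:
        if queue is None:
            queue = get_queue(rec)
            if queue is None:
                pending.append(rec)
                if len(pending) > limit:
                    raise RuntimeError("no destination found for records")
                continue
            queue.extend(pending)
            pending = []
        queue.append(rec)
    return queue

assert route([1, 2, 3], lambda r: [] if r >= 2 else None) == [1, 2, 3]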
Example #60
    def update(self, command):
        """
        :param command:  EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT
        """
        command = wrap(command)

        # REJECT DEEP UPDATES
        touched_columns = command.set.keys() | set(listwrap(command['clear']))
        for c in self.get_leaves():
            if c.name in touched_columns and c.nested_path and len(c.name) > len(c.nested_path[0]):
                Log.error("Deep update not supported")

        # ADD NEW COLUMNS
        where = jx_expression(command.where)
        _vars = where.vars()
        _map = {
            v: c.es_column
            for v in _vars
            for c in self.columns.get(v, Null)
            if c.type not in STRUCT
        }
        where_sql = where.map(_map).to_sql()
        new_columns = set(command.set.keys()) - set(self.columns.keys())
        for new_column_name in new_columns:
            nested_value = command.set[new_column_name]
            ctype = get_type(nested_value)
            column = Column(
                names={".": new_column_name},
                type=ctype,
                es_index=self.sf.fact,
                es_column=typed_column(new_column_name, ctype)
            )
            self.add_column(column)

        # UPDATE THE NESTED VALUES
        for nested_column_name, nested_value in command.set.items():
            if get_type(nested_value) == "nested":
                nested_table_name = concat_field(self.sf.fact, nested_column_name)
                nested_table = self.nested_tables[nested_column_name]
                self_primary_key = sql_list(quote_column(c.es_column) for u in self.uid for c in self.columns[u])
                extra_key_name = UID_PREFIX + "id" + text_type(len(self.uid))
                extra_key = [e for e in nested_table.columns[extra_key_name]][0]

                sql_command = (
                    "DELETE" + SQL_FROM + quote_column(nested_table.name) +
                    SQL_WHERE + "EXISTS (" +
                    "\nSELECT 1 " +
                    SQL_FROM + quote_column(nested_table.name) + " n" +
                    SQL_INNER_JOIN + "(" +
                    SQL_SELECT + self_primary_key +
                    SQL_FROM + quote_column(self.sf.fact) +
                    SQL_WHERE + where_sql +
                    "\n) t ON " +
                    SQL_AND.join(
                        "t." + quote_column(c.es_column) + " = n." + quote_column(c.es_column)
                        for u in self.uid
                        for c in self.columns[u]
                    ) +
                    ")"
                )
                self.db.execute(sql_command)

                # INSERT NEW RECORDS
                if not nested_value:
                    continue

                doc_collection = {}
                for d in listwrap(nested_value):
                    nested_table.flatten(d, Data(), doc_collection, path=nested_column_name)

                prefix = "INSERT INTO " + quote_column(nested_table.name) + sql_iso(sql_list(
                    [self_primary_key] +
                    [quote_column(extra_key)] +
                    [
                        quote_column(c.es_column)
                        for c in doc_collection.get(".", Null).active_columns
                    ]
                ))

                # BUILD THE PARENT TABLES
                parent = (
                    SQL_SELECT + self_primary_key +
                    SQL_FROM + quote_column(self.sf.fact) +
                    SQL_WHERE + jx_expression(command.where).to_sql()
                )

                # BUILD THE RECORDS
                children = SQL_UNION_ALL.join(
                    SQL_SELECT +
                    quote_value(i) + " " + quote_column(extra_key.es_column) + "," +
                    sql_list(
                        quote_value(row[c.name]) + " " + quote_column(c.es_column)
                        for c in doc_collection.get(".", Null).active_columns
                    )
                    for i, row in enumerate(doc_collection.get(".", Null).rows)
                )

                sql_command = (
                    prefix +
                    SQL_SELECT +
                    sql_list(
                        [join_column("p", c.es_column) for u in self.uid for c in self.columns[u]] +
                        [join_column("c", extra_key)] +
                        [join_column("c", c.es_column) for c in doc_collection.get(".", Null).active_columns]
                    ) +
                    SQL_FROM + sql_iso(parent) + " p" +
                    SQL_INNER_JOIN + sql_iso(children) + " c" + " ON " + SQL_TRUE
                )

                self.db.execute(sql_command)

                # THE CHILD COLUMNS COULD HAVE EXPANDED
                # ADD COLUMNS TO SELF
                for n, cs in nested_table.columns.items():
                    for c in cs:
                        column = Column(
                            names={".": c.name},
                            type=c.type,
                            es_index=c.es_index,
                            es_column=c.es_column,
                            nested_path=[nested_column_name] + c.nested_path
                        )
                        if c.name not in self.columns:
                            self.columns[column.name] = {column}
                        elif c.type not in [c.type for c in self.columns[c.name]]:
                            self.columns[column.name].add(column)

        command = (
            "UPDATE " + quote_column(self.sf.fact) + " SET " +
            sql_list(
                [
                    quote_column(c) + "=" + quote_value(get_if_type(v, c.type))
                    for k, v in command.set.items()
                    if get_type(v) != "nested"
                    for c in self.columns[k]
                    if c.type != "nested" and len(c.nested_path) == 1
                ] +
                [
                    quote_column(c) + "=" + SQL_NULL
                    for k in listwrap(command['clear'])
                    if k in self.columns
                    for c in self.columns[k]
                    if c.type != "nested" and len(c.nested_path) == 1
                ]
            ) +
            SQL_WHERE + where_sql
        )

        self.db.execute(command)
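
A hypothetical command this method accepts, and the flat UPDATE it reduces to when nothing is nested (table and column names are illustrative):

command = {
    "set": {"status": "done"},          # SCALAR ASSIGNMENT
    "clear": ["error_message"],         # SET TO NULL
    "where": {"eq": {"run_id": 42}},
}
# THE FINAL STATEMENT ASSEMBLED ABOVE THEN REDUCES TO ROUGHLY:
#   UPDATE fact SET status='done', error_message=NULL WHERE run_id=42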