Exemplo n.º 1
0
def test_extract_job(complex_job, extract_job_settings):
    """
    If you find this test failing, then copy the JSON in the test failure into the test_extract_job.json file,
    then you may use the diff to review the changes.
    """
    # Extract one job (by id) from the source MySQL database into `acc`
    with MySQL(extract_job_settings.source.database) as source:
        with MySqlSnowflakeExtractor(extract_job_settings.source) as extractor:
            sql = extractor.get_sql(
                SQL("SELECT " + text(complex_job.id) + " as id"))

            acc = []
            with source.transaction():
                cursor = list(source.query(sql, stream=True, row_tuples=True))
                extractor.construct_docs(cursor, acc.append, False)

    doc = first(acc)
    doc.guid = first(JOB).guid  # NEW EACH TIME

    # The job_guid is regenerated per run; copy the expected one into the
    # extracted doc so the comparison below only checks stable fields
    job_guid = first(jx.drill(JOB, "job_log.failure_line.job_guid"))
    for fl in jx.drill(doc, "job_log.failure_line"):
        fl.job_guid = job_guid

    assertAlmostEqual(
        acc,
        JOB,
        places=
        4,  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    )
Exemplo n.º 2
0
    def __getitem__(self, item):
        """
        Select from this cube.

        When `item` is a data/dict, it is treated as {edge_name: partition_value}
        coordinates: each matched edge is collapsed, returning a lower-dimensional
        Cube (or a plain data object when every edge is consumed), or Null when a
        partition value does not exist.  When `item` is text, it is treated as a
        select name and a single-value Cube is returned.
        """
        # TODO: SOLVE FUNDAMENTAL QUESTION OF IF SELECTING A PART OF AN
        # EDGE REMOVES THAT EDGE FROM THIS RESULT, OR ADDS THE PART
        # AS A select {"name":edge.name, "value":edge.domain.partitions[coord]}
        # PROBABLY NOT, THE value IS IDENTICAL OVER THE REMAINING
        if is_data(item):
            # None MARKS EDGES NOT MENTIONED IN item (THEY SURVIVE THE SELECTION)
            coordinates = [None] * len(self.edges)

            # MAP DICT TO NUMERIC INDICES
            for name, v in item.items():
                ei, parts = first((i, e.domain.partitions)
                                  for i, e in enumerate(self.edges)
                                  if e.name == name)
                if not parts:
                    Log.error(
                        "Can not find {{name}}=={{value|quote}} in list of edges, maybe this feature is not implemented yet",
                        name=name,
                        value=v)
                part = first(p for p in parts if p.value == v)
                if not part:
                    return Null
                else:
                    coordinates[ei] = part.dataIndex

            edges = [e for e, v in zip(self.edges, coordinates) if v is None]
            if not edges:
                # ZERO DIMENSIONAL VALUE
                return dict_to_data({
                    k: v.__getitem__(coordinates)
                    for k, v in self.data.items()
                })
            else:
                # RETURN A Cube OVER THE REMAINING (UNSELECTED) EDGES
                output = Cube(select=self.select,
                              edges=list_to_data([
                                  e for e, v in zip(self.edges, coordinates)
                                  if v is None
                              ]),
                              data={
                                  k: Matrix(values=c.__getitem__(coordinates))
                                  for k, c in self.data.items()
                              })
                return output
        elif is_text(item):
            # RETURN A VALUE CUBE
            if self.is_value:
                if item != self.select.name:
                    Log.error("{{name}} not found in cube", name=item)
                return self

            if item not in self.select.name:
                Log.error("{{name}} not found in cube", name=item)

            output = Cube(select=first(s for s in self.select
                                       if s.name == item),
                          edges=self.edges,
                          data={item: self.data[item]})
            return output
        else:
            Log.error("not implemented yet")
Exemplo n.º 3
0
 def filter(self, where):
     """
     Filter the rows of this cube; only implemented for the one-dimensional
     index-domain case, which delegates to the standard list filter.
     """
     if len(self.edges) == 1 and first(self.edges).domain.type == "index":
         # USE THE STANDARD LIST FILTER
         from jx_python import jx
         return jx.filter(first(self.data.values()).cube, where)
     else:
         # FILTER DOES NOT ALTER DIMENSIONS, JUST WHETHER THERE ARE VALUES IN THE CELLS
         Log.unexpected("Incomplete")
Exemplo n.º 4
0
def get_decoders_by_path(query, schema):
    """
    RETURN MAP FROM QUERY PATH TO LIST OF DECODER ARRAYS

    :param query: the jx query whose edges/groupby need decoders
    :param schema: schema used to resolve variables to columns
    :return: dict mapping nested path -> list of AggsDecoder
    """
    output = {}

    # REORDER EDGES/GROUPBY TO MATCH THE SORT (NOT FOR cube FORMAT)
    if query.edges:
        if query.sort and query.format != "cube":
            # REORDER EDGES/GROUPBY TO MATCH THE SORT
            query.edges = sort_edges(query, "edges")
    elif query.groupby:
        if query.sort and query.format != "cube":
            query.groupby = sort_edges(query, "groupby")

    for edge in to_data(coalesce(query.edges, query.groupby, [])):
        limit = coalesce(edge.domain.limit, query.limit, DEFAULT_LIMIT)
        vars_ = coalesce(edge.value.vars(), set())

        if edge.range:
            # RANGE EDGES NEED BOTH min AND max VARIABLES
            vars_ |= edge.range.min.vars() | edge.range.max.vars()
            for v in vars_:
                if not schema[v.var]:
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.domain.dimension:
            # RESOLVE DIMENSION FIELDS TO ES COLUMNS (COPY BEFORE MUTATING)
            vars_ |= set(Variable(v) for v in edge.domain.dimension.fields)
            edge.domain.dimension = edge.domain.dimension.copy()
            edge.domain.dimension.fields = [
                schema[v.var].es_column for v in vars_
            ]
        elif edge.domain.partitions.where and all(
                edge.domain.partitions.where):
            for p in edge.domain.partitions:
                vars_ |= p.where.vars()
        else:
            # SIMPLE edge.value
            decoder = AggsDecoder(edge, query, limit)
            depths = set(c.nested_path[0] for v in vars_
                         for c in schema.leaves(v.var))
            output.setdefault(first(depths), []).append(decoder)
            continue

        depths = set(c.nested_path[0] for v in vars_
                     for c in schema.leaves(v.var))
        if not depths:
            Log.error("Do not know of column {{column}}",
                      column=unwraplist(
                          [v for v in vars_ if schema[v.var] == None]))
        if len(depths) > 1:
            Log.error("expression {{expr|quote}} spans tables, can not handle",
                      expr=edge.value)

        decoder = AggsDecoder(edge, query, limit)
        output.setdefault(first(depths), []).append(decoder)
    return output
Exemplo n.º 5
0
    def append_query(self, query_path, es_query):
        """
        Wrap `es_query` with the aggregations for this edge.

        Builds a terms aggregation restricted to the edge's known partition
        values (`include`) and, when allowNulls, adds a missing-value branch
        at each nested level.  # NOTE(review): relies on project aggs DSL
        """
        domain = self.domain
        domain_key = domain.key
        value = Painless[self.edge.value]
        cnv = pull_functions[value.type]
        # THE VALUES WE EXPECT TO SEE, CONVERTED TO THE EDGE'S TYPE
        include = tuple(cnv(p[domain_key]) for p in domain.partitions)

        exists = Painless[AndOp([
            InOp([value, Literal(include)])
        ])].partial_eval()

        limit = coalesce(self.limit, len(domain.partitions))

        if is_op(value, Variable):
            es_field = first(self.query.frum.schema.leaves(value.var)).es_column  # ALREADY CHECKED THERE IS ONLY ONE
            match = TermsAggs(
                "_match",
                {
                    "field": es_field,
                    "size": limit,
                    "order": {"_term": self.sorted} if self.sorted else None
                },
                self
            )
        else:
            # NOT A SIMPLE FIELD; COMPUTE THE TERM WITH A PAINLESS SCRIPT
            match = TermsAggs(
                "_match",
                {
                    "script": text_type(value.to_es_script(self.schema)),
                    "size": limit
                },
                self
            )
        output = Aggs().add(FilterAggs("_filter", exists, None).add(match.add(es_query)))

        if self.edge.allowNulls:
            # FIND NULLS AT EACH NESTED LEVEL
            for p in self.schema.query_path:
                if p == query_path:
                    # MISSING AT THE QUERY DEPTH
                    output.add(
                        NestedAggs(p).add(FilterAggs("_missing0", NotOp(exists), self).add(es_query))
                    )
                else:
                    # PARENT HAS NO CHILDREN, SO MISSING
                    column = first(self.schema.values(query_path, (OBJECT, EXISTS)))
                    output.add(
                        NestedAggs(column.nested_path[0]).add(
                            FilterAggs(
                                "_missing1",
                                NotOp(ExistsOp(Variable(column.es_column.replace(NESTED_TYPE, EXISTS_TYPE)))),
                                self
                            ).add(es_query)
                        )
                    )
        return output
Exemplo n.º 6
0
    def append_query(self, query_path, es_query):
        """
        Add this edge's aggregations around `es_query`: a terms aggregation
        limited to the known partition values, plus (when allowNulls) a
        missing-value branch per nested level.
        """
        domain = self.domain
        domain_key = domain.key
        value = Painless[self.edge.value]
        cnv = pull_functions[value.type]
        # EXPECTED PARTITION VALUES, CONVERTED TO THE EDGE'S TYPE
        include = tuple(cnv(p[domain_key]) for p in domain.partitions)

        exists = Painless[AndOp([InOp([value,
                                       Literal(include)])])].partial_eval()

        limit = coalesce(self.limit, len(domain.partitions))

        if is_op(value, Variable):
            es_field = first(self.query.frum.schema.leaves(
                value.var)).es_column  # ALREADY CHECKED THERE IS ONLY ONE
            match = TermsAggs(
                "_match", {
                    "field": es_field,
                    "size": limit,
                    "order": {
                        "_term": self.sorted
                    } if self.sorted else None
                }, self)
        else:
            # NOT A SIMPLE FIELD; COMPUTE THE TERM WITH A PAINLESS SCRIPT
            match = TermsAggs(
                "_match", {
                    "script": text_type(value.to_es_script(self.schema)),
                    "size": limit
                }, self)
        output = Aggs().add(
            FilterAggs("_filter", exists, None).add(match.add(es_query)))

        if self.edge.allowNulls:
            # FIND NULLS AT EACH NESTED LEVEL
            for p in self.schema.query_path:
                if p == query_path:
                    # MISSING AT THE QUERY DEPTH
                    output.add(
                        NestedAggs(p).add(
                            FilterAggs("_missing0", NotOp(exists),
                                       self).add(es_query)))
                else:
                    # PARENT HAS NO CHILDREN, SO MISSING
                    column = first(
                        self.schema.values(query_path, (OBJECT, EXISTS)))
                    output.add(
                        NestedAggs(column.nested_path[0]).add(
                            FilterAggs(
                                "_missing1",
                                NotOp(
                                    ExistsOp(
                                        Variable(
                                            column.es_column.replace(
                                                NESTED_TYPE, EXISTS_TYPE)))),
                                self).add(es_query)))
        return output
Exemplo n.º 7
0
 def output():
     """
     Generator of unique ids.  Reserves blocks of 1000 ids by atomically
     bumping `next_id` in the ABOUT_TABLE inside a transaction, then yields
     from the reserved block without touching the database again.
     """
     while True:
         with self.db.transaction() as t:
             # first(first(...)): first row, then first column of the result
             top_id = first(
                 first(
                     t.query(SQL_SELECT + quote_column("next_id") +
                             SQL_FROM +
                             quote_column(ABOUT_TABLE)).data))
             max_id = top_id + 1000
             t.execute(SQL_UPDATE + quote_column(ABOUT_TABLE) +
                       SQL_SET + sql_eq(next_id=max_id))
         while top_id < max_id:
             yield top_id
             top_id += 1
Exemplo n.º 8
0
def is_bulk_agg(esq, query):
    """
    Decide whether this aggregation query qualifies for bulk processing:
    S3 must be configured, the destination must be s3/url, the format
    list/table, and there must be exactly one plain-variable groupby.
    """
    # ONLY ACCEPTING ONE DIMENSION AT THIS TIME
    if (
        not S3_CONFIG
        or query.destination not in {"s3", "url"}
        or query.format not in {"list", "table"}
    ):
        return False
    if len(listwrap(query.groupby)) != 1:
        return False

    grouping = first(_normalize_group(first(listwrap(query.groupby)), 0, query.limit))
    return bool(is_op(grouping.value, Variable))
Exemplo n.º 9
0
 def append_query(self, query_path, es_query):
     """
     Wrap `es_query` with a terms aggregation for this edge, plus a
     `_missing` branch for rows where the edge value does not exist.
     """
     if is_op(self.edge.value, FirstOp) and is_op(self.edge.value.term, Variable):
         self.edge.value = self.edge.value.term  # ES USES THE FIRST TERM FOR {"terms": } AGGREGATION
     if not is_op(self.edge.value, Variable):
         # NOT A SIMPLE FIELD; COMPUTE THE TERM WITH A PAINLESS SCRIPT
         terms = TermsAggs(
             "_match",
             {
                 "script": {"lang": "painless", "inline": self.script.expr},
                 "size": self.domain.limit,
                 "order": self.es_order
             },
             self
         )
     else:
         terms = TermsAggs(
             "_match", {
                 "field": first(self.schema.leaves(self.edge.value.var)).es_column,
                 "size": self.domain.limit,
                 "order": self.es_order
             },
             self
         )
     output = Aggs()
     output.add(FilterAggs("_filter", self.exists, None).add(terms.add(es_query)))
     output.add(FilterAggs("_missing", self.missing, self).add(es_query))
     return output
Exemplo n.º 10
0
    def append_query(self, query_path, es_query):
        """
        Wrap `es_query` with a terms aggregation whose key is the field's
        multivalue list rendered by the LIST_TO_PIPE script.
        """
        # RESOLVE THE VARIABLE TO ITS ES COLUMN (SINGLE LEAF EXPECTED)
        es_field = first(self.query.frum.schema.leaves(self.var)).es_column

        script = expand_template(
            LIST_TO_PIPE, {"expr": 'doc[' + quote(es_field) + '].values'}
        )
        terms = TermsAggs(
            "_match",
            {"script": script, "size": self.limit},
            self,
        )
        return Aggs().add(terms.add(es_query))
Exemplo n.º 11
0
def aggop_to_es_queries(select, query_path, schema, query):
    """
    Convert an aggregation-op query into an ES aggregations request.

    :param select: the select clauses to aggregate
    :param query_path: nested path the query targets
    :param schema: schema for resolving columns
    :param query: the full jx query (edges/groupby provide the decoders)
    :return: (aggs tree, decoders indexed by edge dimension, es_query data)
    """
    base_agg = extract_aggs(select, query_path, schema)
    base_agg = NestedAggs(query_path).add(base_agg)

    all_paths, split_decoders, var_to_columns = pre_process(query)

    # WE LET EACH DIMENSION ADD ITS OWN CODE FOR HANDLING INNER JOINS
    concat_outer = query_to_outer_joins(query, all_paths, {}, var_to_columns)

    start = 0
    decoders = [None] * (len(query.edges) + len(query.groupby))
    output = NestedAggs(".")
    # ONE AGGS BRANCH PER OUTER-JOIN TERM; EACH BRANCH NESTS THROUGH ALL PATHS
    for i, outer in enumerate(concat_outer.terms):
        acc = base_agg
        for p, path in enumerate(all_paths):
            decoder = split_decoders.get(path, Null)

            for d in decoder:
                decoders[d.edge.dim] = d
                acc = d.append_query(path, acc)
                start += d.num_columns

            where = first(nest.where for nest in outer.nests if nest.path == path).partial_eval()
            if where is FALSE:
                # THIS BRANCH CAN NOT MATCH; DROP IT
                continue
            elif not where or where is TRUE:
                pass
            else:
                acc = FilterAggs("_filter" + text(i) + text(p), where, None).add(acc)
            acc = NestedAggs(path).add(acc)
        output.add(acc)
    output = simplify(output)
    es_query = to_data(output.to_es(schema))
    es_query.size = 0
    return output, decoders, es_query
Exemplo n.º 12
0
 def _gen_ids():
     """
     Generator of unique ids.  Reserves blocks of 1000 ids by atomically
     bumping `next_id` in the version table inside a transaction, then
     yields from the reserved block without touching the database again.
     """
     while True:
         with db.transaction() as t:
             # first(first(...)): first row, then first column of the result
             top_id = first(
                 first(
                     t.query(
                         sql_query({
                             "select": "next_id",
                             "from": version_table
                         })).data))
             max_id = top_id + 1000
             t.execute(SQL_UPDATE + quote_column(version_table) + SQL_SET +
                       sql_eq(next_id=max_id))
         while top_id < max_id:
             yield top_id
             top_id += 1
Exemplo n.º 13
0
    def to_esfilter(self, schema):
        """
        Translate this membership test to an ES filter: term/terms on the
        resolved column, or a Painless script when the value is not a
        simple variable.  Returns MATCH_NONE when the column is unknown.
        """
        if is_op(self.value, Variable_):
            var = self.value.var
            cols = schema.leaves(var)
            if not cols:
                return MATCH_NONE
            col = first(cols)
            var = col.es_column

            if col.jx_type == BOOLEAN:
                # BOOLEAN COLUMNS NEED VALUES CONVERTED BEFORE COMPARISON
                if is_literal(
                        self.superset) and not is_many(self.superset.value):
                    return {"term": {var: value2boolean(self.superset.value)}}
                else:
                    return {
                        "terms": {
                            var: map(value2boolean, self.superset.value)
                        }
                    }
            else:
                if is_literal(
                        self.superset) and not is_many(self.superset.value):
                    return {"term": {var: self.superset.value}}
                else:
                    return {"terms": {var: self.superset.value}}
        else:
            # COMPLEX EXPRESSION; DELEGATE TO PAINLESS SCRIPTING
            return Painless[self].to_es_script(schema).to_esfilter(schema)
Exemplo n.º 14
0
    def get_or_create_user(self, details):
        """
        Find the user identified by the given token claims, inserting a
        new record when none exists.

        :param details: token claims; must contain `email`, plus `sub` or
                        `issuer` identifying the token issuer
        :return: the existing or newly-created user record
        """
        details = wrap(details)
        issuer = details.sub or details.issuer
        email = details.email
        email_verified = details.email_verified
        if not email:
            # FIX: corrected typo in error message ("propert" -> "property")
            Log.error("Expecting id_token to have claims.email property")

        result = self.db.query(
            sql_query({
                "select": ["_id", "email", "issuer"],
                "from": GROUP_TABLE,
                "where": {
                    "eq": {
                        "email": email,
                        "issuer": issuer
                    }
                },
            }))

        if result.data:
            # EXISTING USER; ATTACH CURRENT VERIFICATION STATUS
            user = Data(zip(result.header, first(result.data)))
            user.email_verified = email_verified
            return user

        new_user = wrap({
            "email": email,
            "issuer": issuer,
            "email_verified": email_verified,
            "owner": ROOT_USER._id
        })
        self._insert(GROUP_TABLE, new_user)
        return new_user
Exemplo n.º 15
0
    def select(self, select):
        """
        Apply a jx select clause to this container.

        :param select: one select clause, or a list of them
        :return: self when selecting "." as ".", else a new ListContainer
        """
        selects = listwrap(select)

        if len(selects) == 1 and is_op(selects[0].value, Variable) and selects[0].value.var == ".":
            new_schema = self.schema
            if selects[0].name == ".":
                return self
        else:
            new_schema = None

        if is_list(select):
            if all(
                is_op(s.value, Variable) and s.name == s.value.var
                for s in select
            ):
                # PURE RENAME-FREE SELECTION; SCHEMA IS A SUBSET OF COLUMNS
                names = set(s.value.var for s in select)
                new_schema = Schema(".", [c for c in self.schema.columns if c.name in names])

            push_and_pull = [(s.name, jx_expression_to_function(s.value)) for s in selects]
            def selector(d):
                output = Data()
                for n, p in push_and_pull:
                    output[n] = unwraplist(p(wrap(d)))
                return unwrap(output)

            # FIX: materialize with list() - in Py3, map() is a one-shot lazy
            # iterator, so the container's data could only be iterated once
            # (the variant of this method elsewhere in this file already does this)
            new_data = list(map(selector, self.data))
        else:
            select_value = jx_expression_to_function(select.value)
            new_data = list(map(select_value, self.data))
            if is_op(select.value, Variable):
                column = copy(first(c for c in self.schema.columns if c.name == select.value.var))
                column.name = '.'
                new_schema = Schema("from " + self.name, [column])

        return ListContainer("from "+self.name, data=new_data, schema=new_schema)
Exemplo n.º 16
0
    def select(self, select):
        """
        Apply a jx select clause to this container.

        :param select: one select clause, or a list of them
        :return: self when selecting "." as ".", else a new ListContainer
        """
        selects = listwrap(select)

        if len(selects) == 1 and is_op(selects[0].value, Variable) and selects[0].value.var == ".":
            new_schema = self.schema
            if selects[0].name == ".":
                return self
        else:
            new_schema = None

        if is_list(select):
            if all(
                is_op(s.value, Variable) and s.name == s.value.var
                for s in select
            ):
                # PURE RENAME-FREE SELECTION; SCHEMA IS A SUBSET OF COLUMNS
                names = set(s.value.var for s in select)
                new_schema = Schema(".", [c for c in self.schema.columns if c.name in names])

            push_and_pull = [(s.name, jx_expression_to_function(s.value)) for s in selects]
            def selector(d):
                output = Data()
                for n, p in push_and_pull:
                    output[n] = unwraplist(p(to_data(d)))
                return unwrap(output)

            new_data = list(map(selector, self.data))
        else:
            select_value = jx_expression_to_function(select.value)
            new_data = list(map(select_value, self.data))
            if is_op(select.value, Variable):
                # REBUILD THE SINGLE SELECTED COLUMN AS THE "." COLUMN
                column = dict(**first(c for c in self.schema.columns if c.name == select.value.var))
                column.update({"name": ".", "jx_type": NESTED, "es_type": "nested", "multi":1001, "cardinality":1})
                new_schema = Schema("from " + self.name, [Column(**column)])

        return ListContainer("from "+self.name, data=new_data, schema=new_schema)
def download_perfherder(desc, repo, id, dummy, framework):
    """
    Fetch a Perfherder signature and its data series from Treeherder, then
    log the series' values sorted by push timestamp.

    :param desc: label attached to the logged result
    :param repo: Treeherder project/repository name
    :param id: signature id to look up
    :param dummy: unused
    :param framework: Treeherder framework id
    """
    project_url = "https://treeherder.mozilla.org/api/project/" + repo
    sig_result = http.get_json(
        project_url
        + "/performance/signatures/?format=json&framework="
        + str(framework)
        + "&id="
        + str(id)
    )

    # THE RESPONSE IS KEYED BY SIGNATURE HASH; TAKE THE ONLY ONE
    signature = first(sig_result.keys())
    data_result = http.get_json(
        project_url + "/performance/data/?signatures=" + signature
    )

    series = jx.run({
        "from": ListContainer("data", data_result[signature]),
        "sort": "push_timestamp",
        "select": "value"
    }).data

    Log.note(
        "{{result|json}}",
        result={"name": desc, "data": series},
    )
Exemplo n.º 18
0
 def append_query(self, query_path, es_query):
     """
     Wrap `es_query` with a terms aggregation for this edge, plus a
     `_missing` branch for rows where the edge value does not exist.
     """
     if is_op(self.edge.value, FirstOp) and is_op(self.edge.value.term,
                                                  Variable):
         self.edge.value = self.edge.value.term  # ES USES THE FIRST TERM FOR {"terms": } AGGREGATION
     if not is_op(self.edge.value, Variable):
         # NOT A SIMPLE FIELD; COMPUTE THE TERM WITH A PAINLESS SCRIPT
         terms = TermsAggs(
             "_match", {
                 "script": {
                     "lang": "painless",
                     "inline": self.script.expr
                 },
                 "size": self.domain.limit,
                 "order": self.es_order
             }, self)
     else:
         terms = TermsAggs(
             "_match", {
                 "field": first(self.schema.leaves(
                     self.edge.value.var)).es_column,
                 "size": self.domain.limit,
                 "order": self.es_order
             }, self)
     output = Aggs()
     output.add(
         FilterAggs("_filter", self.exists, None).add(terms.add(es_query)))
     output.add(FilterAggs("_missing", self.missing, self).add(es_query))
     return output
Exemplo n.º 19
0
    def select(self, select):
        """
        Apply a jx select clause to this container.

        :param select: one select clause, or a list of them
        :return: self when selecting "." as ".", else a new ListContainer
        """
        selects = listwrap(select)

        if len(selects) == 1 and is_op(selects[0].value, Variable) and selects[0].value.var == ".":
            new_schema = self.schema
            if selects[0].name == ".":
                return self
        else:
            new_schema = None

        if is_list(select):
            if all(
                is_op(s.value, Variable) and s.name == s.value.var
                for s in select
            ):
                # PURE RENAME-FREE SELECTION; SCHEMA IS A SUBSET OF COLUMNS
                names = set(s.value.var for s in select)
                new_schema = Schema(".", [c for c in self.schema.columns if c.name in names])

            push_and_pull = [(s.name, jx_expression_to_function(s.value)) for s in selects]
            def selector(d):
                output = Data()
                for n, p in push_and_pull:
                    output[n] = unwraplist(p(wrap(d)))
                return unwrap(output)

            # FIX: materialize with list() - in Py3, map() is a one-shot lazy
            # iterator, so the container's data could only be iterated once
            # (the variant of this method elsewhere in this file already does this)
            new_data = list(map(selector, self.data))
        else:
            select_value = jx_expression_to_function(select.value)
            new_data = list(map(select_value, self.data))
            if is_op(select.value, Variable):
                column = copy(first(c for c in self.schema.columns if c.name == select.value.var))
                column.name = '.'
                new_schema = Schema("from " + self.name, [column])

        return ListContainer("from "+self.name, data=new_data, schema=new_schema)
Exemplo n.º 20
0
    def window(self, window):
        """
        Annotate this cube with a new column computed by the window's value
        expression, evaluated per cell.  Windows with edges or sort are not
        implemented.  Mutates self and returns it.
        """
        if window.edges or window.sort:
            raise NotImplementedError()

        from jx_python import jx

        # SET OP
        canonical = first(self.data.values())
        accessor = jx.get(window.value)
        cnames = self.data.keys()

        # ANNOTATE EXISTING CUBE WITH NEW COLUMN
        m = self.data[window.name] = Matrix(dims=canonical.dims)
        for coord in canonical._all_combos():
            row = Data(
            )  # IT IS SAD WE MUST HAVE A Data(), THERE ARE {"script": expression} USING THE DOT NOTATION
            for k in cnames:
                row[k] = self.data[k][coord]
            for c, e in zip(coord, self.edges):
                # EXPOSE EACH EDGE'S PARTITION VALUE TO THE EXPRESSION
                row[e.name] = e.domain.partitions[c]
            m[coord] = accessor(
                row, Null,
                Null)  # DUMMY Null VALUES BECAUSE I DO NOT KNOW WHAT TO DO

        self.select.append(window)
        return self
Exemplo n.º 21
0
 def type(self):
     """
     Return the shared type of all `whens` branches; OBJECT when the
     branches disagree.
     """
     branch_types = {
         branch.then.type if is_op(branch, WhenOp) else branch.type
         for branch in self.whens
     }
     if len(branch_types) > 1:
         return OBJECT
     return first(branch_types)
Exemplo n.º 22
0
    def to_esfilter(self, schema):
        """
        Translate this membership test to an ES filter: term/terms on the
        resolved column, or a Painless script when the value is not a
        simple variable.  Errors when the column is unknown.
        """
        if is_op(self.value, Variable_):
            var = self.value.var
            cols = schema.leaves(var)
            if not cols:
                Log.error("expecting {{var}} to be a column", var=var)
            col = first(cols)
            var = col.es_column

            if col.jx_type == BOOLEAN:
                # BOOLEAN COLUMNS NEED VALUES CONVERTED BEFORE COMPARISON
                if is_literal(self.superset) and not is_sequence(
                        self.superset.value):
                    return {"term": {var: value2boolean(self.superset.value)}}
                else:
                    return {
                        "terms": {
                            var: map(value2boolean, self.superset.value)
                        }
                    }
            else:
                if is_literal(self.superset) and not is_sequence(
                        self.superset.value):
                    return {"term": {var: self.superset.value}}
                else:
                    return {"terms": {var: self.superset.value}}
        else:
            # COMPLEX EXPRESSION; DELEGATE TO PAINLESS SCRIPTING
            return Painless[self].to_es_script(schema).to_esfilter(schema)
Exemplo n.º 23
0
def _range_composer(self, edge, domain, es_query, to_float, schema):
    """
    Build an ES range aggregation over the domain's partitions, with an
    optional `_missing` branch (when allowNulls) for values outside
    [_min, _max) or absent entirely.

    :param to_float: converts partition/domain boundaries to ES range numbers
    """
    # USE RANGES
    _min = coalesce(domain.min, MIN(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    output = Aggs()
    if edge.allowNulls:
        # VALUES THAT EXIST BUT FALL OUTSIDE [_min, _max), OR DO NOT EXIST
        output.add(
            FilterAggs(
                "_missing",
                NotOp(
                    AndOp([
                        edge.value.exists(),
                        GteOp([edge.value, Literal(to_float(_min))]),
                        LtOp([edge.value, Literal(to_float(_max))])
                    ]).partial_eval()), self).add(es_query))

    if is_op(edge.value, Variable):
        calc = {"field": first(schema.leaves(edge.value.var)).es_column}
    else:
        # NOT A SIMPLE FIELD; COMPUTE WITH A PAINLESS SCRIPT
        calc = {"script": text_type(Painless[edge.value].to_es_script(schema))}
    calc['ranges'] = [{
        "from": to_float(p.min),
        "to": to_float(p.max)
    } for p in domain.partitions]

    return output.add(RangeAggs("_match", calc, self).add(es_query))
Exemplo n.º 24
0
    def verify_jwt_token(self, token):
        """
        Verify an RS256-signed JWT against the Auth0 tenant's JWKS.

        :param token: encoded JWT access token
        :return: the decoded claims
        :raises: via Log.error on wrong algorithm, unknown key id, expired
                 signature, bad audience/issuer claims, or parse failure
        """
        jwks = http.get_json("https://" + self.auth0.domain +
                             "/.well-known/jwks.json")
        unverified_header = jwt.get_unverified_header(token)
        algorithm = unverified_header["alg"]
        if algorithm != "RS256":
            Log.error("Expecting a RS256 signed JWT Access Token")

        # FIND THE SIGNING KEY ADVERTISED IN THE TOKEN HEADER
        key_id = unverified_header["kid"]
        key = unwrap(first(key for key in jwks["keys"]
                           if key["kid"] == key_id))
        if not key:
            Log.error("could not find {{key}}", key=key_id)

        try:
            return jwt.decode(
                token,
                key,
                algorithms=algorithm,
                audience=self.auth0.api.identifier,
                issuer="https://" + self.auth0.domain + "/",
            )
        except jwt.ExpiredSignatureError as e:
            Log.error("Token has expired", code=403, cause=e)
        except jwt.JWTClaimsError as e:
            Log.error(
                "Incorrect claims, please check the audience and issuer",
                code=403,
                cause=e,
            )
        except Exception as e:
            Log.error("Problem parsing", cause=e)
Exemplo n.º 25
0
 def to_es(self, schema):
     """
     Translate this equality test to an ES filter: term/terms when the lhs
     is a variable and the rhs a literal, otherwise a Painless script.
     """
     if is_op(self.lhs, Variable_) and is_literal(self.rhs):
         lhs = self.lhs.var
         cols = schema.leaves(lhs)
         if cols:
             lhs = first(cols).es_column
         rhs = self.rhs.value
         if is_many(rhs):
             # SINGLE-ELEMENT SETS COLLAPSE TO A PLAIN term FILTER
             if len(rhs) == 1:
                 return {"term": {lhs: first(rhs)}}
             else:
                 return {"terms": {lhs: rhs}}
         else:
             return {"term": {lhs: rhs}}
     else:
         # COMPLEX EXPRESSION; DELEGATE TO PAINLESS SCRIPTING
         return Painless[self].to_es_script(schema).to_es(schema)
 def to_es(self, schema):
     """
     Translate this suffix test to an ES filter: a regexp on the resolved
     column when possible, otherwise a Painless script.
     """
     if not self.suffix:
         # AN EMPTY SUFFIX MATCHES EVERYTHING
         return MATCH_ALL
     if is_op(self.expr, Variable_) and is_literal(self.suffix):
         column = first(schema.leaves(self.expr.var)).es_column
         return {"regexp": {column: ".*" + string2regexp(self.suffix.value)}}
     return PainlessSuffixOp.to_es_script(self, schema).to_es(schema)
Exemplo n.º 27
0
 def define(cls, expr):
     """
     Build a PrefixOp from its JSON expression form.
     """
     term = expr.get('prefix')
     if not term:
         # NO OPERAND GIVEN; DEGENERATE NULL PREFIX
         return PrefixOp(NULL, NULL)
     if is_data(term):
         # {variable: constant} FORM
         variable, constant = first(term.items())
         return PrefixOp(Variable(variable), Literal(constant))
     # LIST FORM: [expr, prefix]
     return PrefixOp(*term)
Exemplo n.º 28
0
def sql_lt(**item):
    """
    RETURN SQL FOR LESS-THAN (<) COMPARISON BETWEEN VARIABLE AND VALUE

    :param item: exactly one keyword parameter; the name is the column,
                 the value is what it is compared to
    :return: SQL
    """
    # first() picks the single (name, value) pair; any extra kwargs are ignored
    k, v = first(item.items())
    return ConcatSQL(quote_column(k), SQL_LT, quote_value(v))
Exemplo n.º 29
0
 def define(cls, expr):
     """
     Build a PrefixOp from its JSON expression form: NULL operands when
     absent, a {variable: constant} pair, or an [expr, prefix] list.
     """
     term = expr.get("prefix")
     if not term:
         return PrefixOp(NULL, NULL)
     elif is_data(term):
         # {variable: constant} FORM
         expr, const = first(term.items())
         return PrefixOp(Variable(expr), Literal(const))
     else:
         # LIST FORM: BOTH SIDES ARE ARBITRARY EXPRESSIONS
         expr, const = term
         return PrefixOp(jx_expression(expr), jx_expression(const))
Exemplo n.º 30
0
 def __init__(self, terms, **clauses):
     """
     Concat expression: join `terms` using `separator` (default "", must be
     a literal), falling back to `default` (default NULL).
     """
     Expression.__init__(self, terms)
     if is_data(terms):
         # SINGLE {variable: constant} PAIR; KEPT AS A (name, value) TUPLE
         self.terms = first(terms.items())
     else:
         self.terms = terms
     self.separator = clauses.get(str("separator"), Literal(""))
     self.default = clauses.get(str("default"), NULL)
     if not is_literal(self.separator):
         Log.error("Expecting a literal separator")
Exemplo n.º 31
0
 def to_esfilter(self, schema):
     """
     Translate this prefix test to an ES filter: MATCH_ALL for an empty
     prefix, a `prefix` filter on the resolved column when possible,
     otherwise a Painless script (MATCH_NONE when it can never be true).
     """
     if not self.value:
         return MATCH_ALL
     elif is_op(self.value, Variable_) and is_literal(self.prefix):
         var = first(schema.leaves(self.value.var)).es_column
         return {"prefix": {var: self.prefix.value}}
     else:
         # FIX: was `PainlessBasicStartsWithOp.self.to_es_script(self, schema)`,
         # which raises AttributeError (classes have no `self` attribute); call
         # the unbound method directly, matching the sibling to_es* methods
         output = PainlessBasicStartsWithOp.to_es_script(self, schema)
         if output is false_script:
             return MATCH_NONE
         return output
Exemplo n.º 32
0
 def to_esfilter(self, schema):
     """
     Translate this regex test to an ES `regexp` filter.  Only a literal
     pattern applied to a single-column variable is supported; an unknown
     column yields MATCH_NONE.
     """
     if is_literal(self.pattern) and is_op(self.var, Variable_):
         cols = schema.leaves(self.var.var)
         if len(cols) == 0:
             return MATCH_NONE
         elif len(cols) == 1:
             return {"regexp": {first(cols).es_column: self.pattern.value}}
         else:
             # MULTIPLE LEAF COLUMNS; AMBIGUOUS TARGET
             Log.error("regex on not supported ")
     else:
         Log.error("regex only accepts a variable and literal pattern")
Exemplo n.º 33
0
 def __data__(self):
     """
     Serialize this container to jx "list" format.  When the schema is a
     single "." column the rows are emitted as-is; otherwise each row's
     values are unwrapped into plain dicts.
     """
     if first(self.schema.columns).name=='.':
         return wrap({
             "meta": {"format": "list"},
             "data": self.data
         })
     else:
         return wrap({
             "meta": {"format": "list"},
             "data": [{k: unwraplist(v) for k, v in row.items()} for row in self.data]
         })
 def to_es(self, schema):
     """
     Translate this missing-value test to an ES filter: `missing` on each
     resolved column (ANDed when the variable maps to several), MATCH_ALL
     when the column is unknown, or a Painless script otherwise.
     """
     if is_op(self.expr, Variable_):
         cols = schema.leaves(self.expr.var)
         if not cols:
             # UNKNOWN COLUMN IS ALWAYS MISSING
             return MATCH_ALL
         elif len(cols) == 1:
             return es_missing(first(cols).es_column)
         else:
             return es_and([es_missing(c.es_column) for c in cols])
     else:
         # COMPLEX EXPRESSION; DELEGATE TO PAINLESS SCRIPTING
         return PainlessMissingOp.to_es_script(self, schema).to_es(schema)
Exemplo n.º 35
0
    def append_query(self, query_path, es_query):
        """
        Wrap `es_query` with nested terms/missing aggregations, one layer
        per field of this (multi-field) edge, innermost field first.  Only
        the outermost layer carries the decoder.
        """
        decoder = self
        for i, v in enumerate(self.fields):
            exists = v.exists().partial_eval()
            nest = Aggs()
            nest.add(TermsAggs("_match", {
                "field": first(self.schema.leaves(v.var)).es_column,
                "size": self.domain.limit
            }, decoder).add(es_query))
            nest.add(FilterAggs("_missing", NotOp(exists), decoder).add(es_query))
            es_query = nest
            # ONLY THE FIRST LAYER OWNS THE DECODER
            decoder = None

        if self.domain.where:
            es_query = FilterAggs("_filter", self.domain.where, None).add(es_query)

        return es_query
Exemplo n.º 36
0
def _range_composer(self, edge, domain, es_query, to_float, schema):
    """
    Compose a range aggregation for a continuous-domain edge.

    Builds one "ranges" bucket per domain partition and, when the edge
    allows nulls, a "_missing" bucket for documents outside [min, max)
    or with no value at all.
    """
    # DOMAIN EXTENT, FALLING BACK TO THE PARTITION EXTREMES
    lo = coalesce(domain.min, MIN(domain.partitions.min))
    hi = coalesce(domain.max, MAX(domain.partitions.max))

    output = Aggs()
    if edge.allowNulls:
        in_range = AndOp([
            edge.value.exists(),
            GteOp([edge.value, Literal(to_float(lo))]),
            LtOp([edge.value, Literal(to_float(hi))])
        ]).partial_eval()
        output.add(FilterAggs("_missing", NotOp(in_range), self).add(es_query))

    if is_op(edge.value, Variable):
        calc = {"field": first(schema.leaves(edge.value.var)).es_column}
    else:
        # NON-TRIVIAL EDGE VALUE: COMPUTE WITH A PAINLESS SCRIPT
        calc = {"script": text_type(Painless[edge.value].to_es_script(schema))}
    calc['ranges'] = [
        {"from": to_float(p.min), "to": to_float(p.max)}
        for p in domain.partitions
    ]

    return output.add(RangeAggs("_match", calc, self).add(es_query))
Exemplo n.º 37
0
def es_setop(es, query):
    """
    Execute a set-operation (plain select, no edges/groupby) query against ES.

    Splits the requested columns by their nested path into one ESSelect per
    path, translates the where clause the same way, posts the query, and
    formats the hits using the formatter registered for query.format.

    :param es: ES facade handed to es_post
    :param query: jx query (select/where/sort/limit/format)
    :return: formatted output with timing, content_type and es_query in meta
    :raises: Log.error for un-pullable selects or formatting failures
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # ONE ESSelect PER NESTED PATH; "." IS THE DOCUMENT ROOT
    split_select = {".": ESSelect('.')}

    def get_select(path):
        # LAZILY CREATE THE ESSelect FOR A NESTED PATH
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select


    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()  # ACCUMULATES {name, value, put, pull} CLAUSES

    put_index = 0  # OUTPUT COLUMN POSITION
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    # NESTED VALUES MUST BE PULLED FROM _source
                    get_select('.').use_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select('.').use_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": put_index, "child": "."},
                    "pull": get_pull_source(".")
                })
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select('.').use_source = True
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(decode_property(n) for n in split_field(c.name))
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                # DOCUMENT ID COMES FROM THE HIT ITSELF, NOT FROM FIELDS
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": "."},
                                    "pull": lambda row: row._id
                                })
                            elif c.jx_type == NESTED:
                                get_select('.').use_source = True
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))}
                                })
                        else:
                            # COLUMN LIVES IN A NESTED DOCUMENT: ACCUMULATE A SUB-DOC
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(untype_path(relative_field(c.name, schema.query_path[0])), s_column)
                            pull = accumulate_nested_doc(c_nested_path, Variable(relative_field(s_column, unnest_path(c_nested_path))))
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child
                                },
                                "pull": pull
                            })
            else:
                # UNKNOWN COLUMN: EMIT A NULL PLACEHOLDER
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            put_index += 1
        else:
            # GENERAL EXPRESSION: COMPILE TO PAINLESS, SPLIT BY NESTED PATH
            split_scripts = split_expression_by_path(select.value, schema, lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {"script": text_type(Painless[first(script)].partial_eval().to_es_script(schema))}
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
                put_index += 1

    # GIVE EVERY CLAUSE A pull FUNCTION, IF IT DOES NOT ALREADY HAVE ONE
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select('.').use_source:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    # TRANSLATE THE where CLAUSE PER NESTED PATH, THEN ASSEMBLE THE QUERY
    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    with Timer("call to ES", silent=True) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
Exemplo n.º 38
0
    def append_query(self, query_path, es_query):
        """
        Add a terms aggregation grouping on the pipe-joined values of a
        multivalued field, with es_query nested inside it.
        """
        schema = self.query.frum.schema
        column = first(schema.leaves(self.var)).es_column
        script = expand_template(LIST_TO_PIPE, {"expr": 'doc[' + quote(column) + '].values'})
        terms = TermsAggs("_match", {"script": script}, self)
        return Aggs().add(terms.add(es_query))
Exemplo n.º 39
0
    def __new__(cls, e=None, query=None, *args, **kwargs):
        """
        Factory: choose the Decoder subclass that can handle edge e.

        Inspects e.value and e.domain (mutating e.domain in several branches)
        and returns an uninitialized instance of the matching decoder class.

        :param e: edge definition (value, domain, range, allowNulls)
        :param query: enclosing query; used to resolve columns and limits
        :raises: Log.error when the edge/domain combination is not supported
        """
        e.allowNulls = coalesce(e.allowNulls, True)

        if e.value and e.domain.type == "default":
            # if query.groupby:
            #     return object.__new__(DefaultDecoder, e)

            if is_text(e.value):
                Log.error("Expecting Variable or Expression, not plain string")

            if is_op(e.value, LeavesOp):
                return object.__new__(ObjectDecoder)
            elif is_op(e.value, TupleOp):
                # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
                # JUST PULL THE FIELDS
                if not all(is_op(t, Variable) for t in e.value.terms):
                    Log.error("Can only handle variables in tuples")

                e.domain = Data(
                    dimension={"fields": e.value.terms}
                )
                return object.__new__(DimFieldListDecoder)

            elif is_op(e.value, Variable):
                schema = query.frum.schema
                cols = schema.leaves(e.value.var)
                if not cols:
                    # UNKNOWN COLUMN
                    return object.__new__(DefaultDecoder)
                if len(cols) != 1:
                    return object.__new__(ObjectDecoder)
                col = first(cols)
                limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

                if col.cardinality == None:
                    # COLUMN METADATA NOT COLLECTED YET; FALL BACK TO DEFAULT DOMAIN
                    DEBUG and Log.warning(
                        "metadata for column {{name|quote}} (id={{id}}) is not ready",
                        name=concat_field(col.es_index, col.es_column),
                        id=id(col)
                    )
                    e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                    return object.__new__(DefaultDecoder)
                elif col.partitions == None:
                    # CARDINALITY KNOWN, BUT NO PARTITION LIST YET
                    e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                    return object.__new__(DefaultDecoder)
                else:
                    DEBUG and Log.note("id={{id}} has parts!!!", id=id(col))
                    if col.multi > 1 and len(col.partitions) < 10:
                        # MULTIVALUED COLUMN WITH FEW DISTINCT PARTS
                        return object.__new__(MultivalueDecoder)

                    # BUILD A CONCRETE SET DOMAIN FROM THE KNOWN PARTITIONS
                    partitions = col.partitions[:limit:]
                    if e.domain.sort == -1:
                        partitions = list(reversed(sorted(partitions)))
                    else:
                        partitions = sorted(partitions)
                    e.domain = SimpleSetDomain(partitions=partitions, limit=limit)

            else:
                return object.__new__(DefaultDecoder)

        if e.value and e.domain.type in PARTITION:
            return object.__new__(SetDecoder)
        if isinstance(e.domain.dimension, Dimension):
            e.domain = e.domain.dimension.getDomain()
            return object.__new__(SetDecoder)
        if e.value and e.domain.type == "time":
            return object.__new__(TimeDecoder)
        if e.range:
            return object.__new__(GeneralRangeDecoder)
        if e.value and e.domain.type == "duration":
            return object.__new__(DurationDecoder)
        elif e.value and e.domain.type == "range":
            return object.__new__(RangeDecoder)
        elif not e.value and e.domain.dimension.fields:
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            fields = e.domain.dimension.fields
            if is_data(fields):
                Log.error("No longer allowed: All objects are expressions")
            else:
                return object.__new__(DimFieldListDecoder)
        elif not e.value and all(e.domain.partitions.where):
            return object.__new__(GeneralSetDecoder)
        else:
            Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)