Example #1
def compare_to_expected(query, result, expect, places):
    query = wrap(query)
    expect = wrap(expect)

    if result.meta.format == "table":
        try:
            assertAlmostEqual(set(result.header), set(expect.header))
        except Exception as e:
            Log.error("format=table headers do not match", cause=e)

        # MAP FROM expected COLUMN TO result COLUMN
        mapping = transpose(*transpose(*filter(
            lambda v: v[0][1] == v[1][1],
            itertools.product(enumerate(expect.header), enumerate(result.header))
        ))[1])[0]
        result.header = [result.header[m] for m in mapping]

        if result.data:
            columns = transpose(*unwrap(result.data))
            result.data = transpose(*(columns[m] for m in mapping))

        if not query.sort:
            sort_table(result)
            sort_table(expect)
    elif result.meta.format == "list":
        if not query.sort:
            try:
                # result.data MAY BE A LIST OF VALUES, NOT OBJECTS
                data_columns = jx.sort(set(jx.get_columns(result.data, leaves=True)) | set(jx.get_columns(expect.data, leaves=True)), "name")
            except Exception:
                data_columns = [{"name": "."}]

            sort_order = listwrap(coalesce(query.edges, query.groupby)) + data_columns

            if is_list(expect.data):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception as _:
                    pass

            if is_list(result.data):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception as _:
                    pass

    elif result.meta.format == "cube" and len(result.edges) == 1 and result.edges[0].name == "rownum" and not query.sort:
        result_data, result_header = cube2list(result.data)
        result_header = list(map(literal_field, result_header))  # MATERIALIZE; USED TWICE BELOW
        result_data = unwrap(jx.sort(result_data, result_header))
        result.data = list2cube(result_data, result_header)

        expect_data, expect_header = cube2list(expect.data)
        expect_header = list(map(literal_field, expect_header))  # MATERIALIZE; USED TWICE BELOW
        expect_data = jx.sort(expect_data, expect_header)
        expect.data = list2cube(expect_data, expect_header)

    # CONFIRM MATCH
    assertAlmostEqual(result, expect, places=places)
Example #2
def compare_to_expected(query, result, expect):
    query = wrap(query)
    expect = wrap(expect)

    if result.meta.format == "table":
        assertAlmostEqual(set(result.header), set(expect.header))

        # MAP FROM expected COLUMN TO result COLUMN
        mapping = list(zip(*list(zip(*filter(
            lambda v: v[0][1] == v[1][1],
            itertools.product(enumerate(expect.header), enumerate(result.header))
        )))[1]))[0]
        result.header = [result.header[m] for m in mapping]

        if result.data:
            columns = list(zip(*unwrap(result.data)))
            result.data = list(zip(*[columns[m] for m in mapping]))

        if not query.sort:
            sort_table(result)
            sort_table(expect)
    elif result.meta.format == "list":
        if query["from"].startswith("meta."):
            pass
        else:
            query = QueryOp.wrap(query, query.frum, query.schema)

        if not query.sort:
            try:
                # result.data MAY BE A LIST OF VALUES, NOT OBJECTS
                data_columns = jx.sort(set(jx.get_columns(result.data, leaves=True)) | set(jx.get_columns(expect.data, leaves=True)), "name")
            except Exception as _:
                data_columns = [{"name": "."}]

            sort_order = listwrap(coalesce(query.edges, query.groupby)) + data_columns

            if isinstance(expect.data, list):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception:
                    pass

            if isinstance(result.data, list):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception:
                    pass

    elif result.meta.format == "cube" and len(result.edges) == 1 and result.edges[0].name == "rownum" and not query.sort:
        result_data, result_header = cube2list(result.data)
        result_data = unwrap(jx.sort(result_data, result_header))
        result.data = list2cube(result_data, result_header)

        expect_data, expect_header = cube2list(expect.data)
        expect_data = jx.sort(expect_data, expect_header)
        expect.data = list2cube(expect_data, expect_header)

    # CONFIRM MATCH
    assertAlmostEqual(result, expect, places=6)
Example #3
def sort_table(result):
    """
    SORT ROWS IN TABLE, EVEN IF ELEMENTS ARE JSON
    """
    data = wrap([{text(i): v for i, v in enumerate(row) if v != None} for row in result.data])
    sort_columns = jx.sort(set(jx.get_columns(data, leaves=True).name))
    data = jx.sort(data, sort_columns)
    result.data = [tuple(row[text(i)] for i in range(len(result.header))) for row in data]
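A minimal usage sketch of sort_table above, on a hypothetical two-column result (assumes wrap from mo-dots, as in the example):

result = wrap({"header": ["a", "b"], "data": [(2, "y"), (1, "x")]})
sort_table(result)
# ROWS ARE KEYED BY STRINGIFIED COLUMN INDEX, SORTED OVER EVERY LEAF
# COLUMN, THEN REBUILT AS TUPLES: result.data == [(1, "x"), (2, "y")]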
Example #4
    def test_sort_value(self):
        data = [4, 5, 3, 2, 1]
        result = jx.sort(data, {"value": ".", "sort": -1})
        expected = [5, 4, 3, 2, 1]
        self.assertEqual(result, expected)

        result = jx.sort(data, ".")
        expected = [1, 2, 3, 4, 5]
        self.assertEqual(result, expected)
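The test above documents the sort-key forms jx.sort accepts; a hedged summary sketch (data is hypothetical) combining the variants used throughout these examples:

data = [{"a": 2, "b": "x"}, {"a": 1, "b": "y"}]
jx.sort(data, "a")                         # SORT BY PROPERTY NAME, ASCENDING
jx.sort([3, 1, 2], ".")                    # "." SORTS PLAIN VALUES
jx.sort(data, {"value": "a", "sort": -1})  # EXPLICIT DIRECTION; -1 IS DESCENDING
jx.sort(data, [                            # MULTIPLE KEYS, MIXED DIRECTIONS
    {"value": "a", "sort": -1},
    {"value": "b", "sort": 1},
])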
Example #5
 def _merge(*schemas):
     if len(schemas) == 1:
         return schemas[0]
     try:
         if any(NESTED_TYPE in s for s in schemas):
             # IF THERE ARE ANY ARRAYS, THEN THE MERGE IS AN ARRAY
             new_schemas = []
             for schema in schemas:
                 if NESTED_TYPE in schema:
                     sub_schema = schema[NESTED_TYPE]
                     residue = {k: v for k, v in schema.items() if k != NESTED_TYPE}
                     new_schemas.append(_merge(sub_schema, residue))
                 else:
                     new_schemas.append(schema)
             return {NESTED_TYPE: _merge(*new_schemas)}
         else:
             return OrderedDict(
                 (k, _merge(*(ss for s in schemas for ss in [s.get(k)] if ss)))
                 for k in jx.sort(set(k for s in schemas for k in s.keys()))
             )
     except Exception as e:
         e = Except.wrap(e)
         if "Expecting types to match" in e:
             raise e
         t = list(set(schemas))
         if len(t) == 1:
             return t[0]
         elif len(t) == 2 and STRING in t and NUMBER in t:
             return STRING
         else:
             Log.error("Expecting types to match {{types|json}}", types=t)
Example #6
    def insert_list(self, table_name, records):
        if not records:
            return

        columns = set()
        for r in records:
            columns |= set(r.keys())
        columns = jx.sort(columns)

        try:
            self.execute(
                "DELETE FROM " + self.quote_column(table_name) + SQL_WHERE +
                "_id IN {{ids}}",
                {"ids": self.quote_column([r["_id"] for r in records])})

            command = (SQL_INSERT + self.quote_column(table_name) +
                       sql_iso(sql_list(self.quote_column(k)
                                        for k in columns)) + SQL_VALUES +
                       sql_iso(
                           sql_list(
                               self.quote_value(r.get(k, None))
                               for k in columns for r in records)))
            self.execute(command)
        except Exception as e:
            Log.error("problem with insert", e)
Example #7
    def insert_list(self, table_name, records):
        if not records:
            return

        columns = set()
        for r in records:
            columns |= set(r.keys())
        columns = jx.sort(columns)

        try:
            self.execute(
                "DELETE FROM " + self.quote_column(table_name) + " WHERE _id IN {{ids}}",
                {"ids": self.quote_column([r["_id"] for r in records])}
            )

            command = (
                "INSERT INTO " + self.quote_column(table_name) + "(" +
                ",".join([self.quote_column(k) for k in columns]) +
                ") VALUES " + ",\n".join([
                sql_iso(",".join([self.quote_value(r.get(k, None)) for k in columns]))
                for r in records
            ])
            )
            self.execute(command)
        except Exception as e:
            Log.error("problem with insert", e)
Example #8
    def _update_meta(self):
        if not self.dirty:
            return

        now = Date.now()
        for mc in META_COLUMNS_DESC.columns:
            count = 0
            values = set()
            objects = 0
            multi = 1
            for column in self._all_columns():
                value = column[mc.name]
                if value == None:
                    pass
                else:
                    count += 1
                    if is_list(value):
                        multi = max(multi, len(value))
                        try:
                            values |= set(value)
                        except Exception:
                            objects += len(value)
                    elif is_data(value):
                        objects += 1
                    else:
                        values.add(value)
            mc.count = count
            mc.cardinality = len(values) + objects
            mc.partitions = jx.sort(values)
            mc.multi = multi
            mc.last_updated = now

        META_COLUMNS_DESC.last_updated = now
        self.dirty = False
Example #9
    def test_bulk_aggs_list(self):
        data = wrap([{"a": "test" + text(i)} for i in range(10111)])
        expected = jx.sort([{"a": r.a, "count": 1} for r in data], "a")

        test = wrap({
            "data": data,
            "query": {
                "from": TEST_TABLE,
                "groupby": "a",
                "limit": len(data),
                "chunk_size": 1000,
                "sort": "a",
            },
            "expecting_list": {
                "data": expected[:MAX_LIMIT]
            },  # DUMMY, TO ENSURE LOADED
        })
        self.utils.execute_tests(test)

        test.query.format = "list"
        test.query.destination = "url"
        result = http.post_json(
            url=self.utils.testing.query,
            json=test.query,
        )
        self.assertEqual(result.meta.format, "list")

        @self.retry(result.url)
        def get_content():
            content = http.get_json(result.url)
            self.assertEqual(content.meta.format, "list")
            sorted_content = jx.sort(content.data, "a")
            sorted_expected = jx.sort(expected, "a")
            self.assertEqual(sorted_content, sorted_expected)
Example #10
def groupby(data, keys=None, contiguous=False):
    """
    :param data: list of data to group
    :param keys: (list of) property path name
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: return list of (keys, values) PAIRS, WHERE
                 keys IS IN LEAF FORM (FOR USE WITH {"eq": terms} OPERATOR)
                 values IS GENERATOR OF ALL VALUES THAT MATCH keys
    """
    if isinstance(data, Container):
        return data.groupby(keys)

    try:
        if not data:
            return Null

        keys = listwrap(keys)
        if not contiguous:
            from jx_python import jx
            data = jx.sort(data, keys)

        if len(keys) == 0 or len(keys) == 1 and keys[0] == '.':
            return _groupby_value(data)

        if any(is_expression(k) for k in keys):
            raise Log.error("can not handle expressions")

        accessor = jx_expression_to_function(jx_expression({"tuple": keys}))  # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__
        return _groupby_keys(data, keys, accessor)
    except Exception as e:
        Log.error("Problem grouping", cause=e)
Example #11
    def _update_meta(self):
        if not self.dirty:
            return

        for mcl in self.data.get("meta.columns").values():
            for mc in mcl:
                count = 0
                values = set()
                objects = 0
                multi = 1
                for column in self._all_columns():
                    value = column[mc.name]
                    if value == None:
                        pass
                    else:
                        count += 1
                        if is_list(value):
                            multi = max(multi, len(value))
                            try:
                                values |= set(value)
                            except Exception:
                                objects += len(value)
                        elif is_data(value):
                            objects += 1
                        else:
                            values.add(value)
                mc.count = count
                mc.cardinality = len(values) + objects
                mc.partitions = jx.sort(values)
                mc.multi = multi
                mc.last_updated = Date.now()
        self.dirty = False
Example #12
 def get_content():
     content = http.get_json(result.url)
     self.assertEqual(content.header, ["a"])
     self.assertEqual(content.meta.format, "table")
     sorted_content = jx.sort(content.data, 0)
     sorted_expected = [(row.a, ) for row in expected]
     self.assertEqual(sorted_content, sorted_expected)
Example #13
    def _update_meta(self):
        if not self.dirty:
            return

        for mcl in self.data.get("meta.columns").values():
            for mc in mcl:
                count = 0
                values = set()
                objects = 0
                multi = 1
                for column in self._all_columns():
                    value = column[mc.names["."]]
                    if value == None:
                        pass
                    else:
                        count += 1
                        if isinstance(value, list):
                            multi = max(multi, len(value))
                            try:
                                values |= set(value)
                            except Exception:
                                objects += len(value)
                        elif isinstance(value, Mapping):
                            objects += 1
                        else:
                            values.add(value)
                mc.count = count
                mc.cardinality = len(values) + objects
                mc.partitions = jx.sort(values)
                mc.multi = multi
                mc.last_updated = Date.now()
        self.dirty = False
Example #14
    def _edges_op(self, query, frum):
        query = query.copy()  # WE WILL BE MARKING UP THE QUERY
        index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
        outer_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
        frum_path = split_field(frum)
        base_table = join_field(frum_path[0:1])
        path = join_field(frum_path[1:])
        nest_to_alias = {
            nested_path: quote_column("__" + unichr(ord('a') + i) + "__")
            for i, (nested_path, sub_table) in enumerate(self.sf.tables.items())
        }

        schema = self.sf.tables[relative_field(frum, self.sf.fact)].schema

        tables = []
        for n, a in nest_to_alias.items():
            if startswith_field(path, n):
                tables.append({"nest": n, "alias": a})
        tables = jx.sort(tables, {"value": {"length": "nest"}})

        from_sql = quote_column(join_field([base_table] + split_field(tables[0].nest))) + tables[0].alias
        for previous, t in zip(tables, tables[1::]):
            from_sql += (
                SQL_LEFT_JOIN + quote_column(concat_field(base_table, t.nest)) + t.alias +
                SQL_ON + join_column(t.alias, quoted_PARENT) + " = " + join_column(previous.alias, quoted_UID)
            )

        main_filter = query.where.to_sql(schema, boolean=True)[0].sql.b

        # SHIFT THE COLUMN DEFINITIONS BASED ON THE NESTED QUERY DEPTH
        ons = []
        join_types = []
        wheres = []
        null_ons = [EXISTS_COLUMN + SQL_IS_NULL]
        groupby = []
        null_groupby = []
        orderby = []
        domains = []

        select_clause = [SQL_ONE + EXISTS_COLUMN] + [quote_column(c.es_column) for c in self.sf.tables['.'].columns]

        for edge_index, query_edge in enumerate(query.edges):
            edge_alias = quote_column("e" + text_type(edge_index))

            if query_edge.value:
                edge_values = [p for c in query_edge.value.to_sql(schema).sql for p in c.items()]

            elif not query_edge.value and any(query_edge.domain.partitions.where):
                case = SQL_CASE
                for pp, p in enumerate(query_edge.domain.partitions):
                    w = p.where.to_sql(schema)[0].sql.b
                    t = quote_value(pp)
                    case += SQL_WHEN + w + SQL_THEN + t
                case += SQL_ELSE + SQL_NULL + SQL_END  # quote value with length of partitions
                edge_values = [("n", case)]

            elif query_edge.range:
                edge_values = query_edge.range.min.to_sql(schema)[0].sql.items() + query_edge.range.max.to_sql(schema)[
                    0].sql.items()
Example #15
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            last_update = MAX([
                self.es_cluster.index_last_updated[i]
                for i in self.index_to_alias.get_domain(alias)
            ])

            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(name=alias,
                                  url=None,
                                  query_path=['.'],
                                  timestamp=Date.MIN)
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._reload_columns(table)
            elif force or table.timestamp < last_update:
                self._reload_columns(table)

            columns = self.meta.columns.find(alias, column_name)
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    if len(columns) > 10:
                        Log.note("waiting for {{num}} columns to update",
                                 num=len([
                                     c for c in columns if not c.last_updated
                                 ]))
                    else:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.es_index + "." + c.es_column for c in columns
                                if not c.last_updated
                            ])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        return []
Example #16
    def done_count(self):
        columns = list(map(text_type, range(len(self.fields))))  # MATERIALIZE; USED TWICE BELOW
        parts = wrap([{text_type(i): p for i, p in enumerate(part)} for part in set(self.parts)])
        self.parts = None
        sorted_parts = jx.sort(parts, columns)

        self.edge.domain = self.domain = SimpleSetDomain(
            key="value",
            partitions=[{"value": tuple(v[k] for k in columns), "dataIndex": i} for i, v in enumerate(sorted_parts)]
        )
Example #17
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    :param data:
    :param keys:
    :param size:
    :param min_size:
    :param max_size:
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: return list of (keys, values) PAIRS, WHERE
                 keys IS IN LEAF FORM (FOR USE WITH {"eq": terms} OPERATOR)
                 values IS GENERATOR OF ALL VALUES THAT MATCH keys
    """
    if isinstance(data, Container):
        return data.groupby(keys)

    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    try:
        keys = listwrap(keys)
        if not contiguous:
            from jx_python import jx
            data = jx.sort(data, keys)

        if not data:
            return Null

        if any(isinstance(k, Expression) for k in keys):
            Log.error("can not handle expressions")
        else:
            accessor = jx_expression_to_function(jx_expression({"tuple": keys}))  # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__

        def _output():
            start = 0
            prev = accessor(data[0])
            for i, d in enumerate(data):
                curr = accessor(d)
                if curr != prev:
                    group = {}
                    for k, gg in zip(keys, prev):
                        group[k] = gg
                    yield Data(group), data[start:i:]
                    start = i
                    prev = curr
            group = {}
            for k, gg in zip(keys, prev):
                group[k] = gg
            yield Data(group), data[start::]

        return _output()
    except Exception as e:
        Log.error("Problem grouping", cause=e)
Example #18
def _es_terms2(es, mvel, query):
    """
    WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value
    """

    # REQUEST VALUES IN FIRST DIMENSION
    q1 = query.copy()
    q1.edges = query.edges[0:1:]
    values1 = es_terms(es, mvel, q1).edges[0].domain.partitions.value

    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        for i, v in enumerate(values1):
            FromES.facets[s.name + "," + str(i)] = {
                "terms": {
                    "field": query.edges[1].value,
                    "size": coalesce(query.limit, 200000)
                },
                "facet_filter": simplify_esfilter({"and": [
                    query.where,
                    {"term": {query.edges[0].value: v}}
                ]})
            }

    data = es_post(es, FromES, query.limit)

    # UNION ALL TERMS FROM SECOND DIMENSION
    values2 = set()
    for k, f in data.facets.items():
        values2.update(f.terms.term)
    values2 = jx.sort(values2)
    term2index = {v: i for i, v in enumerate(values2)}
    query.edges[1].domain.partitions = FlatList([{"name": v, "value": v} for v in values2])

    # MAKE CUBE
    output = {}
    dims = [len(values1), len(values2)]
    for s in select:
        output[s.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for facetName, facet in data.facets.items():
        coord = facetName.split(",")
        s = [s for s in select if s.name == coord[0]][0]
        i1 = int(coord[1])
        for term in facet.terms:
            i2 = term2index[term.term]
            output[s.name][(i1, i2)] = term[aggregates[s.aggregate]]

    cube = Cube(query.select, query.edges, output)
    cube.query = query
    return cube
Example #19
    def save_money(self, remaining_budget, net_new_utility):
        remove_spot_requests = wrap([])

        # FIRST CANCEL THE PENDING REQUESTS
        if remaining_budget < 0:
            requests = self._get_managed_spot_requests()
            for r in requests:
                if r.status.code in PENDING_STATUS_CODES | PROBABLY_NOT_FOR_A_WHILE | MIGHT_HAPPEN:
                    remove_spot_requests.append(r.id)
                    net_new_utility += self.settings.utility[
                        r.launch_specification.instance_type].utility
                    remaining_budget += r.price

        instances = jx.sort(self.running_instances(), "markup.estimated_value")

        remove_list = wrap([])
        for s in instances:
            if remaining_budget >= 0:
                break
            remove_list.append(s)
            net_new_utility += coalesce(s.markup.type.utility, 0)
            remaining_budget += coalesce(s.request.bid_price,
                                         s.markup.price_80,
                                         s.markup.current_price)

        if not remove_list:
            return remaining_budget, net_new_utility

        # SEND SHUTDOWN TO EACH INSTANCE
        Log.warning("Shutdown {{instances}} to save money!",
                    instances=remove_list.id)
        if ALLOW_SHUTDOWN:
            for g, removals in jx.chunk(remove_list, size=20):
                for i, t in [(i,
                              Thread.run("teardown " + i.id,
                                         self.instance_manager.teardown,
                                         i,
                                         please_stop=False))
                             for i in removals]:
                    try:
                        t.join()
                    except Exception:
                        Log.note("Problem with shutdown of {{id}}", id=i.id)

            remove_spot_requests.extend(remove_list.spot_instance_request_id)

            # TERMINATE INSTANCES
            self.ec2_conn.terminate_instances(instance_ids=remove_list.id)

            # TERMINATE SPOT REQUESTS
            self.ec2_conn.cancel_spot_instance_requests(
                request_ids=remove_spot_requests)
        return remaining_budget, net_new_utility
Example #20
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(
            query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(name=es_index_name,
                              url=None,
                              query_path=['.'],
                              timestamp=Date.now())
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.es_index + "." + c.es_column for c in columns
                                if not c.last_updated
                            ])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}",
                      table=table_name,
                      column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)
Example #21
def json_schema_to_markdown(schema):
    from jx_python import jx

    def _md_code(code):
        return "`" + code + "`"

    def _md_italic(value):
        return "*" + value + "*"

    def _inner(schema, parent_name, indent):
        more_lines = []
        for k, v in schema.items():
            full_name = concat_field(parent_name, k)
            details = indent + "* " + _md_code(full_name)
            if v.type:
                details += " - " + _md_italic(v.type)
            else:
                Log.error("{{full_name}} is missing type", full_name=full_name)
            if v.description:
                details += " " + v.description
            more_lines.append(details)

            if v.type in ["object", "array", "nested"]:
                more_lines.extend(
                    _inner(v.properties, full_name, indent + "  "))
        return more_lines

    lines = []
    if schema.title:
        lines.append("#" + schema.title)

    lines.append(schema.description)
    lines.append("")

    for k, v in jx.sort(schema.properties.items(), 0):
        full_name = k
        if v.type in ["object", "array", "nested"]:
            lines.append("##" + _md_code(full_name) + " Property")
            if v.description:
                lines.append(v.description)
            lines.append("")

            if v.type in ["object", "array", "nested"]:
                lines.extend(_inner(v.properties, full_name, "  "))
        else:
            lines.append("##" + _md_code(full_name) + " (" + v.type + ")")
            if v.description:
                lines.append(v.description)

    return "\n".join(lines)
Example #22
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            last_update = MAX([
                self.es_cluster.index_last_updated[i]
                for i in self.index_to_alias.get_domain(alias)
            ])

            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=['.'],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._reload_columns(table)
            elif force or table.timestamp < last_update:
                self._reload_columns(table)

            columns = self.meta.columns.find(alias, column_name)
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    if len(columns) > 10:
                        Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated]))
                    else:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        return []
Example #23
def json_schema_to_markdown(schema):
    from jx_python import jx

    def _md_code(code):
        return "`"+code+"`"

    def _md_italic(value):
        return "*"+value+"*"

    def _inner(schema, parent_name, indent):
        more_lines = []
        for k,v in schema.items():
            full_name = concat_field(parent_name, k)
            details = indent+"* "+_md_code(full_name)
            if v.type:
                details += " - "+_md_italic(v.type)
            else:
                Log.error("{{full_name}} is missing type", full_name=full_name)
            if v.description:
                details += " " + v.description
            more_lines.append(details)

            if v.type in ["object", "array", "nested"]:
                more_lines.extend(_inner(v.properties, full_name, indent+"  "))
        return more_lines

    lines = []
    if schema.title:
        lines.append("#"+schema.title)

    lines.append(schema.description)
    lines.append("")

    for k, v in jx.sort(schema.properties.items(), 0):
        full_name = k
        if v.type in ["object", "array", "nested"]:
            lines.append("##"+_md_code(full_name)+" Property")
            if v.description:
                lines.append(v.description)
            lines.append("")

            if v.type in ["object", "array", "nested"]:
                lines.extend(_inner(v.properties, full_name, "  "))
        else:
            lines.append("##"+_md_code(full_name)+" ("+v.type+")")
            if v.description:
                lines.append(v.description)

    return "\n".join(lines)
Example #24
    def __init__(self, edge, query, limit):
        AggsDecoder.__init__(self, edge, query, limit)
        domain = self.domain = edge.domain
        self.sorted = None
        self.pull = pull_functions[STRING]

        # WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM
        # self.sorted = {1: "asc", -1: "desc", None: None}[getattr(edge.domain, 'sort', None)]
        edge_var = set(v.var for v in edge.value.vars())
        if query.sort:
            for s in query.sort:
                if not edge_var - set(v.var for v in s.value.vars()):
                    self.sorted = {1: "asc", -1: "desc"}[s.sort]
                    parts = jx.sort(domain.partitions, {"value": domain.key, "sort": s.sort})
                    edge.domain = self.domain = SimpleSetDomain(key=domain.key, label=domain.label, partitions=parts)
Example #25
    def __init__(self, edge, query, limit):
        AggsDecoder.__init__(self, edge, query, limit)
        domain = self.domain = edge.domain

        # WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM
        # self.sorted = {1: "asc", -1: "desc", None: None}[getattr(edge.domain, 'sort', None)]
        edge_var = edge.value.vars()
        if query.sort:
            for s in query.sort:
                if not edge_var - s.value.vars():
                    self.sorted = {1: "asc", -1: "desc"}[s.sort]
                    parts = jx.sort(domain.partitions, {"value": domain.key, "sort": s.sort})
                    edge.domain = self.domain = SimpleSetDomain(key=domain.key, label=domain.label, partitions=parts)
        else:
            self.sorted = None
Example #26
 def running_instances(self):
     # FIND THE BIGGEST, MOST EXPENSIVE REQUESTS
     instances = self._get_managed_instances()
     for r in instances:
         try:
             r.markup = self.price_lookup[r.instance_type, r.placement]
         except Exception as e:
             r.markup = self.price_lookup[r.instance_type, r.placement]
             Log.error("No pricing!!!", e)
     instances = jx.sort(instances, [{
         "value": "markup.type.utility",
         "sort": -1
     }, {
         "value": "markup.estimated_value",
         "sort": 1
     }])
     return instances
Example #27
 def __exit__(self, exc_type, exc_val, exc_tb):
     ParserElement._parse = self.previous_parse
     profile = jx.sort(
         [{
             "parser": text(parser),
             "cache_hits": cache,
             "matches": match,
             "failures": fail,
             "call_count": match + fail + cache,
             "total_parse": parse,
             "total_overhead": all - parse,
             "per_parse": parse / (match + fail),
             "per_overhead": (all - parse) / (match + fail + cache),
         } for parser, (cache, match, fail, parse, all) in timing.items()],
         {"total_parse": "desc"},
     )
     self.file.add_suffix(Date.now().format("%Y%m%d_%H%M%S")).write(
         convert.list2tab(profile))
Example #28
 def _merge(*schemas):
     if len(schemas) == 1:
         return schemas[0]
     try:
         return OrderedDict(
             (k, _merge(*[ss for s in schemas for ss in [s.get(k)] if ss]))
             for k in jx.sort(set(k for s in schemas for k in s.keys()))
         )
     except Exception as e:
         e = Except.wrap(e)
         if "Expecting types to match" in e:
             raise e
         t = list(set(schemas))
         if len(t) == 1:
             return t[0]
         elif len(t) == 2 and STRING in t and NUMBER in t:
             return STRING
         else:
             Log.error("Expecting types to match {{types|json}}", types=t)
Example #29
    def insert_list(self, table_name, records):
        if not records:
            return

        keys = set()
        for r in records:
            keys |= set(r.keys())
        keys = jx.sort(keys)

        try:
            command = (SQL_INSERT + quote_column(table_name) +
                       sql_iso(sql_list([quote_column(k) for k in keys])) +
                       SQL_VALUES + sql_list(
                           sql_iso(sql_list([quote_value(r[k]) for k in keys]))
                           for r in records))
            self.execute(command)
        except Exception as e:
            Log.error("problem with record: {{record}}",
                      record=records,
                      cause=e)
Example #30
    def insert_list(self, table_name, records):
        if not records:
            return

        keys = set()
        for r in records:
            keys |= set(r.keys())
        keys = jx.sort(keys)

        try:
            command = (
                "INSERT INTO " + quote_column(table_name) +
                sql_iso(sql_list([quote_column(k) for k in keys])) +
                " VALUES " + sql_list([
                sql_iso(sql_list([quote_value(r[k]) for k in keys]))
                for r in records
            ])
            )
            self.execute(command)
        except Exception as e:
            Log.error("problem with record: {{record}}", record=records, cause=e)
Example #31
    def insert_list(self, table_name, records):
        if not records:
            return

        keys = set()
        for r in records:
            keys |= set(r.keys())
        keys = jx.sort(keys)

        try:
            command = \
                "INSERT INTO " + self.quote_column(table_name) + "(" + \
                ",".join([self.quote_column(k) for k in keys]) + \
                ") VALUES " + ",\n".join([
                    "(" + ",".join([self.quote_value(r[k]) for k in keys]) + ")"
                    for r in records
                ])
            self.execute(command)
        except Exception as e:
            Log.error("problem with record: {{record}}",
                      record=records,
                      cause=e)
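For concreteness, with two hypothetical records the command built above comes out roughly as below (identifier quoting elided; note r[k] only tolerates missing keys when records are mo-dots Data, where an absent key yields None and quote_value renders NULL):

# insert_list("person", [{"a": 1}, {"b": 2}]) EXECUTES APPROXIMATELY:
# INSERT INTO person(a,b) VALUES
# (1,NULL),
# (NULL,2)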
Example #32
        def _schema_to_bq_schema(jx_path, es_path, schema):
            output = []
            nt = schema.get(NESTED_TYPE)
            if nt:
                schema = {NESTED_TYPE: nt}
            for t, sub_schema in jx.sort(schema.items(), 0):
                bqt = typed_to_bq_type.get(
                    t, {"field_type": "RECORD", "mode": "NULLABLE"}
                )
                full_name = es_path + escape_name(t)
                top_field = self._top_level_fields.get(text(full_name))
                if is_text(sub_schema):
                    new_field_type = json_type_to_bq_type.get(sub_schema, sub_schema)
                    if new_field_type != bqt["field_type"]:
                        # OVERRIDE TYPE
                        bqt = bqt.copy()
                        bqt["field_type"] = new_field_type
                    fields = ()
                else:
                    fields = _schema_to_bq_schema(jx_path + (t,), full_name, sub_schema)

                if top_field:
                    if fields:
                        Log.error("not expecting a structure")
                    if self._partition.field == top_field:
                        if bqt["field_type"] != "TIMESTAMP":
                            Log.error("Partition field must be of time type")
                    struct = SchemaField(name=top_field, fields=fields, **bqt)
                    top_fields.append(struct)
                elif not fields and bqt["field_type"] == "RECORD":
                    # THIS CAN HAPPEN WHEN WE MOVE A PRIMITIVE FIELD TO top_fields
                    pass
                else:
                    struct = SchemaField(
                        name=text(escape_name(t)), fields=fields, **bqt
                    )
                    output.append(struct)
            return output
Example #33
    def test_scroll_query_table(self):
        data = wrap([{"a": "test" + text(i)} for i in range(10111)])
        expected = jx.sort(data, "a")

        test = wrap({
            "data": data,
            "query": {
                "from": TEST_TABLE,
                "select": ["a"],
                "limit": len(data),
                "chunk_size": 10000,
                "sort": "a",
            },
            "expecting_list": {
                "data": expected[:MAX_LIMIT]
            },  # DUMMY, TO ENSURE LOADED
        })
        self.utils.execute_tests(test)

        test.query.format = "table"
        test.query.sort = None
        test.query.destination = "url"
        result = http.post_json(
            url=self.utils.testing.query,
            json=test.query,
        )
        self.assertEqual(result.meta.format, "table")

        @self.retry(result.url)
        def get_content():
            content = http.get_json(result.url)
            self.assertEqual(content.header, ["a"])
            self.assertEqual(content.meta.format, "table")
            sorted_content = jx.sort(content.data, 0)
            sorted_expected = [(row.a, ) for row in expected]
            self.assertEqual(sorted_content, sorted_expected)
Example #34
    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[column.es_column] for g, _ in jx.groupby(
                            self.meta.columns, column.es_column)
                        if g[column.es_column] != None
                    ])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            if column.es_index == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[column.es_column] for g, _ in jx.groupby(
                            self.meta.tables, column.es_column)
                        if g[column.es_column] != None
                    ])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [
                cc for cc in self.abs_columns
                if cc.es_column == column.es_column and cc.type == "text"
            ]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.default_es.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count": {
                                                          "filter": {
                                                              "match_all": {}
                                                          }
                                                      }
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = 1001
                multi = 1001
            elif column.es_column == "_id":
                result = self.default_es.post("/" + es_index + "/_search",
                                              data={
                                                  "query": {
                                                      "match_all": {}
                                                  },
                                                  "size": 0
                                              })
                count = cardinality = result.hits.total
                multi = 1
            else:
                result = self.default_es.post(
                    "/" + es_index + "/_search",
                    data={
                        "aggs": {
                            "count": _counting_query(column),
                            "multi": {
                                "max": {
                                    "script":
                                    "doc[" + quote(column.es_column) +
                                    "].values.size()"
                                }
                            }
                        },
                        "size": 0
                    })
                r = result.aggregations.count
                count = result.hits.total
                cardinality = coalesce(r.value, r._nested.value, r.doc_count)
                multi = coalesce(r.multi.value, 1)
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": cardinality,
                            "cardinality": cardinality,
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count
                                        ) or (count >= 1000
                                              and cardinality / count > 0.99):
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts",
                             table=column.es_index,
                             field=column.es_column,
                             num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "multi": multi,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            elif column.type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts",
                             field=column.es_index,
                             num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "multi": multi,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {
                        "path": column.nested_path[0]
                    },
                    "aggs": {
                        "_nested": {
                            "terms": {
                                "field": column.es_column
                            }
                        }
                    }
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {
                    "terms": {
                        "field": column.es_column,
                        "size": cardinality
                    }
                }

            result = self.default_es.post("/" + es_index + "/_search",
                                          data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            if DEBUG:
                Log.note("{{field}} has {{parts}}",
                         field=column.names["."],
                         parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            TEST_TABLE = "testdata"
            is_missing_index = any(
                w in e for w in
                ["IndexMissingException", "index_not_found_exception"])
            is_test_table = any(
                column.es_index.startswith(t)
                for t in [TEST_TABLE_PREFIX, TEST_TABLE])
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "clear": ".",
                        "where": {
                            "eq": {
                                "es_index": column.es_index
                            }
                        }
                    })
                self.index_does_not_exist.add(column.es_index)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "names.\\.": ".",
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)
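The snippet above reads its partition list straight out of the terms aggregation response: nested fields report their buckets one level deeper, under `_nested`. A minimal sketch of that extraction with plain dicts (the response literal is illustrative, and `sorted` stands in for `jx.sort`):

# HYPOTHETICAL RESPONSE FOR THE "_" AGGREGATION BUILT ABOVE
response = {
    "aggregations": {
        "_": {
            "buckets": [
                {"key": "beta", "doc_count": 3},
                {"key": "alpha", "doc_count": 7},
            ]
        }
    }
}

aggs = response["aggregations"]["_"]
# NESTED COLUMNS COME BACK UNDER "_nested"; FLAT COLUMNS AT THE TOP LEVEL
buckets = (aggs.get("_nested") or aggs)["buckets"]
parts = sorted(b["key"] for b in buckets)
assert parts == ["alpha", "beta"]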
Example #38
def done_count(self):
    self.edge.domain = self.domain = SimpleSetDomain(
        partitions=jx.sort(set(self.parts))
    )
    self.parts = None
    self.computed_domain = True
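`done_count` freezes the parts gathered so far into a sorted, deduplicated domain. The effect of `jx.sort(set(...))` can be shown with builtins (plain `sorted` stands in for `jx.sort` here):

parts = ["b", "a", "b", "c", "a"]
partitions = sorted(set(parts))  # WHAT jx.sort(set(self.parts)) PRODUCES
assert partitions == ["a", "b", "c"]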
Example #39
    def get_columns(self, table_name, column_name=None, after=None, timeout=None):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE WE WANT COLUMNS FOR
        :param column_name: OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
        :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME
        :param timeout: Signal; GOES True WHEN WE SHOULD GIVE UP
        :return:
        """
        DEBUG and after and Log.note("getting columns updated after {{time}}", time=after)
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=["."],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            elif after or table.timestamp < self.es_cluster.metatdata_last_updated:
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            else:
                columns = self.meta.columns.find(alias, column_name)
                DEBUG and Log.note("columns from find()")

            DEBUG and Log.note("columns are {{ids}}", ids=[id(c) for c in columns])

            columns = jx.sort(columns, "name")

            if after is None:
                return columns  # DO NOT WAIT FOR COMPLETE COLUMNS

            # WAIT FOR THE COLUMNS TO UPDATE
            while True:
                pending = [c for c in columns if after >= c.last_updated or (c.cardinality == None and c.jx_type not in STRUCT)]
                if not pending:
                    break
                if timeout:
                    Log.error("trying to gets columns timed out")
                if DEBUG:
                    if len(pending) > 10:
                        Log.note("waiting for {{num}} columns to update by {{timestamp}}", num=len(pending), timestamp=after)
                    else:
                        Log.note("waiting for columns to update by {{timestamp}}; {{columns|json}}", timestamp=after, columns=[c.es_index + "." + c.es_column + " id="+text_type(id(c)) for c in pending])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Failure to get columns for {{table}}", table=table_name, cause=e)

        return []
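The wait loop above counts a column as pending if its metadata predates `after`, or if its cardinality was never computed for a non-structural type. A self-contained sketch of that predicate (the `Col` dataclass and the STRUCT set are stand-ins for the library's types):

from dataclasses import dataclass

@dataclass
class Col:
    last_updated: float
    cardinality: object = None
    jx_type: str = "string"

STRUCT = {"object", "nested"}

after = 100.0
columns = [
    Col(last_updated=99.0, cardinality=5),      # STALE: UPDATED BEFORE after
    Col(last_updated=101.0),                    # FRESH, BUT CARDINALITY NEVER COMPUTED
    Col(last_updated=101.0, jx_type="object"),  # STRUCTURAL: CARDINALITY NOT EXPECTED
]
pending = [
    c for c in columns
    if after >= c.last_updated
    or (c.cardinality is None and c.jx_type not in STRUCT)
]
assert len(pending) == 2  # THE FIRST TWO; THE STRUCTURAL COLUMN IS DONE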
Example #40
def done_count(self):
    self.edge.allowNulls = False
    self.edge.domain = self.domain = SimpleSetDomain(
        partitions=jx.sort(set(self.parts))
    )
    self.parts = None
Example #41
    def query(self, query):
        """
        :param query: JSON Query Expression; SET `format="container"` TO MAKE A NEW TABLE OF THE RESULT
        :return:
        """
        if not startswith_field(query['from'], self.sf.fact):
            Log.error("Expecting table, or some nested table")
        frum, query['from'] = query['from'], self
        table = self.sf.tables[relative_field(frum, self.sf.fact)]
        schema = table.schema
        query = QueryOp.wrap(query, table=table, schema=schema)
        new_table = "temp_" + unique_name()

        if query.format == "container":
            create_table = "CREATE TABLE " + quote_column(new_table) + " AS "
        else:
            create_table = ""

        if query.groupby and query.format != "cube":
            op, index_to_columns = self._groupby_op(query, frum)
            command = create_table + op
        elif query.groupby:
            query.edges, query.groupby = query.groupby, query.edges
            op, index_to_columns = self._edges_op(query, frum)
            command = create_table + op
            query.edges, query.groupby = query.groupby, query.edges
        elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate):
            op, index_to_columns = self._edges_op(query, frum)
            command = create_table + op
        else:
            op = self._set_op(query, frum)
            return op

        result = self.db.query(command)

        if query.format == "container":
            output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True)
        elif query.format == "cube" or (not query.format and query.edges):
            column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
            for c in index_to_columns.values():
                column_names[c.push_column] = c.push_column_name

            if len(query.edges) == 0 and len(query.groupby) == 0:
                data = {n: Data() for n in column_names}
                for s in index_to_columns.values():
                    data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0]))
                if isinstance(query.select, list):
                    select = [{"name": s.name} for s in query.select]
                else:
                    select = {"name": query.select.name}

                return Data(
                    data=unwrap(data),
                    select=select,
                    meta={"format": "cube"}
                )

            if not result.data:
                edges = []
                dims = []
                for i, e in enumerate(query.edges + query.groupby):
                    allowNulls = coalesce(e.allowNulls, True)

                    if e.domain.type == "set" and e.domain.partitions:
                        domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                    elif e.domain.type == "range":
                        domain = e.domain
                    elif isinstance(e.value, TupleOp):
                        pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name],
                                        "push_child").pull
                        parts = [tuple(p(d) for p in pulls) for d in result.data]
                        domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                    else:
                        domain = SimpleSetDomain(partitions=[])

                    dims.append(1 if allowNulls else 0)
                    edges.append(Data(
                        name=e.name,
                        allowNulls=allowNulls,
                        domain=domain
                    ))

                data = {}
                for si, s in enumerate(listwrap(query.select)):
                    if s.aggregate == "count":
                        data[s.name] = Matrix(dims=dims, zeros=0)
                    else:
                        data[s.name] = Matrix(dims=dims)

                if isinstance(query.select, list):
                    select = [{"name": s.name} for s in query.select]
                else:
                    select = {"name": query.select.name}

                return Data(
                    meta={"format": "cube"},
                    edges=edges,
                    select=select,
                    data={k: v.cube for k, v in data.items()}
                )

            columns = None

            edges = []
            dims = []
            for g in query.groupby:
                g.is_groupby = True

            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif e.domain.type == "time":
                    domain = wrap(mo_json.scrub(e.domain))
                elif e.domain.type == "duration":
                    domain = wrap(mo_json.scrub(e.domain))
                elif isinstance(e.value, TupleOp):
                    pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    if not columns:
                        columns = list(zip(*result.data))  # MATERIALIZE; zip() IS LAZY IN PYTHON 3
                    parts = set(columns[i])
                    if e.is_groupby and None in parts:
                        allowNulls = True
                    parts -= {None}

                    if query.sort[i].sort == -1:
                        domain = SimpleSetDomain(partitions=wrap(sorted(parts, reverse=True)))
                    else:
                        domain = SimpleSetDomain(partitions=jx.sort(parts))

                dims.append(len(domain.partitions) + (1 if allowNulls else 0))
                edges.append(Data(
                    name=e.name,
                    allowNulls=allowNulls,
                    domain=domain
                ))

            data_cubes = {}
            for si, s in enumerate(listwrap(query.select)):
                if s.aggregate == "count":
                    data_cubes[s.name] = Matrix(dims=dims, zeros=0)
                else:
                    data_cubes[s.name] = Matrix(dims=dims)

            r2c = index_to_coordinate(dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
            for rownum, row in enumerate(result.data):
                coord = r2c(rownum)

                for i, s in enumerate(index_to_columns.values()):
                    if s.is_edge:
                        continue
                    if s.push_child == ".":
                        data_cubes[s.push_name][coord] = s.pull(row)
                    else:
                        data_cubes[s.push_name][coord][s.push_child] = s.pull(row)

            if query.select == None:
                select = Null
            elif isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(
                meta={"format": "cube"},
                edges=edges,
                select=select,
                data={k: v.cube for k, v in data_cubes.items()}
            )
        elif query.format == "table" or (not query.format and query.groupby):
            column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
            for c in index_to_columns.values():
                column_names[c.push_column] = c.push_column_name
            data = []
            for d in result.data:
                row = [None for _ in column_names]
                for s in index_to_columns.values():
                    if s.push_child == ".":
                        row[s.push_column] = s.pull(d)
                    elif s.num_push_columns:
                        tuple_value = row[s.push_column]
                        if tuple_value == None:
                            tuple_value = row[s.push_column] = [None] * s.num_push_columns
                        tuple_value[s.push_child] = s.pull(d)
                    elif row[s.push_column] == None:
                        row[s.push_column] = Data()
                        row[s.push_column][s.push_child] = s.pull(d)
                    else:
                        row[s.push_column][s.push_child] = s.pull(d)
                data.append(tuple(unwrap(r) for r in row))

            output = Data(
                meta={"format": "table"},
                header=column_names,
                data=data
            )
        elif query.format == "list" or (not query.edges and not query.groupby):
            if not query.edges and not query.groupby and any(listwrap(query.select).aggregate):
                if isinstance(query.select, list):
                    data = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            if data[c.push_name] == None:
                                data[c.push_name] = c.pull(result.data[0])
                            elif isinstance(data[c.push_name], list):
                                data[c.push_name].append(c.pull(result.data[0]))
                            else:
                                data[c.push_name] = [data[c.push_name], c.pull(result.data[0])]
                        else:
                            data[c.push_name][c.push_child] = c.pull(result.data[0])

                    output = Data(
                        meta={"format": "value"},
                        data=data
                    )
                else:
                    data = Data()
                    for s in index_to_columns.values():
                        if not data[s.push_child]:
                            data[s.push_child] = s.pull(result.data[0])
                        else:
                            data[s.push_child] += [s.pull(result.data[0])]
                    output = Data(
                        meta={"format": "value"},
                        data=unwrap(data)
                    )
            else:
                data = []
                for record in result.data:  # EACH record IS A DATABASE ROW, NOT A ROW NUMBER
                    row = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            row[c.push_name] = c.pull(record)
                        elif c.num_push_columns:
                            tuple_value = row[c.push_name]
                            if not tuple_value:
                                tuple_value = row[c.push_name] = [None] * c.num_push_columns
                            tuple_value[c.push_child] = c.pull(record)
                        else:
                            row[c.push_name][c.push_child] = c.pull(record)

                    data.append(row)

                output = Data(
                    meta={"format": "list"},
                    data=data
                )
        else:
            Log.error("unknown format {{format}}", format=query.format)

        return output
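One non-obvious piece of the cube branch is `index_to_coordinate(dims)`: because the database already sorted rows to conform with the edge domains, a flat row number decomposes into cube coordinates like a mixed-radix integer. A self-contained sketch of the idea (our restatement, not the library's exact helper):

def index_to_coordinate(dims):
    # MAP A FLAT ROW NUMBER TO CUBE COORDINATES, LAST DIMENSION FASTEST
    def r2c(rownum):
        coord = []
        for d in reversed(dims):
            rownum, i = divmod(rownum, d)
            coord.append(i)
        return tuple(reversed(coord))
    return r2c

r2c = index_to_coordinate([2, 3])
assert [r2c(n) for n in range(6)] == [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]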
Example #42
def int_list_packer(term, values):
    """
    return singletons, ranges and exclusions
    """
    DENSITY = 10  # a range can have holes; this is the inverse of the allowed hole density
    MIN_RANGE = 20  # minimum number of members before a range is allowed

    singletons = set()
    ranges = []
    exclude = set()

    sorted_values = jx.sort(values)  # renamed to avoid shadowing the builtin sorted()

    last = sorted_values[0]
    curr_start = last
    curr_excl = set()

    for v in sorted_values[1:]:
        if v <= last + 1:
            pass
        elif v - last > 3:
            # big step, how do we deal with it?
            if last == curr_start:
                # not a range yet, so just add as singleton
                singletons.add(last)
            elif last - curr_start - len(curr_excl) < MIN_RANGE or ((last - curr_start) < len(curr_excl) * DENSITY):
                # small ranges are singletons, sparse ranges are singletons
                singletons |= set(range(curr_start, last + 1))
                singletons -= curr_excl
            else:
                # big enough, and dense enough range
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
            curr_start = v
            curr_excl = set()
        else:
            if 1 + last - curr_start >= len(curr_excl) * DENSITY:
                # high density, keep track of excluded and continue
                add_me = set(range(last + 1, v))
                curr_excl |= add_me
            elif 1 + last - curr_start - len(curr_excl) < MIN_RANGE:
                # not big enough, convert range to singletons
                new_singles = set(range(curr_start, last + 1)) - curr_excl
                singletons = singletons | new_singles

                curr_start = v
                curr_excl = set()
            else:
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
                curr_start = v
                curr_excl = set()
        last = v

    if last == curr_start:
        # not a range yet, so just add as singleton
        singletons.add(last)
    elif last - curr_start - len(curr_excl) < MIN_RANGE or ((last - curr_start) < len(curr_excl) * DENSITY):
        # small ranges are singletons, sparse ranges are singletons
        singletons |= set(range(curr_start, last + 1))
        singletons -= curr_excl
    else:
        # big enough, and dense enough range
        ranges.append({"gte": curr_start, "lte": last})
        exclude |= curr_excl

    if ranges:
        r = {"or": [{"range": {term: r}} for r in ranges]}
        if exclude:
            r = {"and": [r, {"not": {"terms": {term: jx.sort(exclude)}}}]}
        if singletons:
            return {"or": [
                {"terms": {term: jx.sort(singletons)}},
                r
            ]}
        else:
            return r
    else:
        return {"terms": {term: values}}
Example #43
    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2
                multi = 1
            else:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column),
                        "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                    },
                    "size": 0
                })
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)