示例#1
0
    def new_leaves(self, column_name):
        """
        :param column_name:
        :return: ALL COLUMNS THAT START WITH column_name, INCLUDING DEEP COLUMNS
        """
        column_name = unnest_path(column_name)
        columns = self.columns
        all_paths = self.snowflake.sorted_query_paths

        output = {}
        for c in columns:
            if c.name == "_id" and column_name != "_id":
                continue
            if c.jx_type in OBJECTS:
                continue
            if c.cardinality == 0:
                continue
            for path in all_paths:
                if not startswith_field(unnest_path(relative_field(c.name, path)), column_name):
                    continue
                existing = output.get(path)
                if not existing:
                    output[path] = [c]
                    continue
                if len(path) > len(c.nested_path[0]):
                    continue
                if any("." + t + "." in c.es_column for t in (STRING_TYPE, NUMBER_TYPE, BOOLEAN_TYPE)):
                    # ELASTICSEARCH field TYPES ARE NOT ALLOWED
                    continue
                # ONLY THE DEEPEST COLUMN WILL BE CHOSEN
                output[path].append(c)
        return set(output.values())
示例#2
0
 def get_column_name(self, column):
     """
     RETURN THE COLUMN NAME, FROM THE PERSPECTIVE OF THIS SCHEMA
     :param column:
     :return: NAME OF column
     """
     return relative_field(column.name, query_path)
示例#3
0
    def leaves(self, column_name):
        """
        :param column_name:
        :return: ALL COLUMNS THAT START WITH column_name, NOT INCLUDING DEEPER NESTED COLUMNS
        """
        clean_name = unnest_path(column_name)

        if clean_name != column_name:
            clean_name = column_name
            cleaner = lambda x: x
        else:
            cleaner = unnest_path


        columns = self.columns
        # TODO: '.' IMPLIES ALL FIELDS FROM ABSOLUTE PERPECTIVE, ALL OTHERS ARE A RELATIVE PERSPECTIVE
        # TODO: HOW TO REFER TO FIELDS THAT MAY BE SHADOWED BY A RELATIVE NAME?
        for path in reversed(self.query_path) if clean_name == '.' else self.query_path:
            output = [
                c
                for c in columns
                if (
                    (c.name != "_id" or clean_name == "_id") and
                    (
                        (c.jx_type == EXISTS and column_name.endswith("." + EXISTS_TYPE)) or
                        c.jx_type not in OBJECTS or
                        (clean_name == '.' and c.cardinality == 0)
                    ) and
                    startswith_field(cleaner(relative_field(c.name, path)), clean_name)
                )
            ]
            if output:
                return set(output)
        return set()
示例#4
0
        def _worker(start):
            output = SchemaTree()
            root = parquet_schema_list[off.set]

            output.element = root
            max = start + coalesce(root.num_children, 0)

            if off.set == 0:
                if root.name not in ['.', 'schema', 'spark_schema', 'm', 'hive_schema', 'root']:  # some known root names
                    Log.warning("first SchemaElement is given name {{name|quote}}, name is ignored", name=root.name)
                root.name = '.'
                root.repetition_type = REQUIRED

            while off.set < max:
                off.set += 1
                child = _worker(off.set)
                parent = output
                path = relative_field(child.element.name, root.name)

                # path = split_field(relative_field(child.element.name, root.name))
                # for i, p in enumerate(path[:-1]):
                #     new_parent = parent.more[p] = SchemaTree()
                #     new_parent.element = SchemaElement(
                #         name=concat_field(root.name, join_field(path[:i+1])),
                #         repetition_type=REQUIRED
                #     )
                #     parent = new_parent
                # parent.more[path[-1]] = child
                parent.more[path] = child
            return output
示例#5
0
文件: stream.py 项目: rv404674/TUID
def needed(name, required):
    """
    RETURN SUBSET IF name IN REQUIRED
    """
    return [
        relative_field(r, name) if r and startswith_field(r, name) else None
        for r in required
    ]
示例#6
0
文件: decoders.py 项目: rv404674/TUID
    def __init__(self, edge, query, limit):
        AggsDecoder.__init__(self, edge, query, limit)
        if isinstance(edge.value, LeavesOp):
            prefix = edge.value.term.var
            flatter = lambda k: literal_field(relative_field(k, prefix))
        else:
            prefix = edge.value.var
            flatter = lambda k: relative_field(k, prefix)

        self.put, self.fields = transpose(*[
            (flatter(untype_path(c.names["."])), c.es_column)
            for c in query.frum.schema.leaves(prefix)
        ])

        self.domain = self.edge.domain = wrap({"dimension": {"fields": self.fields}})
        self.domain.limit = Math.min(coalesce(self.domain.limit, query.limit, 10), MAX_LIMIT)
        self.parts = list()
        self.key2index = {}
        self.computed_domain = False
示例#7
0
        def add_column(c, query_path):
            c.last_updated = Date.now()
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(c.names["."], query_path[0])

            with self.meta.columns.locker:
                self._upsert_column(c)
                for alias in meta.aliases:
                    c = copy(c)
                    c.es_index = alias
                    self._upsert_column(c)
示例#8
0
文件: query.py 项目: rv404674/TUID
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if isinstance(edge, text_type):
        if edge.endswith(".*"):
            prefix = edge[:-2]
            if schema:
                output = wrap([
                    {
                        "name": concat_field(prefix, literal_field(relative_field(untype_path(c.names["."]), prefix))),
                        "put": {"name": literal_field(untype_path(c.names["."]))},
                        "value": jx_expression(c.es_column, schema=schema),
                        "allowNulls": True,
                        "domain": {"type": "default"}
                    }
                    for c in schema.leaves(prefix)
                ])
                return output
            else:
                return wrap([{
                    "name": untype_path(prefix),
                    "put": {"name": literal_field(untype_path(prefix))},
                    "value": jx_expression(prefix, schema=schema),
                    "allowNulls": True,
                    "dim":dim_index,
                    "domain": {"type": "default"}
                }])

        return wrap([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit)
        }])
    else:
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not isinstance(edge.value, text_type):
            Log.error("You must name compound edges: {{edge}}",  edge= edge)

        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim":dim_index,
            "domain": {"type": "default"}
        }])
示例#9
0
 def output(doc):
     acc = []
     for h in doc.inner_hits[name].hits.hits:
         i = h._nested.offset
         obj = Data()
         for f, v in h.fields.items():
             local_path = untype_path(relative_field(f, nested_path))
             obj[local_path] = unwraplist(v)
         # EXTEND THE LIST TO THE LENGTH WE REQUIRE
         for _ in range(len(acc), i+1):
             acc.append(None)
         acc[i] = expr(obj)
     return acc
示例#10
0
 def map_to_es(self):
     """
     RETURN A MAP FROM THE NAMESPACE TO THE es_column NAME
     """
     output = {}
     for path in self.query_path:
         set_default(
             output,
             {
                 k: c.es_column
                 for c in self.columns
                 if c.jx_type not in STRUCT
                 for rel_name in [relative_field(c.name, path)]
                 for k in [rel_name, untype_path(rel_name), unnest_path(rel_name)]
             }
         )
     return output
示例#11
0
 def map_to_es(self):
     """
     RETURN A MAP FROM THE NAMESPACE TO THE es_column NAME
     """
     full_name = self.query_path
     return set_default(
         {
             relative_field(c.name, full_name): c.es_column
             for k, cs in self.lookup.items()
             # if startswith_field(k, full_name)
             for c in cs if c.jx_type not in STRUCT
         },
         {
             c.name: c.es_column
             for k, cs in self.lookup.items()
             # if startswith_field(k, full_name)
             for c in cs if c.jx_type not in STRUCT
         }
     )
示例#12
0
文件: meta.py 项目: rv404674/TUID
    def _parse_properties(self, alias, mapping, meta):
        abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)
        if any(c.cardinality == 0 and c.names['.'] != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored {{names}}",
                names=[
                    ".".join((c.es_index, c.names['.']))
                    for c in abs_columns
                    if c.cardinality == 0
                ]
            )

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(SELF_PATH)
            query_paths.append(ROOT_PATH)
            self.alias_to_query_paths[alias] = query_paths
            for i in self.index_to_alias.get_domain(alias):
                self.alias_to_query_paths[i] = query_paths

            # ADD RELATIVE NAMES
            for abs_column in abs_columns:
                abs_column.last_updated = None
                abs_column.jx_type = jx_type(abs_column)
                for query_path in query_paths:
                    abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0])
                self.todo.add(self.meta.columns.add(abs_column))
        pass
示例#13
0
    def add(self, full_name, repetition_type, type):
        """
        :param full_name: dot delimited path to the property (use dot (".") for none)
        :param repetition_type: one of OPTIONAL or NESTED (REQUIRED is not possible)
        :param json_type: the json type to store
        :return:
        """
        base_name = self.element.name
        simple_name = relative_field(full_name, base_name)
        path = split_field(simple_name)
        output = self

        if len(path) == 0:
            return output._add_one('.', full_name, repetition_type, type)
        else:
            fname = base_name
            for p in path[:-1]:
                fname = concat_field(fname, p)
                n = output.more.get(p)
                output = n or output._add_one(p, fname, OPTIONAL, object)

            return output._add_one(path[-1], full_name, repetition_type, type)
示例#14
0
    def construct_docs(self, cursor, append, please_stop):
        """
        :param cursor: ITERATOR OF RECORDS
        :param append: METHOD TO CALL WITH CONSTRUCTED DOCUMENT
        :return: (count, first, next, next_key)
        number of documents added
        the first document in the batch
        the first document of the next batch
        """
        null_values = set(self.settings.snowflake.null_values) | {None}

        count = 0
        rownum = 0
        columns = tuple(wrap(c) for c in self.schema.columns)
        with Timer("Downloading from MySQL"):
            curr_record = Null
            for rownum, row in enumerate(cursor):
                if please_stop:
                    Log.error("Got `please_stop` signal")

                nested_path = []
                next_record = None

                for c, value in zip(columns, row):
                    if value in null_values:
                        continue
                    if len(nested_path) < len(c.nested_path):
                        nested_path = unwrap(c.nested_path)
                        next_record = Data()
                    next_record[c.put] = value

                if len(nested_path) > 1:
                    path = nested_path[-2]
                    children = curr_record[path]
                    if children == None:
                        children = curr_record[path] = wrap([])
                    if len(nested_path) > 2:
                        parent_path = path
                        for path in list(reversed(nested_path[0:-2:])):
                            parent = children.last()
                            relative_path = relative_field(path, parent_path)
                            children = parent[relative_path]
                            if children == None:
                                children = parent[relative_path] = wrap([])
                            parent_path = path

                    children.append(next_record)
                    continue

                if curr_record == next_record:
                    Log.error("not expected")

                if curr_record:
                    append(curr_record["id"], count)
                    count += 1
                curr_record = next_record

            # DEAL WITH LAST RECORD
            if curr_record:
                append(curr_record["id"], count)
                count += 1

        Log.note("{{num}} documents ({{rownum}} db records)",
                 num=count,
                 rownum=rownum)
示例#15
0
    def _set_op(self, query, frum):
        # GET LIST OF COLUMNS
        frum_path = split_field(frum)
        primary_nested_path = join_field(frum_path[1:])
        vars_ = UNION([s.value.vars() for s in listwrap(query.select)])
        schema = self.sf.tables[primary_nested_path].schema

        nest_to_alias = {
            nested_path: "__" + unichr(ord('a') + i) + "__"
            for i, (nested_path, sub_table) in enumerate(self.sf.tables.items())
        }

        active_columns = {".": []}
        for cname, cols in schema.items():
            if any(startswith_field(cname, v) for v in vars_):
                for c in cols:
                    if c.type in STRUCT:
                        continue
                    nest = c.nested_path[0]
                    active = active_columns.get(nest)
                    if not active:
                        active = active_columns[nest] = []
                    active.append(c)

        for nested_path, s in self.sf.tables.items():
            for cname, cols in s.schema.items():
                if not any(startswith_field(cname, c.names[c.nested_path[0]]) for n, cc in active_columns.items() for c in cc):
                    for c in cols:
                        if c.type in STRUCT:
                            continue
                        nest = c.nested_path[0]
                        active = active_columns.get(nest)
                        if not active:
                            active = active_columns[nest] = []
                        active.append(c)

        # ANY VARS MENTIONED WITH NO COLUMNS?
        for v in vars_:
            if not any(startswith_field(cname, v) for cname in schema.keys()):
                active_columns["."].append(Column(
                    names={".": v},
                    type="null",
                    es_column=".",
                    es_index=".",
                    nested_path=["."]
                ))

        # EVERY COLUMN, AND THE INDEX IT TAKES UP
        index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
        index_to_uid = {}  # FROM NESTED PATH TO THE INDEX OF UID
        sql_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
        nest_to_alias = {
            nested_path: "__" + unichr(ord('a') + i) + "__"
            for i, (nested_path, sub_table) in enumerate(self.sf.tables.items())
            }

        sorts = []
        if query.sort:
            for s in query.sort:
                col = s.value.to_sql(schema)[0]
                for t, sql in col.sql.items():
                    json_type = sql_type_to_json_type[t]
                    if json_type in STRUCT:
                        continue
                    column_number = len(sql_selects)
                    # SQL HAS ABS TABLE REFERENCE
                    column_alias = _make_column_name(column_number)
                    sql_selects.append(sql + " AS " + column_alias)
                    if s.sort == -1:
                        sorts.append(column_alias + " IS NOT NULL")
                        sorts.append(column_alias + " DESC")
                    else:
                        sorts.append(column_alias + " IS NULL")
                        sorts.append(column_alias)

        selects = []
        primary_doc_details = Data()
        # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH
        # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED
        for nested_path, sub_table in self.sf.tables.items():
            nested_doc_details = {
                "sub_table": sub_table,
                "children": [],
                "index_to_column": {},
                "nested_path": [nested_path]  # fake the real nested path, we only look at [0] anyway
            }

            # INSERT INTO TREE
            if not primary_doc_details:
                primary_doc_details = nested_doc_details
            else:
                def place(parent_doc_details):
                    if startswith_field(nested_path, parent_doc_details['nested_path'][0]):
                        for c in parent_doc_details['children']:
                            if place(c):
                                return True
                        parent_doc_details['children'].append(nested_doc_details)

                place(primary_doc_details)

            alias = nested_doc_details['alias'] = nest_to_alias[nested_path]

            if nested_path=="." and quoted_GUID in vars_:
                column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects)
                sql_select = alias + "." + quoted_GUID
                sql_selects.append(sql_select + " AS " + _make_column_name(column_number))
                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                    push_name="_id",
                    push_column_name="_id",
                    push_column=0,
                    push_child=".",
                    sql=sql_select,
                    pull=get_column(column_number),
                    type="string",
                    column_alias=_make_column_name(column_number),
                    nested_path=[nested_path]           # fake the real nested path, we only look at [0] anyway
                )
                query.select = [s for s in listwrap(query.select) if s.name!="_id"]


            # WE ALWAYS ADD THE UID AND ORDER
            column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects)
            sql_select = alias + "." + quoted_UID
            sql_selects.append(sql_select + " AS " + _make_column_name(column_number))
            if nested_path !=".":
                index_to_column[column_number]=ColumnMapping(
                    sql=sql_select,
                    type="number",
                    nested_path=[nested_path],            # fake the real nested path, we only look at [0] anyway
                    column_alias=_make_column_name(column_number)

                )
                column_number = len(sql_selects)
                sql_select = alias + "." + quote_table(ORDER)
                sql_selects.append(sql_select + " AS " + _make_column_name(column_number))
                index_to_column[column_number]=ColumnMapping(
                    sql=sql_select,
                    type="number",
                    nested_path=[nested_path],            # fake the real nested path, we only look at [0] anyway
                    column_alias=_make_column_name(column_number)

                )

            # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM
            if nested_path not in active_columns:
                continue

            if len(active_columns[nested_path]) != 0:
                # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE
                si = 0
                for s in listwrap(query.select):
                    try:
                        column_number = len(sql_selects)
                        s.pull = get_column(column_number)
                        db_columns = s.value.to_sql(schema)

                        if isinstance(s.value, LeavesOp):
                            for column in db_columns:
                                if isinstance(column.nested_path, list):
                                    column.nested_path=column.nested_path[0]
                                if column.nested_path and column.nested_path!=nested_path:
                                    continue
                                for t, unsorted_sql in column.sql.items():
                                    json_type = sql_type_to_json_type[t]
                                    if json_type in STRUCT:
                                        continue
                                    column_number = len(sql_selects)
                                    # SQL HAS ABS TABLE REFERENCE
                                    column_alias = _make_column_name(column_number)
                                    if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split())==1:
                                        continue
                                    selects.append(concat_field(alias, unsorted_sql))
                                    sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias)
                                    index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                        push_name=literal_field(get_property_name(concat_field(s.name, column.name))),
                                        push_column_name=get_property_name(concat_field(s.name, column.name)),
                                        push_column=si,
                                        push_child=".",
                                        pull=get_column(column_number),
                                        sql=unsorted_sql,
                                        type=json_type,
                                        column_alias=column_alias,
                                        nested_path=[nested_path]           # fake the real nested path, we only look at [0] anyway
                                    )
                                    si += 1
                        else:
                            for column in db_columns:
                                if isinstance(column.nested_path, list):
                                    column.nested_path=column.nested_path[0]
                                if column.nested_path and column.nested_path!=nested_path:
                                    continue
                                for t, unsorted_sql in column.sql.items():
                                    json_type = sql_type_to_json_type[t]
                                    if json_type in STRUCT:
                                        continue
                                    column_number = len(sql_selects)
                                    # SQL HAS ABS TABLE REFERENCE
                                    column_alias = _make_column_name(column_number)
                                    if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split())==1:
                                        continue
                                    selects.append(concat_field(alias, unsorted_sql))
                                    sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias)
                                    index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                        push_name=s.name,
                                        push_column_name=s.name,
                                        push_column=si,
                                        push_child=column.name,
                                        pull=get_column(column_number),
                                        sql=unsorted_sql,
                                        type=json_type,
                                        column_alias=column_alias,
                                        nested_path=[nested_path]
                                        # fake the real nested path, we only look at [0] anyway
                                    )
                    finally:
                        si += 1
            elif startswith_field(nested_path, primary_nested_path):
                # ADD REQUIRED COLUMNS, FOR DEEP STUFF
                for ci, c in enumerate(active_columns[nested_path]):
                    if c.type in STRUCT:
                        continue

                    column_number = len(sql_selects)
                    nested_path = c.nested_path
                    unsorted_sql = nest_to_alias[nested_path[0]] + "." + quote_table(c.es_column)
                    column_alias = _make_column_name(column_number)
                    if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split())==1:
                        continue
                    selects.append(concat_field(alias, unsorted_sql))
                    sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias)
                    index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                        push_name=s.name,
                        push_column_name=s.name,
                        push_column=si,
                        push_child=relative_field(c.names["."], s.name),
                        pull=get_column(column_number),
                        sql=unsorted_sql,
                        type=c.type,
                        column_alias=column_alias,
                        nested_path=nested_path
                    )

        where_clause = query.where.to_sql(schema, boolean=True)[0].sql.b
        unsorted_sql = self._make_sql_for_one_nest_in_set_op(
            ".",
            sql_selects,
            where_clause,
            active_columns,
            index_to_column
        )

        for n, _ in self.sf.tables.items():
            sorts.append(COLUMN + text_type(index_to_uid[n]))

        ordered_sql = (
            "SELECT * FROM (\n" +
            unsorted_sql +
            "\n)" +
            "\nORDER BY\n" + ",\n".join(sorts) +
            "\nLIMIT " + quote_value(query.limit)
        )
        self.db.create_new_functions()  #creating new functions: regexp
        result = self.db.query(ordered_sql)

        def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord):
            """
            :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
            :param row: CURRENT ROW BEING EXTRACTED
            :param nested_doc_details: {
                    "nested_path": wrap_nested_path(nested_path),
                    "index_to_column": map from column number to column details
                    "children": all possible direct decedents' nested_doc_details
                 }
            :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
            :param parent_id_coord: the column number for the parent id (so we ca extract from each row)
            :return: the nested property (usually an array)
            """
            previous_doc_id = None
            doc = Null
            output = []
            id_coord = nested_doc_details['id_coord']

            while True:
                doc_id = row[id_coord]

                if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id):
                    rows.append(row)  # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc)
                    return output

                if doc_id != previous_doc_id:
                    previous_doc_id = doc_id
                    doc = Null
                    curr_nested_path = nested_doc_details['nested_path'][0]
                    index_to_column = nested_doc_details['index_to_column'].items()
                    if index_to_column:
                        for i, c in index_to_column:
                            value = row[i]
                            if value == None:
                                continue
                            if value == '':
                                continue

                            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                                # ASSIGN INNER PROPERTIES
                                relative_path=join_field([c.push_name]+split_field(c.push_child))
                            else:           # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                                relative_path=c.push_child

                            if relative_path == ".":
                                doc = value
                            elif doc is Null:
                                doc = Data()
                                doc[relative_path] = value
                            else:
                                doc[relative_path] = value

                for child_details in nested_doc_details['children']:
                    # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS
                    child_id = row[child_details['id_coord']]
                    if child_id is not None:
                        nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord)
                        if nested_value:
                            push_name = child_details['nested_path'][0]
                            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                                # ASSIGN INNER PROPERTIES
                                relative_path=relative_field(push_name, curr_nested_path)
                            else:           # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                                relative_path="."

                            if relative_path == "." and doc is Null:
                                doc = nested_value
                            elif relative_path == ".":
                                doc[push_name] = unwraplist([v[push_name] for v in nested_value])
                            elif doc is Null:
                                doc = Data()
                                doc[relative_path] = unwraplist(nested_value)
                            else:
                                doc[relative_path] = unwraplist(nested_value)

                output.append(doc)

                try:
                    row = rows.pop()
                except IndexError:
                    return output

        cols = tuple([i for i in index_to_column.values() if i.push_name != None])
        rows = list(reversed(unwrap(result.data)))
        if rows:
            row = rows.pop()
            data = _accumulate_nested(rows, row, primary_doc_details, None, None)
        else:
            data = result.data

        if query.format == "cube":
            for f, _ in self.sf.tables.items():
                if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)):
                    num_rows = len(result.data)
                    num_cols = MAX([c.push_column for c in cols]) + 1 if len(cols) else 0
                    map_index_to_name = {c.push_column: c.push_column_name for c in cols}
                    temp_data = [[None]*num_rows for _ in range(num_cols)]
                    for rownum, d in enumerate(result.data):
                        for c in cols:
                            if c.push_child == ".":
                                temp_data[c.push_column][rownum] = c.pull(d)
                            else:
                                column = temp_data[c.push_column][rownum]
                                if column is None:
                                    column = temp_data[c.push_column][rownum] = Data()
                                column[c.push_child] = c.pull(d)
                    output = Data(
                        meta={"format": "cube"},
                        data={n: temp_data[c] for c, n in map_index_to_name.items()},
                        edges=[{
                            "name": "rownum",
                            "domain": {
                                "type": "rownum",
                                "min": 0,
                                "max": num_rows,
                                "interval": 1
                            }
                        }]
                    )
                    return output

            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                num_rows = len(data)
                map_index_to_name = {c.push_column: c.push_column_name for c in cols}
                temp_data = Data()
                for rownum, d in enumerate(data):
                    for k, v in d.items():
                        if temp_data[k] == None:
                            temp_data[k] = [None] * num_rows
                        temp_data[k][rownum] = v
                return Data(
                    meta={"format": "cube"},
                    data={n: temp_data[literal_field(n)] for c, n in map_index_to_name.items()},
                    edges=[{
                        "name": "rownum",
                        "domain": {
                            "type": "rownum",
                            "min": 0,
                            "max": num_rows,
                            "interval": 1
                        }
                    }]
                )
            else:
                num_rows = len(data)
                map_index_to_name = {c.push_column: c.push_column_name for c in cols}
                temp_data = [data]

                return Data(
                    meta={"format": "cube"},
                    data={n: temp_data[c] for c, n in map_index_to_name.items()},
                    edges=[{
                        "name": "rownum",
                        "domain": {
                            "type": "rownum",
                            "min": 0,
                            "max": num_rows,
                            "interval": 1
                        }
                    }]
                )

        elif query.format == "table":
            for f, _ in self.sf.tables.items():
                if  frum.endswith(f):
                    num_column = MAX([c.push_column for c in cols])+1
                    header = [None]*num_column
                    for c in cols:
                        header[c.push_column] = c.push_column_name

                    output_data = []
                    for d in result.data:
                        row = [None] * num_column
                        for c in cols:
                            set_column(row, c.push_column, c.push_child, c.pull(d))
                        output_data.append(row)

                    return Data(
                        meta={"format": "table"},
                        header=header,
                        data=output_data
                    )
            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                num_rows = len(data)
                column_names= [None]*(max(c.push_column for c in cols) + 1)
                for c in cols:
                    column_names[c.push_column] = c.push_column_name

                temp_data = []
                for rownum, d in enumerate(data):
                    row =[None] * len(column_names)
                    for i, (k, v) in enumerate(sorted(d.items())):
                        for c in cols:
                            if k==c.push_name:
                                row[c.push_column] = v
                    temp_data.append(row)

                return Data(
                    meta={"format": "table"},
                    header=column_names,
                    data=temp_data
                )
            else:
                column_names = listwrap(query.select).name
                return Data(
                    meta={"format": "table"},
                    header=column_names,
                    data=[[d] for d in data]
                )

        else:
            for f, _ in self.sf.tables.items():
                if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)):
                    data = []
                    for d in result.data:
                        row = Data()
                        for c in cols:
                            if c.push_child == ".":
                                row[c.push_name] = c.pull(d)
                            elif c.num_push_columns:
                                tuple_value = row[c.push_name]
                                if not tuple_value:
                                    tuple_value = row[c.push_name] = [None] * c.num_push_columns
                                tuple_value[c.push_child] = c.pull(d)
                            elif not isinstance(query.select, list):   # select is value type
                                row[c.push_child]=c.pull(d)
                            else:
                                row[c.push_name][c.push_child] = c.pull(d)

                        data.append(row)

                    return Data(
                        meta={"format": "list"},
                        data=data
                    )

            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                temp_data=[]
                for rownum, d in enumerate(data):
                    row = {}
                    for k, v in d.items():
                        for c in cols:
                            if c.push_name==c.push_column_name==k:
                                    row[c.push_column_name] = v
                            elif c.push_name==k and c.push_column_name!=k:
                                    row[c.push_column_name] = v
                    temp_data.append(row)
                return Data(
                    meta={"format": "list"},
                    data=temp_data
                )
            else:
                return Data(
                    meta={"format": "list"},
                    data=data
                )
示例#16
0
文件: deep.py 项目: rv404674/TUID
def es_deepop(es, query):
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER:  ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es_query_template(query_path)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for i, f in enumerate(es_filters):
        script = AndOp("and", wheres[i]).partial_eval().to_esfilter(schema)
        set_default(f, script)

    if not wheres[1]:
        # WITHOUT NESTED CONDITIONS, WE MUST ALSO RETURN DOCS WITH NO NESTED RECORDS
        more_filter = {
            "and": [
                es_filters[0],
                {"missing": {"field": untype_path(query_path) + "." + EXISTS_TYPE}}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)

    # es_query.sort = jx_sort_to_es_sort(query.sort)
    map_to_es_columns = schema.map_to_es()
    # {c.names["."]: c.es_column for c in schema.leaves(".")}
    query_for_es = query.map(map_to_es_columns)
    es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema)

    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = FlatList()

    i = 0
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp) and isinstance(s.value.term, Variable):
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            leaves = schema.leaves(s.value.term.var)
            col_names = set()
            for c in leaves:
                if c.nested_path[0] == ".":
                    if c.jx_type == NESTED:
                        continue
                    es_query.fields += [c.es_column]
                c_name = untype_path(c.names[query_path])
                col_names.add(c_name)
                new_select.append({
                    "name": concat_field(s.name, c_name),
                    "nested_path": c.nested_path[0],
                    "put": {"name": concat_field(s.name, literal_field(c_name)), "index": i, "child": "."},
                    "pull": get_pull_function(c)
                })
                i += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                    n.put.name = n.name = n.name.lstrip(".")
                    col_names.add(n.name)
        elif isinstance(s.value, Variable):
            net_columns = schema.leaves(s.value.var)
            if not net_columns:
                new_select.append({
                    "name": s.name,
                    "nested_path": ".",
                    "put": {"name": s.name, "index": i, "child": "."},
                    "pull": NULL
                })
            else:
                for n in net_columns:
                    pull = get_pull_function(n)
                    if n.nested_path[0] == ".":
                        if n.jx_type == NESTED:
                            continue
                        es_query.fields += [n.es_column]

                    # WE MUST FIGURE OUT WHICH NAMESSPACE s.value.var IS USING SO WE CAN EXTRACT THE child
                    for np in n.nested_path:
                        c_name = untype_path(n.names[np])
                        if startswith_field(c_name, s.value.var):
                            child = relative_field(c_name, s.value.var)
                            break
                    else:
                        child = relative_field(untype_path(n.names[n.nested_path[0]]), s.value.var)

                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {
                            "name": s.name,
                            "index": i,
                            "child": child
                        }
                    })
            i += 1
        else:
            expr = s.value
            for v in expr.vars():
                for c in schema[v.var]:
                    if c.nested_path[0] == ".":
                        es_query.fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            pull_name = EXPRESSION_PREFIX + s.name
            map_to_local = MapToLocal(schema)
            pull = jx_expression_to_function(pull_name)
            post_expressions[pull_name] = compile_expression(expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []
    def get_more(please_stop):
        more.append(es_post(
            es,
            Data(
                query={"filtered": {"filter": more_filter}},
                fields=es_query.fields
            ),
            query.limit
        ))
    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    #</COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
示例#17
0
def _indexer(columns, query_path):
    all_names = set(unnest_path(c.name) for c in columns) | {"."}

    lookup_leaves = {}  # ALL LEAF VARIABLES
    for full_name in all_names:
        for c in columns:
            cname = relative_field(c.name, query_path)
            nfp = unnest_path(cname)
            if (
                startswith_field(nfp, full_name) and
                c.es_type not in [EXISTS, OBJECT, NESTED] and
                (c.es_column != "_id" or full_name == "_id")
            ):
                cs = lookup_leaves.setdefault(full_name, set())
                cs.add(c)
                cs = lookup_leaves.setdefault(untype_path(full_name), set())
                cs.add(c)

    lookup_variables = {}  # ALL NOT-NESTED VARIABLES
    for full_name in all_names:
        for c in columns:
            cname = relative_field(c.name, query_path)
            nfp = unnest_path(cname)
            if (
                startswith_field(nfp, full_name) and
                c.es_type not in [EXISTS, OBJECT] and
                (c.es_column != "_id" or full_name == "_id") and
                startswith_field(c.nested_path[0], query_path)
            ):
                cs = lookup_variables.setdefault(full_name, set())
                cs.add(c)
                cs = lookup_variables.setdefault(untype_path(full_name), set())
                cs.add(c)

    relative_lookup = {}
    for c in columns:
        try:
            cname = relative_field(c.name, query_path)
            cs = relative_lookup.setdefault(cname, set())
            cs.add(c)

            ucname = untype_path(cname)
            cs = relative_lookup.setdefault(ucname, set())
            cs.add(c)
        except Exception as e:
            Log.error("Should not happen", cause=e)

    if query_path != ".":
        # ADD ABSOLUTE NAMES TO THE NAMESAPCE
        absolute_lookup, more_leaves, more_variables = _indexer(columns, ".")
        for k, cs in absolute_lookup.items():
            if k not in relative_lookup:
                relative_lookup[k] = cs
        for k, cs in more_leaves.items():
            if k not in lookup_leaves:
                lookup_leaves[k] = cs
        for k, cs in more_variables.items():
            if k not in lookup_variables:
                lookup_variables[k] = cs

    return relative_lookup, lookup_leaves, lookup_variables
示例#18
0
def es_setop(es, query):
    schema = query.frum.schema

    es_query, filters = es_query_template(schema.query_path[0])
    nested_filter = None
    set_default(filters[0], query.where.partial_eval().to_esfilter(schema))
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.stored_fields = FlatList()

    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema
    # columns = schema.columns
    # nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp) and isinstance(
                select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name,
                    relative_field(untype_path(c.names["."]), term.var))
                if c.jx_type == NESTED:
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        },
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                elif c.nested_path[0] != ".":
                    pass  # THE NESTED PARENT WILL CAPTURE THIS
                else:
                    es_query.stored_fields += [c.es_column]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        }
                    })
                    put_index += 1
        elif isinstance(select.value, Variable):
            s_column = select.value.var
            # LEAVES OF OBJECT
            leaves = schema.leaves(s_column)
            nested_selects = {}
            if leaves:
                if s_column == '.':
                    # PULL ALL SOURCE
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                        "pull": get_pull_source(".")
                    })
                elif any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    es_query.stored_fields = ["_source"]
                    for c in leaves:
                        if len(
                                c.nested_path
                        ) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRT LEVEL PROPERTIES
                            jx_name = untype_path(c.names["."])
                            new_select.append({
                                "name":
                                select.name,
                                "value":
                                Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": relative_field(jx_name, s_column)
                                },
                                "pull":
                                get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        if len(c.nested_path) == 1:
                            jx_name = untype_path(c.names["."])
                            if c.jx_type == NESTED:
                                es_query.stored_fields = ["_source"]
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child":
                                        relative_field(jx_name, s_column)
                                    },
                                    "pull":
                                    get_pull_source(c.es_column)
                                })

                            else:
                                es_query.stored_fields += [c.es_column]
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child":
                                        relative_field(jx_name, s_column)
                                    }
                                })
                        else:
                            if not nested_filter:
                                where = filters[0].copy()
                                nested_filter = [where]
                                for k in filters[0].keys():
                                    filters[0][k] = None
                                set_default(
                                    filters[0],
                                    es_and([where, es_or(nested_filter)]))

                            nested_path = c.nested_path[0]
                            if nested_path not in nested_selects:
                                where = nested_selects[nested_path] = Data()
                                nested_filter += [where]
                                where.nested.path = nested_path
                                where.nested.query.match_all = {}
                                where.nested.inner_hits._source = False
                                where.nested.inner_hits.stored_fields += [
                                    c.es_column
                                ]

                                child = relative_field(
                                    untype_path(c.names[schema.query_path[0]]),
                                    s_column)
                                pull = accumulate_nested_doc(
                                    nested_path,
                                    Variable(
                                        relative_field(
                                            s_column,
                                            unnest_path(nested_path))))
                                new_select.append({
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child
                                    },
                                    "pull": pull
                                })
                            else:
                                nested_selects[
                                    nested_path].nested.inner_hits.stored_fields += [
                                        c.es_column
                                    ]
            else:
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    }
                })
            put_index += 1
        else:
            painless = select.value.partial_eval().to_es_script(schema)
            es_query.script_fields[literal_field(select.name)] = es_script(
                painless.script(schema))
            new_select.append({
                "name":
                select.name,
                "pull":
                jx_expression_to_function("fields." +
                                          literal_field(select.name)),
                "put": {
                    "name": select.name,
                    "index": put_index,
                    "child": "."
                }
            })
            put_index += 1

    for n in new_select:
        if n.pull:
            continue
        elif isinstance(n.value, Variable):
            if es_query.stored_fields[0] == "_source":
                es_query.stored_fields = ["_source"]
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter"):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
示例#19
0
文件: setop.py 项目: rv404674/TUID
def es_setop(es, query):
    schema = query.frum.schema

    es_query, filters = es_query_template(schema.query_path[0])
    nested_filter = None
    set_default(filters[0], query.where.partial_eval().to_esfilter(schema))
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.stored_fields = FlatList()

    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema
    # columns = schema.columns
    # nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp) and isinstance(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.names["."]), term.var))
                if c.jx_type == NESTED:
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                elif c.nested_path[0] != ".":
                    pass  # THE NESTED PARENT WILL CAPTURE THIS
                else:
                    es_query.stored_fields += [c.es_column]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif isinstance(select.value, Variable):
            s_column = select.value.var
            # LEAVES OF OBJECT
            leaves = schema.leaves(s_column)
            nested_selects = {}
            if leaves:
                if s_column == '.':
                    # PULL ALL SOURCE
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": put_index, "child": "."},
                        "pull": get_pull_source(".")
                    })
                elif any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    es_query.stored_fields = ["_source"]
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRT LEVEL PROPERTIES
                            jx_name = untype_path(c.names["."])
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        if len(c.nested_path) == 1:
                            jx_name = untype_path(c.names["."])
                            if c.jx_type == NESTED:
                                es_query.stored_fields = ["_source"]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                    "pull": get_pull_source(c.es_column)
                                })

                            else:
                                es_query.stored_fields += [c.es_column]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)}
                                })
                        else:
                            if not nested_filter:
                                where = filters[0].copy()
                                nested_filter = [where]
                                for k in filters[0].keys():
                                    filters[0][k] = None
                                set_default(
                                    filters[0],
                                    es_and([where, es_or(nested_filter)])
                                )

                            nested_path = c.nested_path[0]
                            if nested_path not in nested_selects:
                                where = nested_selects[nested_path] = Data()
                                nested_filter += [where]
                                where.nested.path = nested_path
                                where.nested.query.match_all = {}
                                where.nested.inner_hits._source = False
                                where.nested.inner_hits.stored_fields += [c.es_column]

                                child = relative_field(untype_path(c.names[schema.query_path[0]]), s_column)
                                pull = accumulate_nested_doc(nested_path, Variable(relative_field(s_column, unnest_path(nested_path))))
                                new_select.append({
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child
                                    },
                                    "pull": pull
                                })
                            else:
                                nested_selects[nested_path].nested.inner_hits.stored_fields += [c.es_column]
            else:
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            put_index += 1
        else:
            painless = select.value.partial_eval().to_es_script(schema)
            es_query.script_fields[literal_field(select.name)] = es_script(painless.script(schema))
            new_select.append({
                "name": select.name,
                "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                "put": {"name": select.name, "index": put_index, "child": "."}
            })
            put_index += 1

    for n in new_select:
        if n.pull:
            continue
        elif isinstance(n.value, Variable):
            if es_query.stored_fields[0] == "_source":
                es_query.stored_fields = ["_source"]
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter"):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
示例#20
0
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if is_text(edge):
        if edge.endswith(".*"):
            prefix = edge[:-2]
            if schema:
                output = list_to_data([
                    {  # BECASUE THIS IS A GROUPBY, EARLY SPLIT INTO LEAVES WORKS JUST FINE
                        "name":
                        concat_field(
                            prefix,
                            literal_field(
                                relative_field(untype_path(c.name), prefix))),
                        "put": {
                            "name": literal_field(untype_path(c.name))
                        },
                        "value":
                        jx_expression(c.es_column, schema=schema),
                        "allowNulls":
                        True,
                        "domain": {
                            "type": "default"
                        }
                    } for c in schema.leaves(prefix)
                ])
                return output
            else:
                return list_to_data([{
                    "name": untype_path(prefix),
                    "put": {
                        "name": literal_field(untype_path(prefix))
                    },
                    "value": LeavesOp(Variable(prefix)),
                    "allowNulls": True,
                    "dim": dim_index,
                    "domain": {
                        "type": "default"
                    }
                }])

        return list_to_data([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit)
        }])
    else:
        edge = to_data(edge)
        if (edge.domain and edge.domain.type != "default"):
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not is_text(edge.value):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        return list_to_data([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {
                "type": "default"
            }
        }])
示例#21
0
def get_selects(query):
    schema = query.frum.schema
    split_select = {".": ESSelect(".")}

    def get_select(path):
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    get_select(".").set_op = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": ".",
                        },
                        "pull": get_pull_source(c.es_column),
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": ".",
                        },
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select(".").set_op = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_source("."),
                })
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select(".").set_op = True
                    for c in leaves:
                        if (
                                len(c.nested_path) == 1
                        ):  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(
                                decode_property(n)
                                for n in split_field(c.name))
                            new_select.append({
                                "name":
                                select.name,
                                "value":
                                Variable(c.es_column),
                                "put": {
                                    "name":
                                    select.name,
                                    "index":
                                    put_index,
                                    "child":
                                    untype_path(
                                        relative_field(pre_child, s_column)),
                                },
                                "pull":
                                get_pull_source(c.es_column),
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": ".",
                                    },
                                    "pull":
                                    lambda row: row._id,
                                })
                            elif c.jx_type == NESTED:
                                get_select(".").set_op = True
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name":
                                        select.name,
                                        "index":
                                        put_index,
                                        "child":
                                        untype_path(
                                            relative_field(
                                                pre_child, s_column)),
                                    },
                                    "pull":
                                    get_pull_source(c.es_column),
                                })
                            else:
                                get_select(c_nested_path).fields.append(
                                    c.es_column)
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name":
                                        select.name,
                                        "index":
                                        put_index,
                                        "child":
                                        untype_path(
                                            relative_field(
                                                pre_child, s_column)),
                                    },
                                })
                        else:
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(
                                untype_path(
                                    relative_field(c.name,
                                                   schema.query_path[0])),
                                s_column,
                            )
                            pull = accumulate_nested_doc(
                                c_nested_path,
                                Variable(
                                    relative_field(
                                        s_column, unnest_path(c_nested_path))),
                            )
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child,
                                },
                                "pull": pull,
                            })
            else:
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
            put_index += 1
        else:
            split_scripts = split_expression_by_path(select.value,
                                                     schema,
                                                     lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {
                    "script":
                    text(Painless[first(script)].partial_eval().to_es_script(
                        schema))
                }
                new_select.append({
                    "name":
                    select.name,
                    "pull":
                    jx_expression_to_function("fields." +
                                              literal_field(select.name)),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
                put_index += 1
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select(".").set_op:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")
    return new_select, split_select
示例#22
0
def es_deepop(es, query):
    schema = query.frum.schema
    columns = schema.columns
    query_path = schema.query_path

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER:  ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es_query_template(query_path)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for i, f in enumerate(es_filters):
        script = AndOp("and", wheres[i]).partial_eval().to_esfilter(schema)
        set_default(f, script)

    if not wheres[1]:
        more_filter = {
            "bool": {
                "must":
                [AndOp("and", wheres[0]).partial_eval().to_esfilter(schema)],
                "must_not": {
                    "nested": {
                        "path": query_path,
                        "query": {
                            "match_all": {}
                        }
                    }
                }
            }
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)

    # es_query.sort = jx_sort_to_es_sort(query.sort)
    map_to_es_columns = schema.map_to_es()
    # {c.names["."]: c.es_column for c in schema.leaves(".")}
    query_for_es = query.map(map_to_es_columns)
    es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema)

    es_query.stored_fields = []

    is_list = isinstance(query.select, list)
    new_select = FlatList()

    i = 0
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp) and isinstance(
                s.value.term, Variable):
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            leaves = schema.leaves(s.value.term.var)
            col_names = set()
            for c in leaves:
                if c.nested_path[0] == ".":
                    if c.type == NESTED:
                        continue
                    es_query.stored_fields += [c.es_column]
                c_name = untype_path(c.names[query_path])
                col_names.add(c_name)
                new_select.append({
                    "name": concat_field(s.name, c_name),
                    "nested_path": c.nested_path[0],
                    "put": {
                        "name": concat_field(s.name, literal_field(c_name)),
                        "index": i,
                        "child": "."
                    },
                    "pull": get_pull_function(c)
                })
                i += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(
                        ".") not in col_names:
                    n.put.name = n.name = n.name.lstrip(".")
                    col_names.add(n.name)
        elif isinstance(s.value, Variable):
            net_columns = schema.leaves(s.value.var)
            if not net_columns:
                new_select.append({
                    "name": s.name,
                    "nested_path": ".",
                    "put": {
                        "name": s.name,
                        "index": i,
                        "child": "."
                    },
                    "pull": NULL
                })
            else:
                for n in net_columns:
                    pull = get_pull_function(n)
                    if n.nested_path[0] == ".":
                        if n.type == NESTED:
                            continue
                        es_query.stored_fields += [n.es_column]

                    # WE MUST FIGURE OUT WHICH NAMESSPACE s.value.var IS USING SO WE CAN EXTRACT THE child
                    for np in n.nested_path:
                        c_name = untype_path(n.names[np])
                        if startswith_field(c_name, s.value.var):
                            child = relative_field(c_name, s.value.var)
                            break
                    else:
                        child = relative_field(
                            untype_path(n.names[n.nested_path[0]]),
                            s.value.var)

                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {
                            "name": s.name,
                            "index": i,
                            "child": child
                        }
                    })
            i += 1
        else:
            expr = s.value
            for v in expr.vars():
                for c in schema[v]:
                    if c.nested_path[0] == ".":
                        es_query.stored_fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            pull_name = EXPRESSION_PREFIX + s.name
            map_to_local = {
                untype_path(k): get_pull(cc)
                for k, c in schema.lookup.items() for cc in c
                if cc.type not in STRUCT
            }
            pull = jx_expression_to_function(pull_name)
            post_expressions[pull_name] = compile_expression(
                expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {
                    "name": s.name,
                    "index": i,
                    "child": "."
                }
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(
            es_post(
                es,
                Data(query=more_filter, stored_fields=es_query.stored_fields),
                query.limit))

    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t

    #</COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
示例#23
0
def es_deepop(es, query):
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER:  ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es_query_template(query_path)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for f, w in zip_longest(es_filters, wheres):
        script = ES52[AndOp(w).partial_eval()].to_es(schema)
        set_default(f, script)

    if not wheres[1]:
        # INCLUDE DOCS WITH NO NESTED DOCS
        more_filter = {
            "bool": {
                "filter": [AndOp(wheres[0]).partial_eval().to_es(schema)],
                "must_not": {
                    "nested": {
                        "path": query_path,
                        "query": MATCH_ALL
                    }
                }
            }
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)

    # es_query.sort = jx_sort_to_es_sort(query.sort)
    map_to_es_columns = schema.map_to_es()
    # {c.name: c.es_column for c in schema.leaves(".")}
    query_for_es = query.map(map_to_es_columns)
    es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema)

    es_query.stored_fields = []

    is_list = is_list_(query.select)
    selects = list_to_data([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()

    put_index = 0
    for select in selects:
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            leaves = schema.leaves(select.value.term.var)
            col_names = set()
            for c in leaves:
                if c.nested_path[0] == ".":
                    if c.jx_type in INTERNAL:
                        continue
                    es_query.stored_fields += [c.es_column]
                c_name = untype_path(relative_field(c.name, query_path))
                col_names.add(c_name)
                new_select.append({
                    "name": concat_field(select.name, c_name),
                    "nested_path": c.nested_path[0],
                    "put": {
                        "name": concat_field(select.name,
                                             literal_field(c_name)),
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_function(c)
                })
                put_index += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(
                        ".") not in col_names:
                    n.put.name = n.name = n.name.lstrip(".")
                    col_names.add(n.name)
        elif is_op(select.value, Variable):
            net_columns = schema.leaves(select.value.var)
            if not net_columns:
                new_select.append({
                    "name": select.name,
                    "nested_path": ".",
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": NULL
                })
            else:
                for n in net_columns:
                    if n.nested_path[0] == ".":
                        if n.jx_type == NESTED:
                            continue
                        es_query.stored_fields += [n.es_column]

                    if len(n.nested_path[0]) > len(query_path):
                        # SELECTING INNER PROPERTIES IS NOT ALLOWED
                        continue
                    # WE MUST FIGURE OUT WHICH NAMESPACE s.value.var IS USING SO WE CAN EXTRACT THE child
                    for np in n.nested_path:
                        c_name = untype_path(relative_field(n.name, np))
                        if startswith_field(c_name, select.value.var):
                            child = relative_field(c_name, select.value.var)
                            break
                    else:
                        raise Log.error("Not expected")

                    pull = get_pull_function(n)
                    new_select.append({
                        "name": select.name,
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": child
                        }
                    })
            put_index += 1
        else:
            expr = select.value
            for v in expr.vars():
                for c in schema[v.var]:
                    if c.nested_path[0] == ".":
                        es_query.stored_fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            pull_name = EXPRESSION_PREFIX + select.name
            map_to_local = MapToLocal(schema)
            pull = jx_expression_to_function(pull_name)
            post_expressions[pull_name] = jx_expression_to_function(
                expr.map(map_to_local))

            new_select.append({
                "name": select.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {
                    "name": select.name,
                    "index": put_index,
                    "child": "."
                }
            })
            put_index += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(
            es.search(
                Data(query=more_filter, stored_fields=es_query.stored_fields)))

    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es.search(es_query)

    # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t

    # </COMPLICATED>

    try:
        formatter, row_formatter, mime_type = set_formatters[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
示例#24
0
    def _edges_op(self, query, frum):
        index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
        outer_selects = [
        ]  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
        frum_path = split_field(frum)
        base_table = join_field(frum_path[0:1])
        path = join_field(frum_path[1:])
        nest_to_alias = {
            nested_path: "__" + unichr(ord('a') + i) + "__"
            for i, (nested_path,
                    sub_table) in enumerate(self.sf.tables.items())
        }

        schema = self.sf.tables[relative_field(frum, self.sf.fact)].schema

        tables = []
        for n, a in nest_to_alias.items():
            if startswith_field(path, n):
                tables.append({"nest": n, "alias": a})
        tables = jx.sort(tables, {"value": {"length": "nest"}})

        from_sql = join_field(
            [base_table] + split_field(tables[0].nest)) + " " + tables[0].alias
        previous = tables[0]
        for t in tables[1::]:
            from_sql += ("\nLEFT JOIN\n" +
                         quote_table(concat_field(base_table, t.nest)) + " " +
                         t.alias + " ON " + t.alias + "." + PARENT + " = " +
                         previous.alias + "." + UID)

        # SHIFT THE COLUMN DEFINITIONS BASED ON THE NESTED QUERY DEPTH
        ons = []
        join_types = []
        wheres = []
        not_ons = ["__exists__ IS NULL"]
        groupby = []
        not_groupby = []
        orderby = []
        domains = []
        select_clause = [
            "1 __exists__"  # USED TO DISTINGUISH BETWEEN NULL-BECAUSE-LEFT-JOIN OR NULL-BECAUSE-NULL-VALUE
        ]

        for edge_index, query_edge in enumerate(query.edges):
            edge_alias = "e" + text_type(edge_index)

            if query_edge.value:
                edge_values = [
                    p for c in query_edge.value.to_sql(schema).sql
                    for p in c.items()
                ]

            elif not query_edge.value and any(
                    query_edge.domain.partitions.where):
                case = "CASE "
                for pp, p in enumerate(query_edge.domain.partitions):
                    w = p.where.to_sql(schema)[0].sql.b
                    t = quote_value(pp)
                    case += " WHEN " + w + " THEN " + t
                case += " ELSE NULL END "  # quote value with length of partitions
                edge_values = [("n", case)]

            elif query_edge.range:
                edge_values = query_edge.range.min.to_sql(schema)[0].sql.items(
                ) + query_edge.range.max.to_sql(schema)[0].sql.items()
示例#25
0
def es_setop(es, query):
    schema = query.frum.schema
    query_path = schema.query_path[0]

    split_select = {".": ESSelect('.')}

    def get_select(path):
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    get_select('.').use_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        },
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        }
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select('.').use_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_source(".")
                })
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select('.').use_source = True
                    for c in leaves:
                        if len(
                                c.nested_path
                        ) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(
                                decode_property(n)
                                for n in split_field(c.name))
                            new_select.append({
                                "name":
                                select.name,
                                "value":
                                Variable(c.es_column),
                                "put": {
                                    "name":
                                    select.name,
                                    "index":
                                    put_index,
                                    "child":
                                    untype_path(
                                        relative_field(pre_child, s_column))
                                },
                                "pull":
                                get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": "."
                                    },
                                    "pull":
                                    lambda row: row._id
                                })
                            elif c.jx_type == NESTED:
                                get_select('.').use_source = True
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name":
                                        select.name,
                                        "index":
                                        put_index,
                                        "child":
                                        untype_path(
                                            relative_field(
                                                pre_child, s_column))
                                    },
                                    "pull":
                                    get_pull_source(c.es_column)
                                })
                            else:
                                get_select(c_nested_path).fields.append(
                                    c.es_column)
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name":
                                        select.name,
                                        "index":
                                        put_index,
                                        "child":
                                        untype_path(
                                            relative_field(
                                                pre_child, s_column))
                                    }
                                })
                        else:
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(
                                untype_path(
                                    relative_field(c.name,
                                                   schema.query_path[0])),
                                s_column)
                            pull = accumulate_nested_doc(
                                c_nested_path,
                                Variable(
                                    relative_field(
                                        s_column, unnest_path(c_nested_path))))
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child
                                },
                                "pull": pull
                            })
            else:
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    }
                })
            put_index += 1
        else:
            split_scripts = split_expression_by_path(select.value,
                                                     schema,
                                                     lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {
                    "script":
                    text_type(Painless[first(
                        script)].partial_eval().to_es_script(schema))
                }
                new_select.append({
                    "name":
                    select.name,
                    "pull":
                    jx_expression_to_function("fields." +
                                              literal_field(select.name)),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    }
                })
                put_index += 1

    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select('.').use_source:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    with Timer("call to ES", silent=DEBUG) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    # Log.note("{{output}}", output=T)

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
示例#26
0
def get_pull(column):
    if column.nested_path[0] == ".":
        return concat_field("fields", literal_field(column.es_column))
    else:
        rel_name = relative_field(column.es_column, column.nested_path[0])
        return concat_field("_inner", rel_name)
示例#27
0
        def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord):
            """
            :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
            :param row: CURRENT ROW BEING EXTRACTED
            :param nested_doc_details: {
                    "nested_path": wrap_nested_path(nested_path),
                    "index_to_column": map from column number to column details
                    "children": all possible direct decedents' nested_doc_details
                 }
            :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
            :param parent_id_coord: the column number for the parent id (so we ca extract from each row)
            :return: the nested property (usually an array)
            """
            previous_doc_id = None
            doc = Null
            output = []
            id_coord = nested_doc_details['id_coord']

            while True:
                doc_id = row[id_coord]

                if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id):
                    rows.append(row)  # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc)
                    return output

                if doc_id != previous_doc_id:
                    previous_doc_id = doc_id
                    doc = Null
                    curr_nested_path = nested_doc_details['nested_path'][0]
                    index_to_column = nested_doc_details['index_to_column'].items()
                    if index_to_column:
                        for i, c in index_to_column:
                            value = row[i]
                            if value == None:
                                continue
                            if value == '':
                                continue

                            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                                # ASSIGN INNER PROPERTIES
                                relative_path=join_field([c.push_name]+split_field(c.push_child))
                            else:           # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                                relative_path=c.push_child

                            if relative_path == ".":
                                doc = value
                            elif doc is Null:
                                doc = Data()
                                doc[relative_path] = value
                            else:
                                doc[relative_path] = value

                for child_details in nested_doc_details['children']:
                    # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS
                    child_id = row[child_details['id_coord']]
                    if child_id is not None:
                        nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord)
                        if nested_value:
                            push_name = child_details['nested_path'][0]
                            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                                # ASSIGN INNER PROPERTIES
                                relative_path=relative_field(push_name, curr_nested_path)
                            else:           # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                                relative_path="."

                            if relative_path == "." and doc is Null:
                                doc = nested_value
                            elif relative_path == ".":
                                doc[push_name] = unwraplist([v[push_name] for v in nested_value])
                            elif doc is Null:
                                doc = Data()
                                doc[relative_path] = unwraplist(nested_value)
                            else:
                                doc[relative_path] = unwraplist(nested_value)

                output.append(doc)

                try:
                    row = rows.pop()
                except IndexError:
                    return output
示例#28
0
        def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id,
                               parent_id_coord):
            """
            :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
            :param row: CURRENT ROW BEING EXTRACTED
            :param nested_doc_details: {
                    "nested_path": wrap_nested_path(nested_path),
                    "index_to_column": map from column number to column details
                    "children": all possible direct decedents' nested_doc_details
                 }
            :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
            :param parent_id_coord: the column number for the parent id (so we ca extract from each row)
            :return: the nested property (usually an array)
            """
            previous_doc_id = None
            doc = Data()
            output = []
            id_coord = nested_doc_details['id_coord']

            while True:
                doc_id = row[id_coord]

                if doc_id == None or (parent_id_coord is not None and
                                      row[parent_id_coord] != parent_doc_id):
                    rows.append(row)  # UNDO
                    output = unwraplist(output)
                    return output if output else None

                if doc_id != previous_doc_id:
                    previous_doc_id = doc_id
                    doc = Data()
                    curr_nested_path = nested_doc_details['nested_path'][0]
                    if isinstance(query.select, list) or isinstance(
                            query.select.value, LeavesOp):
                        # ASSIGN INNER PROPERTIES
                        for i, c in nested_doc_details[
                                'index_to_column'].items():
                            value = row[i]
                            if value == None:
                                continue
                            if value == '':
                                continue

                            relative_path = relative_field(
                                concat_field(c.push_name, c.push_child),
                                curr_nested_path)
                            if relative_path == ".":
                                doc = value
                            else:
                                doc[relative_path] = value
                    else:
                        # ASSIGN INNER PROPERTIES
                        for i, c in nested_doc_details[
                                'index_to_column'].items():
                            value = row[i]
                            if value is not None:
                                relative_path = relative_field(
                                    c.push_child, curr_nested_path)
                                if relative_path == ".":
                                    doc = value
                                else:
                                    doc[relative_path] = value
                    output.append(doc)

                # ASSIGN NESTED ARRAYS
                for child_details in nested_doc_details['children']:
                    child_id = row[child_details['id_coord']]
                    if child_id is not None:
                        nested_value = _accumulate_nested(
                            rows, row, child_details, doc_id, id_coord)
                        if nested_value is not None:
                            path = child_details['nested_path'][0]
                            doc[path] = nested_value

                try:
                    row = rows.pop()
                except IndexError:
                    output = unwraplist(output)
                    return output if output else None
示例#29
0
        def follow_paths(position, path, nested_path, done_relations,
                         no_nested_docs):
            if position.name in self.settings.exclude:
                return
            if DEBUG:
                Log.note("Trace {{path}}", path=path)
            if position.name != "__ids__":
                # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR WHEN IF IT FAILS)
                self.db.query("SELECT * FROM " +
                              quote_column(position.name, position.schema) +
                              " LIMIT 1")

            if position.name in reference_all_tables:
                no_nested_docs = True
            if position.name in reference_only_tables:
                return

            curr_join_list = copy(nested_path_to_join[nested_path[0]])

            # INNER OBJECTS
            referenced_tables = list(
                jx.groupby(
                    jx.filter(
                        relations, {
                            "eq": {
                                "table.name": position.name,
                                "table.schema": position.schema
                            }
                        }), "constraint.name"))
            for g, constraint_columns in referenced_tables:
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                if any(cc for cc in constraint_columns
                       if cc.referenced.table.name in self.settings.exclude):
                    continue

                done_relations.add(g["constraint.name"])

                many_to_one_joins = nested_path_to_join[nested_path[0]]
                index = len(many_to_one_joins)

                alias = "t" + text_type(index)
                for c in constraint_columns:
                    c.referenced.table.alias = alias
                    c.table = position
                many_to_one_joins.append({
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path
                })

                # referenced_table_path = join_field(split_field(path) + ["/".join(constraint_columns.referenced.table.name)])
                # HANDLE THE COMMON *id SUFFIX
                name = []
                for a, b in zip(constraint_columns.column.name,
                                constraint_columns.referenced.table.name):
                    if a.startswith(b):
                        name.append(b)
                    elif a.endswith("_id"):
                        name.append(a[:-3])
                    else:
                        name.append(a)
                referenced_column_path = join_field(
                    split_field(path) + ["/".join(name)])
                col_pointer_name = relative_field(referenced_column_path,
                                                  nested_path[0])
                # insert into nested1 VALUES (100, 10, 'aaa', -1);
                # id.about.time.nested1 .ref=10
                # id.about.time.nested1 .ref.name
                for col in columns:
                    if col.table.name == constraint_columns[
                            0].referenced.table.name and col.table.schema == constraint_columns[
                                0].referenced.table.schema:
                        col_full_name = concat_field(
                            col_pointer_name, literal_field(col.column.name))

                        if col.is_id and col.table.name == fact_table.name and col.table.schema == fact_table.schema:
                            # ALWAYS SHOW THE ID OF THE FACT
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                True,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                            })
                        elif col.column.name == constraint_columns[
                                0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                                if self.settings.show_foreign_keys else None
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                                if self.settings.show_foreign_keys else None
                            })
                        elif col.reference:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_pointer_name
                                if not self.settings.show_foreign_keys else
                                col_full_name  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                            })
                        elif col.include:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                            })

                if position.name in reference_only_tables:
                    continue

                todo.append(
                    Data(position=copy(constraint_columns[0].referenced.table),
                         path=referenced_column_path,
                         nested_path=nested_path,
                         done_relations=copy(done_relations),
                         no_nested_docs=no_nested_docs))

            # NESTED OBJECTS
            if not no_nested_docs:
                for g, constraint_columns in jx.groupby(
                        jx.filter(
                            relations, {
                                "eq": {
                                    "referenced.table.name": position.name,
                                    "referenced.table.schema": position.schema
                                }
                            }), "constraint.name"):
                    g = unwrap(g)
                    constraint_columns = deepcopy(constraint_columns)
                    if g["constraint.name"] in done_relations:
                        continue
                    done_relations.add(g["constraint.name"])

                    many_table = set(constraint_columns.table.name)
                    if not (many_table - self.settings.exclude):
                        continue

                    referenced_column_path = join_field(
                        split_field(path) + ["/".join(many_table)])
                    new_nested_path = [referenced_column_path] + nested_path
                    all_nested_paths.append(new_nested_path)

                    # if new_path not in self.settings.include:
                    #     Log.note("Exclude nested path {{path}}", path=new_path)
                    #     continue
                    one_to_many_joins = nested_path_to_join[
                        referenced_column_path] = copy(curr_join_list)
                    index = len(one_to_many_joins)
                    alias = "t" + text_type(index)
                    for c in constraint_columns:
                        c.table.alias = alias
                        c.referenced.table = position
                    one_to_many_joins.append(
                        set_default({}, g, {
                            "children": True,
                            "join_columns": constraint_columns,
                            "path": path,
                            "nested_path": nested_path
                        }))
                    # insert into nested1 VALUES (100, 10, 'aaa', -1); # id.about.time.nested1 .ref=10# id.about.time.nested1 .ref.name
                    for col in columns:
                        if col.table.name == constraint_columns[
                                0].table.name and col.table.schema == constraint_columns[
                                    0].table.schema:
                            col_full_name = join_field(
                                split_field(referenced_column_path)
                                [len(split_field(new_nested_path[0])):] +
                                [literal_field(col.column.name)])

                            if col.column.name == constraint_columns[
                                    0].column.name:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text_type(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if
                                    self.settings.show_foreign_keys else None
                                })
                            elif col.is_id:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text_type(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if
                                    self.settings.show_foreign_keys else None
                                })
                            else:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text_type(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if col.include else None
                                })

                    todo.append(
                        Data(position=constraint_columns[0].table,
                             path=referenced_column_path,
                             nested_path=new_nested_path,
                             done_relations=copy(done_relations),
                             no_nested_docs=no_nested_docs))
示例#30
0
    def query(self, query):
        """
        :param query:  JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
        :return:
        """
        if not startswith_field(query['from'], self.sf.fact):
            Log.error("Expecting table, or some nested table")
        frum, query['from'] = query['from'], self
        table = self.sf.tables[relative_field(frum, self.sf.fact)]
        schema = table.schema
        query = QueryOp.wrap(query, self, self.namespace)
        new_table = "temp_" + unique_name()

        if query.format == "container":
            create_table = "CREATE TABLE " + quote_column(new_table) + " AS "
        else:
            create_table = ""

        if query.groupby and query.format != "cube":
            op, index_to_columns = self._groupby_op(query, frum)
            command = create_table + op
        elif query.groupby:
            query.edges, query.groupby = query.groupby, query.edges
            op, index_to_columns = self._edges_op(query, frum)
            command = create_table + op
            query.edges, query.groupby = query.groupby, query.edges
        elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate):
            op, index_to_columns = self._edges_op(query, frum)
            command = create_table + op
        else:
            op = self._set_op(query, frum)
            return op

        result = self.db.query(command)

        if query.format == "container":
            output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True)
        elif query.format == "cube" or (not query.format and query.edges):
            column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
            for c in index_to_columns.values():
                column_names[c.push_column] = c.push_column_name

            if len(query.edges) == 0 and len(query.groupby) == 0:
                data = {n: Data() for n in column_names}
                for s in index_to_columns.values():
                    data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0]))
                if is_list(query.select):
                    select = [{"name": s.name} for s in query.select]
                else:
                    select = {"name": query.select.name}

                return Data(
                    data=unwrap(data),
                    select=select,
                    meta={"format": "cube"}
                )

            if not result.data:
                edges = []
                dims = []
                for i, e in enumerate(query.edges + query.groupby):
                    allowNulls = coalesce(e.allowNulls, True)

                    if e.domain.type == "set" and e.domain.partitions:
                        domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                    elif e.domain.type == "range":
                        domain = e.domain
                    elif is_op(e.value, TupleOp):
                        pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name],
                                        "push_child").pull
                        parts = [tuple(p(d) for p in pulls) for d in result.data]
                        domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                    else:
                        domain = SimpleSetDomain(partitions=[])

                    dims.append(1 if allowNulls else 0)
                    edges.append(Data(
                        name=e.name,
                        allowNulls=allowNulls,
                        domain=domain
                    ))

                data = {}
                for si, s in enumerate(listwrap(query.select)):
                    if s.aggregate == "count":
                        data[s.name] = Matrix(dims=dims, zeros=0)
                    else:
                        data[s.name] = Matrix(dims=dims)

                if is_list(query.select):
                    select = [{"name": s.name} for s in query.select]
                else:
                    select = {"name": query.select.name}

                return Data(
                    meta={"format": "cube"},
                    edges=edges,
                    select=select,
                    data={k: v.cube for k, v in data.items()}
                )

            columns = None

            edges = []
            dims = []
            for g in query.groupby:
                g.is_groupby = True

            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif e.domain.type == "time":
                    domain = wrap(mo_json.scrub(e.domain))
                elif e.domain.type == "duration":
                    domain = wrap(mo_json.scrub(e.domain))
                elif is_op(e.value, TupleOp):
                    pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    if not columns:
                        columns = zip(*result.data)
                    parts = set(columns[i])
                    if e.is_groupby and None in parts:
                        allowNulls = True
                    parts -= {None}

                    if query.sort[i].sort == -1:
                        domain = SimpleSetDomain(partitions=wrap(sorted(parts, reverse=True)))
                    else:
                        domain = SimpleSetDomain(partitions=jx.sort(parts))

                dims.append(len(domain.partitions) + (1 if allowNulls else 0))
                edges.append(Data(
                    name=e.name,
                    allowNulls=allowNulls,
                    domain=domain
                ))

            data_cubes = {}
            for si, s in enumerate(listwrap(query.select)):
                if s.aggregate == "count":
                    data_cubes[s.name] = Matrix(dims=dims, zeros=0)
                else:
                    data_cubes[s.name] = Matrix(dims=dims)

            r2c = index_to_coordinate(dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
            for rownum, row in enumerate(result.data):
                coord = r2c(rownum)

                for i, s in enumerate(index_to_columns.values()):
                    if s.is_edge:
                        continue
                    if s.push_child == ".":
                        data_cubes[s.push_name][coord] = s.pull(row)
                    else:
                        data_cubes[s.push_name][coord][s.push_child] = s.pull(row)

            if query.select == None:
                select = Null
            elif is_list(query.select):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(
                meta={"format": "cube"},
                edges=edges,
                select=select,
                data={k: v.cube for k, v in data_cubes.items()}
            )
        elif query.format == "table" or (not query.format and query.groupby):
            column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
            for c in index_to_columns.values():
                column_names[c.push_column] = c.push_column_name
            data = []
            for d in result.data:
                row = [None for _ in column_names]
                for s in index_to_columns.values():
                    if s.push_child == ".":
                        row[s.push_column] = s.pull(d)
                    elif s.num_push_columns:
                        tuple_value = row[s.push_column]
                        if tuple_value == None:
                            tuple_value = row[s.push_column] = [None] * s.num_push_columns
                        tuple_value[s.push_child] = s.pull(d)
                    elif row[s.push_column] == None:
                        row[s.push_column] = Data()
                        row[s.push_column][s.push_child] = s.pull(d)
                    else:
                        row[s.push_column][s.push_child] = s.pull(d)
                data.append(tuple(unwrap(r) for r in row))

            output = Data(
                meta={"format": "table"},
                header=column_names,
                data=data
            )
        elif query.format == "list" or (not query.edges and not query.groupby):
            if not query.edges and not query.groupby and any(listwrap(query.select).aggregate):
                if is_list(query.select):
                    data = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            if data[c.push_name] == None:
                                data[c.push_name] = c.pull(result.data[0])
                            elif is_list(data[c.push_name]):
                                data[c.push_name].append(c.pull(result.data[0]))
                            else:
                                data[c.push_name] = [data[c.push_name], c.pull(result.data[0])]
                        else:
                            data[c.push_name][c.push_child] = c.pull(result.data[0])

                    output = Data(
                        meta={"format": "value"},
                        data=data
                    )
                else:
                    data = Data()
                    for s in index_to_columns.values():
                        if not data[s.push_child]:
                            data[s.push_child] = s.pull(result.data[0])
                        else:
                            data[s.push_child] += [s.pull(result.data[0])]
                    output = Data(
                        meta={"format": "value"},
                        data=unwrap(data)
                    )
            else:
                data = []
                for rownum in result.data:
                    row = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            row[c.push_name] = c.pull(rownum)
                        elif c.num_push_columns:
                            tuple_value = row[c.push_name]
                            if not tuple_value:
                                tuple_value = row[c.push_name] = [None] * c.num_push_columns
                            tuple_value[c.push_child] = c.pull(rownum)
                        else:
                            row[c.push_name][c.push_child] = c.pull(rownum)

                    data.append(row)

                output = Data(
                    meta={"format": "list"},
                    data=data
                )
        else:
            Log.error("unknown format {{format}}", format=query.format)

        return output
    def _edges_op(self, query, frum):
        query = query.copy()  # WE WILL BE MARKING UP THE QUERY
        index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
        outer_selects = [
        ]  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
        base_table, path = tail_field(frum)
        nest_to_alias = {
            nested_path: quote_column("__" + unichr(ord('a') + i) + "__")
            for i, (nested_path,
                    sub_table) in enumerate(self.sf.tables.items())
        }

        schema = self.sf.tables[relative_field(frum, self.sf.fact)].schema

        tables = []
        for n, a in nest_to_alias.items():
            if startswith_field(path, n):
                tables.append({"nest": n, "alias": a})
        tables = jx.sort(tables, {"value": {"length": "nest"}})

        from_sql = quote_column(
            join_field([base_table] +
                       split_field(tables[0].nest))) + tables[0].alias
        for previous, t in zip(tables, tables[1::]):
            from_sql += (SQL_LEFT_JOIN +
                         quote_column(concat_field(base_table, t.nest)) +
                         t.alias + SQL_ON +
                         join_column(t.alias, quoted_PARENT) + " = " +
                         join_column(previous.alias, quoted_UID))

        main_filter = query.where.to_sql(schema, boolean=True)[0].sql.b

        # SHIFT THE COLUMN DEFINITIONS BASED ON THE NESTED QUERY DEPTH
        ons = []
        join_types = []
        wheres = []
        null_ons = [EXISTS_COLUMN + SQL_IS_NULL]
        groupby = []
        null_groupby = []
        orderby = []
        domains = []

        select_clause = [SQL_ONE + EXISTS_COLUMN] + [
            quote_column(c.es_column) for c in self.sf.tables["."].columns
        ]

        for edge_index, query_edge in enumerate(query.edges):
            edge_alias = quote_column("e" + text_type(edge_index))

            if query_edge.value:
                edge_values = [
                    p for c in query_edge.value.to_sql(schema).sql
                    for p in c.items()
                ]

            elif not query_edge.value and any(
                    query_edge.domain.partitions.where):
                case = SQL_CASE
                for pp, p in enumerate(query_edge.domain.partitions):
                    w = p.where.to_sql(schema)[0].sql.b
                    t = quote_value(pp)
                    case += SQL_WHEN + w + SQL_THEN + t
                case += SQL_ELSE + SQL_NULL + SQL_END  # quote value with length of partitions
                edge_values = [("n", case)]

            elif query_edge.range:
                edge_values = query_edge.range.min.to_sql(schema)[0].sql.items(
                ) + query_edge.range.max.to_sql(schema)[0].sql.items()
    def to_sql(self, schema, not_null=False, boolean=False, many=True):
        var_name = self.var
        if var_name == GUID:
            return wrap([{
                "name": ".",
                "sql": {
                    "s": quoted_GUID
                },
                "nested_path": ROOT_PATH
            }])
        cols = schema.leaves(var_name)
        if not cols:
            # DOES NOT EXIST
            return wrap([{
                "name": ".",
                "sql": {
                    "0": SQL_NULL
                },
                "nested_path": ROOT_PATH
            }])
        acc = {}
        if boolean:
            for col in cols:
                cname = relative_field(col.name, var_name)
                nested_path = col.nested_path[0]
                if col.type == OBJECT:
                    value = SQL_TRUE
                elif col.type == BOOLEAN:
                    value = quote_column(col.es_column)
                else:
                    value = quote_column(col.es_column) + SQL_IS_NOT_NULL
                tempa = acc.setdefault(nested_path, {})
                tempb = tempa.setdefault(get_property_name(cname), {})
                tempb["b"] = value
        else:
            for col in cols:
                cname = relative_field(col.name, var_name)
                if col.jx_type == OBJECT:
                    prefix = self.var + "."
                    for cn, cs in schema.items():
                        if cn.startswith(prefix):
                            for child_col in cs:
                                tempa = acc.setdefault(
                                    child_col.nested_path[0], {})
                                tempb = tempa.setdefault(
                                    get_property_name(cname), {})
                                tempb[json_type_to_sql_type[
                                    col.type]] = quote_column(
                                        child_col.es_column)
                else:
                    nested_path = col.nested_path[0]
                    tempa = acc.setdefault(nested_path, {})
                    tempb = tempa.setdefault(get_property_name(cname), {})
                    tempb[json_type_to_sql_type[col.jx_type]] = quote_column(
                        col.es_column)

        return wrap([{
            "name": cname,
            "sql": types,
            "nested_path": nested_path
        } for nested_path, pairs in acc.items()
                     for cname, types in pairs.items()])
示例#33
0
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if isinstance(edge, text_type):
        if edge.endswith(".*"):
            prefix = edge[:-2]
            if schema:
                output = wrap([{
                    "name":
                    concat_field(
                        prefix,
                        literal_field(
                            relative_field(untype_path(c.names["."]),
                                           prefix))),
                    "put": {
                        "name": literal_field(untype_path(c.names["."]))
                    },
                    "value":
                    jx_expression(c.es_column),
                    "allowNulls":
                    True,
                    "domain": {
                        "type": "default"
                    }
                } for c in schema.leaves(prefix)])
                return output
            else:
                return wrap([{
                    "name": untype_path(prefix),
                    "put": {
                        "name": literal_field(untype_path(prefix))
                    },
                    "value": jx_expression(prefix),
                    "allowNulls": True,
                    "dim": dim_index,
                    "domain": {
                        "type": "default"
                    }
                }])

        return wrap([{
            "name": edge,
            "value": jx_expression(edge),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {
                "type": "default"
            }
        }])
    else:
        edge = wrap(edge)
        if (edge.domain
                and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not isinstance(edge.value, text_type):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {
                "type": "default"
            }
        }])
示例#34
0
def get_pull(column):
    if column.nested_path[0] == ".":
        return concat_field("fields", literal_field(column.es_column))
    else:
        rel_name = relative_field(column.es_column, column.nested_path[0])
        return concat_field("_inner", rel_name)
示例#35
0
    def _set_op(self, query, frum):
        # GET LIST OF COLUMNS
        primary_nested_path = join_field(split_field(frum)[1:])
        vars_ = UNION([s.value.vars() for s in listwrap(query.select)])

        nest_to_alias = {
            nested_path: "__" + unichr(ord('a') + i) + "__"
            for i, (nested_path,
                    sub_table) in enumerate(self.nested_tables.items())
        }

        active_columns = {".": []}
        for cname, cols in self.columns.items():
            if any(startswith_field(cname, v) for v in vars_):
                for c in cols:
                    if c.type in STRUCT:
                        continue
                    nest = c.nested_path[0]
                    active = active_columns.get(nest)
                    if not active:
                        active = active_columns[nest] = []
                    active.append(c)
        # ANY VARS MENTIONED WITH NO COLUMNS?
        for v in vars_:
            if not any(
                    startswith_field(cname, v)
                    for cname in self.columns.keys()):
                active_columns["."].append(
                    Column(names={self.name: v},
                           type="null",
                           es_column=".",
                           es_index=".",
                           nested_path=["."]))

        # EVERY COLUMN, AND THE INDEX IT TAKES UP
        index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
        index_to_uid = {}  # FROM NESTED PATH TO THE INDEX OF UID
        sql_selects = [
        ]  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
        nest_to_alias = {
            nested_path: "__" + unichr(ord('a') + i) + "__"
            for i, (nested_path,
                    sub_table) in enumerate(self.nested_tables.items())
        }

        sorts = []
        if query.sort:
            for s in query.sort:
                col = s.value.to_sql(self)[0]
                for t, sql in col.sql.items():
                    json_type = sql_type_to_json_type[t]
                    if json_type in STRUCT:
                        continue
                    column_number = len(sql_selects)
                    # SQL HAS ABS TABLE REFERENCE
                    column_alias = _make_column_name(column_number)
                    sql_selects.append(sql + " AS " + column_alias)
                    if s.sort == -1:
                        sorts.append(column_alias + " IS NOT NULL")
                        sorts.append(column_alias + " DESC")
                    else:
                        sorts.append(column_alias + " IS NULL")
                        sorts.append(column_alias)

        primary_doc_details = Data()
        # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH
        # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED
        for nested_path, sub_table in self.nested_tables.items():
            nested_doc_details = {
                "sub_table": sub_table,
                "children": [],
                "index_to_column": {},
                "nested_path":
                [nested_path
                 ]  # fake the real nested path, we only look at [0] anyway
            }

            # INSERT INTO TREE
            if not primary_doc_details:
                primary_doc_details = nested_doc_details
            else:

                def place(parent_doc_details):
                    if startswith_field(nested_path,
                                        parent_doc_details['nested_path'][0]):
                        for c in parent_doc_details['children']:
                            if place(c):
                                return True
                        parent_doc_details['children'].append(
                            nested_doc_details)

                place(primary_doc_details)

            alias = nested_doc_details['alias'] = nest_to_alias[nested_path]

            # WE ALWAYS ADD THE UID AND ORDER
            column_number = index_to_uid[nested_path] = nested_doc_details[
                'id_coord'] = len(sql_selects)
            sql_select = alias + "." + quoted_UID
            sql_selects.append(sql_select + " AS " +
                               _make_column_name(column_number))
            if nested_path != ".":
                sql_select = alias + "." + quote_table(ORDER)
                sql_selects.append(sql_select + " AS " +
                                   _make_column_name(column_number))

            # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM
            if nested_path not in active_columns:
                continue

            if primary_nested_path == nested_path:
                # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE
                si = 0
                for s in listwrap(query.select):
                    try:
                        column_number = len(sql_selects)
                        s.pull = get_column(column_number)
                        db_columns = s.value.to_sql(self)

                        if isinstance(s.value, LeavesOp):
                            for column in db_columns:
                                for t, unsorted_sql in column.sql.items():
                                    json_type = sql_type_to_json_type[t]
                                    if json_type in STRUCT:
                                        continue
                                    column_number = len(sql_selects)
                                    # SQL HAS ABS TABLE REFERENCE
                                    column_alias = _make_column_name(
                                        column_number)
                                    sql_selects.append(unsorted_sql + " AS " +
                                                       column_alias)
                                    index_to_column[
                                        column_number] = nested_doc_details[
                                            'index_to_column'][
                                                column_number] = Data(
                                                    push_name=concat_field(
                                                        s.name, column.name),
                                                    push_column=si,
                                                    push_child=".",
                                                    pull=get_column(
                                                        column_number),
                                                    sql=unsorted_sql,
                                                    type=json_type,
                                                    nested_path=[nested_path]
                                                    # fake the real nested path, we only look at [0] anyway
                                                )
                                    si += 1
                        else:
                            for column in db_columns:
                                for t, unsorted_sql in column.sql.items():
                                    json_type = sql_type_to_json_type[t]
                                    if json_type in STRUCT:
                                        continue
                                    column_number = len(sql_selects)
                                    # SQL HAS ABS TABLE REFERENCE
                                    column_alias = _make_column_name(
                                        column_number)
                                    sql_selects.append(unsorted_sql + " AS " +
                                                       column_alias)
                                    index_to_column[
                                        column_number] = nested_doc_details[
                                            'index_to_column'][
                                                column_number] = Data(
                                                    push_name=s.name,
                                                    push_column=si,
                                                    push_child=column.name,
                                                    pull=get_column(
                                                        column_number),
                                                    sql=unsorted_sql,
                                                    type=json_type,
                                                    nested_path=[nested_path]
                                                    # fake the real nested path, we only look at [0] anyway
                                                )
                    finally:
                        si += 1
            elif startswith_field(nested_path, primary_nested_path):
                # ADD REQUIRED COLUMNS, FOR DEEP STUFF
                for ci, c in enumerate(active_columns[nested_path]):
                    if c.type in STRUCT:
                        continue

                    column_number = len(sql_selects)
                    nested_path = c.nested_path
                    unsorted_sql = nest_to_alias[
                        nested_path[0]] + "." + quote_table(c.es_column)
                    column_alias = _make_column_name(column_number)
                    sql_selects.append(unsorted_sql + " AS " + column_alias)
                    index_to_column[column_number] = nested_doc_details[
                        'index_to_column'][column_number] = Data(
                            push_name=s.name,
                            push_column=si,
                            push_child=relative_field(c.name, s.name),
                            pull=get_column(column_number),
                            sql=unsorted_sql,
                            type=c.type,
                            nested_path=nested_path)

        where_clause = query.where.to_sql(self, boolean=True)[0].sql.b

        unsorted_sql = self._make_sql_for_one_nest_in_set_op(
            ".", sql_selects, where_clause, active_columns, index_to_column)

        for n, _ in self.nested_tables.items():
            sorts.append(COLUMN + unicode(index_to_uid[n]))

        ordered_sql = ("SELECT * FROM (\n" + unsorted_sql + "\n)" +
                       "\nORDER BY\n" + ",\n".join(sorts) + "\nLIMIT " +
                       quote_value(query.limit))
        result = self.db.query(ordered_sql)

        def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id,
                               parent_id_coord):
            """
            :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
            :param row: CURRENT ROW BEING EXTRACTED
            :param nested_doc_details: {
                    "nested_path": wrap_nested_path(nested_path),
                    "index_to_column": map from column number to column details
                    "children": all possible direct decedents' nested_doc_details
                 }
            :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
            :param parent_id_coord: the column number for the parent id (so we ca extract from each row)
            :return: the nested property (usually an array)
            """
            previous_doc_id = None
            doc = Data()
            output = []
            id_coord = nested_doc_details['id_coord']

            while True:
                doc_id = row[id_coord]

                if doc_id == None or (parent_id_coord is not None and
                                      row[parent_id_coord] != parent_doc_id):
                    rows.append(row)  # UNDO
                    output = unwraplist(output)
                    return output if output else None

                if doc_id != previous_doc_id:
                    previous_doc_id = doc_id
                    doc = Data()
                    curr_nested_path = nested_doc_details['nested_path'][0]
                    if isinstance(query.select, list) or isinstance(
                            query.select.value, LeavesOp):
                        # ASSIGN INNER PROPERTIES
                        for i, c in nested_doc_details[
                                'index_to_column'].items():
                            value = row[i]
                            if value == None:
                                continue
                            if value == '':
                                continue

                            relative_path = relative_field(
                                concat_field(c.push_name, c.push_child),
                                curr_nested_path)
                            if relative_path == ".":
                                doc = value
                            else:
                                doc[relative_path] = value
                    else:
                        # ASSIGN INNER PROPERTIES
                        for i, c in nested_doc_details[
                                'index_to_column'].items():
                            value = row[i]
                            if value is not None:
                                relative_path = relative_field(
                                    c.push_child, curr_nested_path)
                                if relative_path == ".":
                                    doc = value
                                else:
                                    doc[relative_path] = value
                    output.append(doc)

                # ASSIGN NESTED ARRAYS
                for child_details in nested_doc_details['children']:
                    child_id = row[child_details['id_coord']]
                    if child_id is not None:
                        nested_value = _accumulate_nested(
                            rows, row, child_details, doc_id, id_coord)
                        if nested_value is not None:
                            path = child_details['nested_path'][0]
                            doc[path] = nested_value

                try:
                    row = rows.pop()
                except IndexError:
                    output = unwraplist(output)
                    return output if output else None

        cols = tuple(index_to_column.values())

        if query.format == "cube":
            num_rows = len(result.data)
            num_cols = MAX([c.push_column
                            for c in cols]) + 1 if len(cols) else 0
            map_index_to_name = {c.push_column: c.push_name for c in cols}
            temp_data = [[None] * num_rows for _ in range(num_cols)]
            for rownum, d in enumerate(result.data):
                for c in cols:
                    if c.push_child == ".":
                        temp_data[c.push_column][rownum] = c.pull(d)
                    else:
                        column = temp_data[c.push_column][rownum]
                        if column is None:
                            column = temp_data[c.push_column][rownum] = {}
                        column[c.push_child] = c.pull(d)

            output = Data(
                meta={"format": "cube"},
                data={n: temp_data[c]
                      for c, n in map_index_to_name.items()},
                edges=[{
                    "name": "rownum",
                    "domain": {
                        "type": "rownum",
                        "min": 0,
                        "max": num_rows,
                        "interval": 1
                    }
                }])
            return output
        elif query.format == "table":
            num_column = MAX([c.push_column for c in cols]) + 1
            header = [None] * num_column
            for c in cols:
                # header[c.push_column] = c.push_name
                sf = split_field(c.push_name)
                if len(sf) == 0:
                    header[c.push_column] = "."
                elif len(sf) == 1:
                    header[c.push_column] = sf[0]
                else:
                    # TABLES ONLY USE THE FIRST-LEVEL PROPERTY NAMES
                    # PUSH ALL DEEPER NAMES TO CHILD
                    header[c.push_column] = sf[0]
                    c.push_child = join_field(sf[1:] +
                                              split_field(c.push_child))

            output_data = []
            for d in result.data:
                row = [None] * num_column
                for c in cols:
                    set_column(row, c.push_column, c.push_child, c.pull(d))
                output_data.append(row)
            return Data(meta={"format": "table"},
                        header=header,
                        data=output_data)
        else:
            rows = list(reversed(unwrap(result.data)))
            row = rows.pop()
            output = Data(meta={"format": "list"},
                          data=listwrap(
                              _accumulate_nested(rows, row,
                                                 primary_doc_details, None,
                                                 None)))
            return output
示例#36
0
def get_selects(query):
    schema = query.frum.schema
    query_level = len(schema.query_path)
    query_path = schema.query_path[0]
    # SPLIT select INTO ES_SELECT AND RESULTSET SELECT
    split_select = OrderedDict((p, ESSelectOp(p)) for p in schema.query_path)

    def expand_split_select(c_nested_path):
        es_select = split_select.get(c_nested_path)
        if not es_select:
            temp = [(k, v) for k, v in split_select.items()]
            split_select.clear()
            split_select.update({c_nested_path: ESSelectOp(c_nested_path)})
            split_select.update(temp)
        return split_select[c_nested_path]

    new_select = FlatList()
    post_expressions = {}

    selects = list_to_data([unwrap(s.copy()) for s in listwrap(query.select)])

    # WHAT PATH IS _source USED, IF ANY?
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            if any(c.jx_type == NESTED for c in leaves):
                split_select["."].source_path = "."
        elif is_op(select.value, Variable):
            for selected_column in schema.values(select.value.var,
                                                 exclude_type=(OBJECT,
                                                               EXISTS)):
                if selected_column.jx_type == NESTED:
                    expand_split_select(
                        selected_column.es_column
                    ).source_path = selected_column.es_column
                    continue
                leaves = schema.leaves(selected_column.es_column)
                for c in leaves:
                    if c.jx_type == NESTED:
                        split_select[c.es_column].source_path = c.es_column

    # IF WE GET THE SOURCE FOR PARENT, WE ASSUME WE GOT SOURCE FOR CHILD
    source_path = None
    source_level = 0
    for level, es_select in enumerate(reversed(list(split_select.values()))):
        if source_path:
            es_select.source_path = source_path
        elif es_select.source_path:
            source_level = level + 1
            source_path = es_select.source_path

    def get_pull_source(c):
        nested_path = c.nested_path
        nested_level = len(nested_path)
        pos = text(nested_level)

        if nested_level <= query_level:
            if not source_level or nested_level < source_level:
                field = join_field([pos, "fields", c.es_column])
                return jx_expression_to_function(field)
            elif nested_level == source_level:
                field = relative_field(c.es_column, nested_path[0])

                def pull_source(row):
                    return untyped(row.get(pos, Null)._source[field])

                return pull_source
            else:
                field = relative_field(c.es_column, nested_path[0])

                def pull_property(row):
                    return untyped(row.get(pos, Null)[field])

                return pull_property
        else:
            pos = text(query_level)

            if not source_level or nested_level < source_level:
                # PULL FIELDS AND THEN AGGREGATE THEM
                value = jx_expression_to_function(
                    join_field(["fields", c.es_column]))
                name = literal_field(nested_path[0])
                index = jx_expression_to_function("_nested.offset")

                def pull_nested_field(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []

                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = unwraplist(v)
                    return acc

                return pull_nested_field
            else:
                # PULL SOURCES
                value = jx_expression_to_function(
                    concat_field("_source",
                                 relative_field(c.es_column, nested_path[0])))
                name = literal_field(nested_path[0])
                index = jx_expression_to_function(
                    join_field(["_nested"] * (len(c.nested_path) - 1) +
                               ["offset"]))

                def pull_nested_source(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []

                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = untyped(v)
                    return acc

                return pull_nested_source

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                c_nested_path = c.nested_path[0]
                simple_name = relative_field(c.es_column,
                                             query_path).lstrip(".")
                name = concat_field(select.name, untype_path(simple_name))
                put_name = concat_field(
                    select.name, literal_field(untype_path(simple_name)))
                split_select[c_nested_path].fields.append(c.es_column)
                new_select.append({
                    "name": name,
                    "value": Variable(c.es_column),
                    "put": {
                        "name": put_name,
                        "index": put_index,
                        "child": ".",
                    },
                    "pull": get_pull_source(c),
                })
                put_index += 1
        elif is_op(select.value, Variable):
            if select.value.var == ".":
                # PULL ALL SOURCE
                new_select.append({
                    "name":
                    select.name,
                    "value":
                    select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull":
                    get_pull_source(
                        Data(es_column=query_path,
                             nested_path=schema.query_path)),
                })
                continue

            for selected_column in schema.values(select.value.var,
                                                 exclude_type=(EXISTS,
                                                               OBJECT)):
                if selected_column.jx_type == NESTED:
                    new_select.append({
                        "name":
                        select.name,
                        "value":
                        select.value,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                        "pull":
                        get_pull_source(
                            Data(
                                es_column=selected_column.es_column,
                                nested_path=(selected_column.es_column, ) +
                                selected_column.nested_path,
                            )),
                    })
                    continue

                leaves = schema.leaves(
                    selected_column.es_column,
                    exclude_type=INTERNAL)  # LEAVES OF OBJECT
                if leaves:
                    for c in leaves:
                        if c.es_column == "_id":
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": ".",
                                },
                                "pull": pull_id,
                            })
                            continue
                        c_nested_path = c.nested_path[0]
                        expand_split_select(c_nested_path).fields.append(
                            c.es_column)
                        child = untype_path(
                            relative_field(
                                c.es_column,
                                selected_column.es_column,
                            ))
                        new_select.append({
                            "name": select.name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": select.name,
                                "index": put_index,
                                "child": child,
                            },
                            "pull": get_pull_source(c),
                        })

                else:
                    new_select.append({
                        "name": select.name,
                        "value": NULL,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                    })
                put_index += 1
        else:
            op, split_scripts = split_expression_by_path(select.value,
                                                         schema,
                                                         lang=Painless)
            for p, script in split_scripts.items():
                es_select = split_select[p]
                es_select.scripts[select.name] = {
                    "script":
                    text(Painless[script].partial_eval().to_es_script(schema))
                }
                new_select.append({
                    "name":
                    select.name,
                    "pull":
                    jx_expression_to_function(
                        join_field([
                            text(p),
                            "fields",
                            select.name,
                        ])),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
                put_index += 1

    def inners(query_path, parent_pos):
        """
        :param query_path:
        :return:  ITERATOR OVER TUPLES ROWS AS TUPLES, WHERE  row[len(nested_path)] HAS INNER HITS
                  AND row[0] HAS post_expressions
        """
        pos = text(int(parent_pos) + 1)
        if not query_path:

            def base_case(row):
                extra = {}
                for k, e in post_expressions.items():
                    extra[k] = e(row)
                row["0"] = extra
                yield row

            return base_case

        if pos == "1":
            more = inners(query_path[:-1], "1")

            def first_case(results):
                for result in results:
                    for hit in result.hits.hits:
                        seed = {"0": None, pos: hit}
                        for row in more(seed):
                            yield row

            return first_case

        else:
            more = inners(query_path[:-1], pos)
            if source_path and source_path < query_path[-1]:
                rel_path = relative_field(query_path[-1], source_path)

                def source(acc):
                    for inner_row in acc[parent_pos][rel_path]:
                        acc[pos] = inner_row
                        for tt in more(acc):
                            yield tt

                return source
            else:
                path = literal_field(query_path[-1])

                def recurse(acc):
                    hits = acc[parent_pos].inner_hits[path].hits.hits
                    if hits:
                        for inner_row in hits:
                            acc[pos] = inner_row
                            for tt in more(acc):
                                yield tt
                    else:
                        for tt in more(acc):
                            yield tt

                return recurse

    return new_select, split_select, inners(schema.query_path, "0")
示例#37
0
 def test_relative(self):
     self.assertEqual(relative_field("testing", "testing"), ".")
        def follow_paths(position, path, nested_path, done_relations,
                         no_nested_docs):
            if position.name in self.settings.exclude:
                return

            if self.path_not_allowed(path):
                return
            if DEBUG:
                Log.note("Trace {{path}}", path=path)
            if position.name != "__ids__":
                # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR WHEN IF IT FAILS)
                self.db.query(
                    ConcatSQL(
                        SQL_SELECT,
                        SQL_STAR,
                        SQL_FROM,
                        quote_column(position.schema, position.name),
                        SQL_LIMIT,
                        SQL_ONE,
                    ))

            if position.name in reference_all_tables:
                no_nested_docs = True
            if position.name in reference_only_tables:
                return
            curr_join_list = copy(nested_path_to_join[nested_path[0]])

            ###############################################################################
            # INNER OBJECTS
            ###############################################################################
            referenced_tables = list(
                sort_using_key(
                    jx.groupby(
                        jx.filter(
                            relations,
                            {
                                "eq": {
                                    "table.name": position.name,
                                    "table.schema": position.schema,
                                }
                            },
                        ),
                        "constraint.name",
                    ),
                    key=lambda p: first(p[1]).column.name,
                ))
            for g, constraint_columns in referenced_tables:
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                if any(cc for cc in constraint_columns
                       if cc.referenced.table.name in self.settings.exclude):
                    continue

                done_relations.add(g["constraint.name"])

                many_to_one_joins = nested_path_to_join[nested_path[0]]
                index = len(many_to_one_joins)

                alias = "t" + text(index)
                for c in constraint_columns:
                    c.referenced.table.alias = alias
                    c.table = position
                many_to_one_joins.append({
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path,
                })

                # HANDLE THE COMMON *id SUFFIX
                name = []
                for cname, tname in zip(
                        constraint_columns.column.name,
                        constraint_columns.referenced.table.name,
                ):
                    if cname.startswith(tname):
                        name.append(tname)
                    elif cname.endswith("_id"):
                        name.append(cname[:-3])
                    else:
                        name.append(cname)

                relation_string = many_to_one_string(constraint_columns[0])
                step = "/".join(name)
                if len(constraint_columns) == 1:
                    step = self.name_relations.get(relation_string, step)

                referenced_column_path = concat_field(path, step)
                if self.path_not_allowed(referenced_column_path):
                    continue

                if referenced_column_path in reference_only_tables:
                    continue

                col_pointer_name = relative_field(referenced_column_path,
                                                  nested_path[0])
                for col in columns:
                    if (col.table.name
                            == constraint_columns[0].referenced.table.name
                            and col.table.schema
                            == constraint_columns[0].referenced.table.schema):
                        col_full_name = concat_field(
                            col_pointer_name, literal_field(col.column.name))

                        if (col.is_id and col.table.name == fact_table.name
                                and col.table.schema == fact_table.schema):
                            # ALWAYS SHOW THE ID OF THE FACT
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                True,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name,
                            })
                        elif col.column.name == constraint_columns[
                                0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                                if self.settings.show_foreign_keys else None,
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                                if self.settings.show_foreign_keys else None,
                            })
                        elif col.reference:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_pointer_name
                                if not self.settings.show_foreign_keys else
                                col_full_name,  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                            })
                        elif col.include:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name,
                            })

                if position.name in reference_only_tables:
                    continue

                todo.append(
                    Data(
                        position=copy(constraint_columns[0].referenced.table),
                        path=referenced_column_path,
                        nested_path=nested_path,
                        done_relations=copy(done_relations),
                        no_nested_docs=no_nested_docs,
                    ))
            ###############################################################################
            # NESTED OBJECTS
            ###############################################################################
            if not no_nested_docs:
                nesting_tables = list(
                    sort_using_key(
                        jx.groupby(
                            jx.filter(
                                relations,
                                {
                                    "eq": {
                                        "referenced.table.name": position.name,
                                        "referenced.table.schema":
                                        position.schema,
                                    }
                                },
                            ),
                            "constraint.name",
                        ),
                        key=lambda p: [(r.table.name, r.column.name)
                                       for r in [first(p[1])]][0],
                    ))

                for g, constraint_columns in nesting_tables:
                    g = unwrap(g)
                    constraint_columns = deepcopy(constraint_columns)
                    if g["constraint.name"] in done_relations:
                        continue
                    done_relations.add(g["constraint.name"])

                    many_table = set(constraint_columns.table.name)
                    if not (many_table - self.settings.exclude):
                        continue

                    relation_string = one_to_many_string(constraint_columns[0])
                    step = "/".join(many_table)
                    if len(constraint_columns) == 1:
                        step = self.name_relations.get(relation_string, step)

                    referenced_column_path = concat_field(path, step)
                    if self.path_not_allowed(referenced_column_path):
                        continue

                    new_nested_path = [referenced_column_path] + nested_path
                    all_nested_paths.append(new_nested_path)

                    if referenced_column_path in nested_path_to_join:
                        Log.error(
                            "{{path}} already exists, try adding entry to name_relations",
                            path=referenced_column_path,
                        )
                    one_to_many_joins = nested_path_to_join[
                        referenced_column_path] = copy(curr_join_list)
                    index = len(one_to_many_joins)
                    alias = "t" + text(index)
                    for c in constraint_columns:
                        c.table.alias = alias
                        c.referenced.table = position
                    one_to_many_joins.append(
                        set_default(
                            {},
                            g,
                            {
                                "children": True,
                                "join_columns": constraint_columns,
                                "path": path,
                                "nested_path": nested_path,
                            },
                        ))
                    for col in columns:
                        if (col.table.name == constraint_columns[0].table.name
                                and col.table.schema
                                == constraint_columns[0].table.schema):
                            col_full_name = join_field(
                                split_field(referenced_column_path)
                                [len(split_field(new_nested_path[0])):] +
                                [literal_field(col.column.name)])

                            if col.column.name == constraint_columns[
                                    0].column.name:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if
                                    self.settings.show_foreign_keys else None,
                                })
                            elif col.is_id:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if
                                    self.settings.show_foreign_keys else None,
                                })
                            else:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if col.include else None,
                                })

                    todo.append(
                        Data(
                            position=constraint_columns[0].table,
                            path=referenced_column_path,
                            nested_path=new_nested_path,
                            done_relations=copy(done_relations),
                            no_nested_docs=no_nested_docs,
                        ))
示例#39
0
    def remove(self, column_name, column):
        if column_name != relative_field(column.name, self.nested_path[0]):
            Log.error("Logic error")

        self.namespace[column_name] = [c for c in self.namespace[column_name] if c != column]
    def construct_docs(self, cursor, append, please_stop):
        """
        :param cursor: ITERATOR OF RECORD TUPLES
        :param append: METHOD TO CALL WITH CONSTRUCTED DOCUMENT
        :return: (count, first, next, next_key)
        number of documents added
        the first document in the batch
        the first document of the next batch
        """
        null_values = set(self.settings.null_values) | {None}

        doc_count = 0

        columns = tuple(wrap(c) for c in self.columns)
        with Timer("Downloading from MySQL"):
            curr_doc = Null
            row_count = 0
            for row in cursor:
                row_count += 1
                if please_stop:
                    Log.error("Got `please_stop` signal")

                nested_path = []
                next_object = Data()

                for c, value in zip(columns, row):
                    # columns ARE IN ORDER, FROM FACT ['.'] TO EVER-DEEPER-NESTED
                    if value in null_values:
                        # EVERY COLUMN THAT'S NOT NEEDED IS None
                        continue
                    if len(nested_path) < len(c.nested_path):
                        # EACH COLUMN IS DEEPER THAN THE NEXT
                        # THESE WILL BE THE id COLUMNS, WHICH ARE ALWAYS INCLUDED AND BEFORE ALL OTHER VALUES
                        nested_path = unwrap(c.nested_path)
                        next_object = Data()
                    next_object[c.put] = value

                # OBJECT HAS BEEN CONSTRUCTED, LET'S PLACE IT WHERE IT BELONGS
                if len(nested_path) > 1:
                    children = [curr_doc]
                    steps = list(reversed(nested_path))
                    parent_path = steps[0]
                    for path in steps[1:]:
                        parent = children[-1]
                        relative_path = relative_field(path, parent_path)
                        children = unwrap(parent[relative_path])
                        if not children:
                            children = parent[relative_path] = []
                        parent_path = path

                    children.append(next_object)
                    continue

                # THE TOP-LEVEL next_object HAS BEEN ENCOUNTERED, EMIT THE PREVIOUS, AND COMPLETED curr_doc
                if curr_doc == next_object:
                    Log.error(
                        "Expecting records. Did you select the wrong schema, or select records that do not exist?"
                    )

                if curr_doc:
                    append(curr_doc["id"])
                    doc_count += 1
                curr_doc = next_object

            # DEAL WITH LAST RECORD
            if curr_doc:
                append(curr_doc["id"])
                doc_count += 1

        Log.note(
            "{{doc_count}} documents ({{row_count}} db records)",
            doc_count=doc_count,
            row_count=row_count,
        )
示例#41
0
 def get_column_name(self, column):
     return relative_field(column.name, self.sf.fact_name)
示例#42
0
def to_sql(self, schema, not_null=False, boolean=False):
    if self.var == GUID:
        return wrap([{
            "name": ".",
            "sql": {
                "s": quoted_GUID
            },
            "nested_path": ROOT_PATH
        }])
    vars = schema[self.var]
    if not vars:
        # DOES NOT EXIST
        return wrap([{
            "name": ".",
            "sql": {
                "0": SQL_NULL
            },
            "nested_path": ROOT_PATH
        }])
    var_name = list(set(listwrap(vars).names.get('\\.')))
    if len(var_name) > 1:
        Log.error("do not know how to handle")
    var_name = var_name[0]
    cols = schema.leaves(self.var)
    acc = {}
    if boolean:
        for col in cols:
            cname = relative_field(col.names['.'], var_name)
            nested_path = col.nested_path[0]
            if col.type == OBJECT:
                value = SQL_TRUE
            elif col.type == BOOLEAN:
                value = quote_column(col.es_column)
            else:
                value = quote_column(col.es_column) + SQL_IS_NOT_NULL
            tempa = acc.setdefault(nested_path, {})
            tempb = tempa.setdefault(get_property_name(cname), {})
            tempb['b'] = value
    else:
        for col in cols:
            cname = relative_field(col.names['.'], var_name)
            if col.type == OBJECT:
                prefix = self.var + "."
                for cn, cs in schema.items():
                    if cn.startswith(prefix):
                        for child_col in cs:
                            tempa = acc.setdefault(child_col.nested_path[0],
                                                   {})
                            tempb = tempa.setdefault(get_property_name(cname),
                                                     {})
                            tempb[json_type_to_sql_type[
                                col.type]] = quote_column(child_col.es_column)
            else:
                nested_path = col.nested_path[0]
                tempa = acc.setdefault(nested_path, {})
                tempb = tempa.setdefault(get_property_name(cname), {})
                tempb[json_type_to_sql_type[col.type]] = quote_column(
                    col.es_column)

    return wrap([{
        "name": cname,
        "sql": types,
        "nested_path": nested_path
    } for nested_path, pairs in acc.items() for cname, types in pairs.items()])
示例#43
0
    def query_metadata(self, query):
        frum, query['from'] = query['from'], self
        schema = self.sf.tables["."].schema
        query = QueryOp.wrap(query, schema)
        columns = self.sf.columns
        where = query.where
        table_name = None
        column_name = None

        if query.edges or query.groupby:
            Log.error("Aggregates(groupby or edge) are not supported")

        if where.op == "eq" and where.lhs.var == "table":
            table_name = mo_json.json2value(where.rhs.json)
        elif where.op == "eq" and where.lhs.var == "name":
            column_name = mo_json.json2value(where.rhs.json)
        else:
            Log.error("Only simple filters are expected like: \"eq\" on table and column name")

        tables = [concat_field(self.sf.fact_name, i) for i in self.tables.keys()]

        metadata = []
        if columns[-1].es_column != GUID:
            columns.append(Column(
                name=GUID,
                jx_type=STRING,
                es_column=GUID,
                es_index=self.sf.fact_name,
                nested_path=["."]
            ))

        for tname, table in zip(t, tables):
            if table_name != None and table_name != table:
                continue

            for col in columns:
                cname, ctype = untyped_column(col.es_column)
                if column_name != None and column_name != cname:
                    continue

                metadata.append((table, relative_field(col.name, tname), col.type, unwraplist(col.nested_path)))

        if query.format == "cube":
            num_rows = len(metadata)
            header = ["table", "name", "type", "nested_path"]
            temp_data = dict(zip(header, zip(*metadata)))
            return Data(
                meta={"format": "cube"},
                data=temp_data,
                edges=[{
                    "name": "rownum",
                    "domain": {
                        "type": "rownum",
                        "min": 0,
                        "max": num_rows,
                        "interval": 1
                    }
                }]
            )
        elif query.format == "table":
            header = ["table", "name", "type", "nested_path"]
            return Data(
                meta={"format": "table"},
                header=header,
                data=metadata
            )
        else:
            header = ["table", "name", "type", "nested_path"]
            return Data(
                meta={"format": "list"},
                data=[dict(zip(header, r)) for r in metadata]
            )
示例#44
0
def _indexer(columns, query_path):
    all_names = set(unnest_path(c.name) for c in columns) | {"."}

    lookup_leaves = {}  # ALL LEAF VARIABLES
    for full_name in all_names:
        for c in columns:
            cname = relative_field(c.name, query_path)
            nfp = unnest_path(cname)
            if (
                startswith_field(nfp, full_name) and
                c.es_type not in [EXISTS, OBJECT, NESTED] and
                (c.es_column != "_id" or full_name == "_id")
            ):
                cs = lookup_leaves.setdefault(full_name, set())
                cs.add(c)
                cs = lookup_leaves.setdefault(untype_path(full_name), set())
                cs.add(c)

    lookup_variables = {}  # ALL NOT-NESTED VARIABLES
    for full_name in all_names:
        for c in columns:
            cname = relative_field(c.name, query_path)
            nfp = unnest_path(cname)
            if (
                startswith_field(nfp, full_name) and
                c.es_type not in [EXISTS, OBJECT] and
                (c.es_column != "_id" or full_name == "_id") and
                startswith_field(c.nested_path[0], query_path)
            ):
                cs = lookup_variables.setdefault(full_name, set())
                cs.add(c)
                cs = lookup_variables.setdefault(untype_path(full_name), set())
                cs.add(c)

    relative_lookup = {}
    for c in columns:
        try:
            cname = relative_field(c.name, query_path)
            cs = relative_lookup.setdefault(cname, set())
            cs.add(c)

            ucname = untype_path(cname)
            cs = relative_lookup.setdefault(ucname, set())
            cs.add(c)
        except Exception as e:
            Log.error("Should not happen", cause=e)

    if query_path != ".":
        # ADD ABSOLUTE NAMES TO THE NAMESAPCE
        absolute_lookup, more_leaves, more_variables = _indexer(columns, ".")
        for k, cs in absolute_lookup.items():
            if k not in relative_lookup:
                relative_lookup[k] = cs
        for k, cs in more_leaves.items():
            if k not in lookup_leaves:
                lookup_leaves[k] = cs
        for k, cs in more_variables.items():
            if k not in lookup_variables:
                lookup_variables[k] = cs

    return relative_lookup, lookup_leaves, lookup_variables
示例#45
0
def es_setop(es, query):
    schema = query.frum.schema
    query_path = schema.query_path[0]

    split_select = {".": ESSelect('.')}

    def get_select(path):
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select


    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    get_select('.').use_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select('.').use_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": put_index, "child": "."},
                    "pull": get_pull_source(".")
                })
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select('.').use_source = True
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(decode_property(n) for n in split_field(c.name))
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": "."},
                                    "pull": lambda row: row._id
                                })
                            elif c.jx_type == NESTED:
                                get_select('.').use_source = True
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))}
                                })
                        else:
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(untype_path(relative_field(c.name, schema.query_path[0])), s_column)
                            pull = accumulate_nested_doc(c_nested_path, Variable(relative_field(s_column, unnest_path(c_nested_path))))
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child
                                },
                                "pull": pull
                            })
            else:
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            put_index += 1
        else:
            split_scripts = split_expression_by_path(select.value, schema, lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {"script": text_type(Painless[first(script)].partial_eval().to_es_script(schema))}
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
                put_index += 1

    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select('.').use_source:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    with Timer("call to ES", silent=True) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    # Log.note("{{output}}", output=T)

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
示例#46
0
    def get_pull_source(c):
        nested_path = c.nested_path
        nested_level = len(nested_path)
        pos = text(nested_level)

        if nested_level <= query_level:
            if not source_level or nested_level < source_level:
                field = join_field([pos, "fields", c.es_column])
                return jx_expression_to_function(field)
            elif nested_level == source_level:
                field = relative_field(c.es_column, nested_path[0])

                def pull_source(row):
                    return untyped(row.get(pos, Null)._source[field])

                return pull_source
            else:
                field = relative_field(c.es_column, nested_path[0])

                def pull_property(row):
                    return untyped(row.get(pos, Null)[field])

                return pull_property
        else:
            pos = text(query_level)

            if not source_level or nested_level < source_level:
                # PULL FIELDS AND THEN AGGREGATE THEM
                value = jx_expression_to_function(
                    join_field(["fields", c.es_column]))
                name = literal_field(nested_path[0])
                index = jx_expression_to_function("_nested.offset")

                def pull_nested_field(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []

                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = unwraplist(v)
                    return acc

                return pull_nested_field
            else:
                # PULL SOURCES
                value = jx_expression_to_function(
                    concat_field("_source",
                                 relative_field(c.es_column, nested_path[0])))
                name = literal_field(nested_path[0])
                index = jx_expression_to_function(
                    join_field(["_nested"] * (len(c.nested_path) - 1) +
                               ["offset"]))

                def pull_nested_source(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []

                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = untyped(v)
                    return acc

                return pull_nested_source