예제 #1
0
    def map_to_sql(self, var=""):
        """
        RETURN A MAP FROM THE RELATIVE AND ABSOLUTE NAME SPACE TO COLUMNS
        """
        origin = self.nested_path[0]
        if startswith_field(var, origin) and origin != var:
            var = relative_field(var, origin)
        fact_dict = {}
        origin_dict = {}
        for k, cs in self.namespace.items():
            for c in cs:
                if c.jx_type in STRUCT:
                    continue

                if startswith_field(get_property_name(k), var):
                    origin_dict.setdefault(relative_field(c.name, origin),
                                           []).append(c)

                    if origin != c.nested_path[0]:
                        fact_dict.setdefault(c.name, []).append(c)
                elif origin == var:
                    origin_dict.setdefault(
                        concat_field(var, relative_field(c.name, origin)),
                        []).append(c)

                    if origin != c.nested_path[0]:
                        fact_dict.setdefault(concat_field(var, c.name),
                                             []).append(c)

        return set_default(origin_dict, fact_dict)
예제 #2
0
    def _nest_column(self, column):
        new_path, type_ = untyped_column(column.es_column)
        if type_ != SQL_NESTED_TYPE:
            Log.error("only nested types can be nested")
        destination_table = concat_field(self.fact_name, new_path)
        existing_table = concat_field(self.fact_name, column.nested_path[0])

        # FIND THE INNER COLUMNS WE WILL BE MOVING
        moving_columns = []
        for c in self.columns:
            if destination_table != column.es_index and column.es_column == c.es_column:
                moving_columns.append(c)
                c.nested_path = new_path

        # TODO: IF THERE ARE CHILD TABLES, WE MUST UPDATE THEIR RELATIONS TOO?

        # LOAD THE COLUMNS
        data = self.namespace.db.about(destination_table)
        if not data:
            # DEFINE A NEW TABLE
            command = (
                SQL_CREATE + quote_column(destination_table) + sql_iso(sql_list([
                    quoted_UID + "INTEGER",
                    quoted_PARENT + "INTEGER",
                    quoted_ORDER + "INTEGER",
                    "PRIMARY KEY " + sql_iso(quoted_UID),
                    "FOREIGN KEY " + sql_iso(quoted_PARENT) + " REFERENCES " + quote_column(existing_table) + sql_iso(quoted_UID)
                ]))
            )
            with self.namespace.db.transaction() as t:
                t.execute(command)
                self.add_table([new_path]+column.nested_path)

        # TEST IF THERE IS ANY DATA IN THE NEW NESTED ARRAY
        if not moving_columns:
            return

        column.es_index = destination_table
        with self.namespace.db.transaction() as t:
            t.execute(
                "ALTER TABLE " + quote_column(destination_table) +
                " ADD COLUMN " + quote_column(column.es_column) + " " + column.es_type
            )

            # Deleting parent columns
            for col in moving_columns:
                column = col.es_column
                tmp_table = "tmp_" + existing_table
                columns = list(map(text, t.query(SQL_SELECT + SQL_STAR + SQL_FROM + quote_column(existing_table) + SQL_LIMIT + SQL_ZERO).header))
                t.execute(
                    "ALTER TABLE " + quote_column(existing_table) +
                    " RENAME TO " + quote_column(tmp_table)
                )
                t.execute(
                    SQL_CREATE + quote_column(existing_table) + SQL_AS +
                    SQL_SELECT + sql_list([quote_column(c) for c in columns if c != column]) +
                    SQL_FROM + quote_column(tmp_table)
                )
                t.execute("DROP TABLE " + quote_column(tmp_table))
예제 #3
0
    def _nest_column(self, column, new_path):
        destination_table = concat_field(self.fact, new_path[0])
        existing_table = concat_field(self.fact, column.nested_path[0])

        # FIND THE INNER COLUMNS WE WILL BE MOVING
        moving_columns = []
        for c in self._columns:
            if destination_table != column.es_index and column.es_column == c.es_column:
                moving_columns.append(c)
                c.nested_path = new_path

        # TODO: IF THERE ARE CHILD TABLES, WE MUST UPDATE THEIR RELATIONS TOO?

        # DEFINE A NEW TABLE?
        # LOAD THE COLUMNS
        command = "PRAGMA table_info" + sql_iso(
            quote_column(destination_table))
        details = self.db.query(command)
        if not details.data:
            command = (
                "CREATE TABLE " + quote_column(destination_table) + sql_iso(
                    sql_list([
                        quoted_UID + "INTEGER", quoted_PARENT + "INTEGER",
                        quoted_ORDER + "INTEGER",
                        "PRIMARY KEY " + sql_iso(quoted_UID), "FOREIGN KEY " +
                        sql_iso(quoted_PARENT) + " REFERENCES " +
                        quote_column(existing_table) + sql_iso(quoted_UID)
                    ])))
            self.db.execute(command)
            self.add_table_to_schema(new_path)

        # TEST IF THERE IS ANY DATA IN THE NEW NESTED ARRAY
        if not moving_columns:
            return

        column.es_index = destination_table
        self.db.execute("ALTER TABLE " + quote_column(destination_table) +
                        " ADD COLUMN " + quote_column(column.es_column) + " " +
                        sql_types[column.type])

        # Deleting parent columns
        for col in moving_columns:
            column = col.es_column
            tmp_table = "tmp_" + existing_table
            columns = list(
                map(
                    text_type,
                    self.db.query(SQL_SELECT + SQL_STAR + SQL_FROM +
                                  quote_column(existing_table) + SQL_LIMIT +
                                  SQL_ZERO).header))
            self.db.execute("ALTER TABLE " + quote_column(existing_table) +
                            " RENAME TO " + quote_column(tmp_table))
            self.db.execute(
                "CREATE TABLE " + quote_column(existing_table) + " AS " +
                SQL_SELECT +
                sql_list([quote_column(c) for c in columns if c != column]) +
                SQL_FROM + quote_column(tmp_table))
            self.db.execute("DROP TABLE " + quote_column(tmp_table))
예제 #4
0
    def _edges_op(self, query, frum):
        schema = frum
        query = query.copy()  # WE WILL BE MARKING UP THE QUERY
        index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
        outer_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
        base_table, path = schema.snowflake.fact_name, schema.nested_path
        nest_to_alias = {
            nested_path: "__" + unichr(ord('a') + i) + "__"
            for i, (nested_path, sub_table) in enumerate(self.snowflake.tables)
        }

        tables = []
        for n, a in nest_to_alias.items():
            if startswith_field(path, n):
                tables.append({"nest": n, "alias": a})
        tables = jx.sort(tables, {"value": {"length": "nest"}})

        from_sql = sql_alias(quote_column(concat_field(base_table, tables[0].nest)),  tables[0].alias)
        for previous, t in zip(tables, tables[1::]):
            from_sql += (
                SQL_LEFT_JOIN + sql_alias(quote_column(concat_field(base_table, t.nest)), t.alias) +
                SQL_ON + quote_column(t.alias, PARENT) + SQL_EQ + quote_column(previous.alias, UID)
            )

        main_filter = SQLang[query.where].to_sql(schema, boolean=True)[0].sql.b

        # SHIFT THE COLUMN DEFINITIONS BASED ON THE NESTED QUERY DEPTH
        ons = []
        join_types = []
        wheres = []
        null_ons = [EXISTS_COLUMN + SQL_IS_NULL]
        groupby = []
        null_groupby = []
        orderby = []
        domains = []

        select_clause = [SQL_ONE + EXISTS_COLUMN] + [quote_column(c.es_column) for c in self.snowflake.columns]

        for edge_index, query_edge in enumerate(query.edges):
            edge_alias = "e" + text(edge_index)

            if query_edge.value:
                edge_values = [p for c in SQLang[query_edge.value].to_sql(schema).sql for p in c.items()]

            elif not query_edge.value and any(query_edge.domain.partitions.where):
                case = SQL_CASE
                for pp, p in enumerate(query_edge.domain.partitions):
                    w = SQLang[p.where].to_sql(schema)[0].sql.b
                    t = quote_value(pp)
                    case += SQL_WHEN + w + SQL_THEN + t
                case += SQL_ELSE + SQL_NULL + SQL_END  # quote value with length of partitions
                edge_values = [("n", case)]

            elif query_edge.range:
                edge_values = SQLang[query_edge.range.min].to_sql(schema)[0].sql.items() + SQLang[query_edge.range.max].to_sql(schema)[
                    0].sql.items()
예제 #5
0
    def _nest_column(self, column, new_path):
        destination_table = concat_field(self.fact, new_path)
        existing_table = concat_field(self.fact, column.nested_path[0])

        # FIND THE INNER COLUMNS WE WILL BE MOVING
        moving_columns = []
        for c in self.columns:
            if destination_table != column.es_index and column.es_column == c.es_column:
                moving_columns.append(c)
                c.nested_path = [new_path] + c.nested_path

        # TODO: IF THERE ARE CHILD TABLES, WE MUST UPDATE THEIR RELATIONS TOO?

        # DEFINE A NEW TABLE?
        # LOAD THE COLUMNS
        command = "PRAGMA table_info(" + quote_table(destination_table) + ")"
        details = self.db.query(command)
        if not details.data:
            command = ("CREATE TABLE " + quote_table(destination_table) + "(" +
                       (",".join([
                           quoted_UID + " INTEGER", quoted_PARENT + " INTEGER",
                           quoted_ORDER + " INTEGER"
                       ])) + ", PRIMARY KEY (" + quoted_UID + ")" +
                       ", FOREIGN KEY (" + quoted_PARENT + ") REFERENCES " +
                       quote_table(existing_table) + "(" + quoted_UID + ")"
                       ")")
            self.db.execute(command)
            self.add_table_to_schema([new_path])

        # TEST IF THERE IS ANY DATA IN THE NEW NESTED ARRAY
        if not moving_columns:
            return

        column.es_index = destination_table
        self.db.execute("ALTER TABLE " + quote_table(destination_table) +
                        " ADD COLUMN " + quote_column(column.es_column) + " " +
                        sql_types[column.type])

        # Deleting parent columns
        for col in moving_columns:
            column = col.es_column
            tmp_table = "tmp_" + existing_table
            columns = self.db.query("select * from " +
                                    quote_table(existing_table) +
                                    " LIMIT 0").header
            self.db.execute("ALTER TABLE " + quote_table(existing_table) +
                            " RENAME TO " + quote_table(tmp_table))
            self.db.execute(
                "CREATE TABLE " + quote_table(existing_table) + " AS SELECT " +
                (", ".join([quote_table(c) for c in columns if c != column])) +
                " FROM " + quote_table(tmp_table))
            self.db.execute("DROP TABLE " + quote_table(tmp_table))
예제 #6
0
    def _insert(self, collection):
        for nested_path, details in collection.items():
            active_columns = wrap(list(details.active_columns))
            rows = details.rows
            table_name = concat_field(self.facts.snowflake.fact_name,
                                      nested_path)

            if table_name == self.facts.snowflake.fact_name:
                # DO NOT REQUIRE PARENT OR ORDER COLUMNS
                meta_columns = [GUID, UID]
            else:
                meta_columns = [UID, PARENT, ORDER]

            all_columns = meta_columns + active_columns.es_column

            prefix = ("INSERT INTO " + quote_column(table_name) +
                      sql_iso(sql_list(map(quote_column, all_columns))))

            # BUILD THE RECORDS
            records = SQL_UNION_ALL.join(
                SQL_SELECT +
                sql_list(quote_value(row.get(c)) for c in all_columns)
                for row in unwrap(rows))

            with self.db.transaction() as t:
                t.execute(prefix + records)
예제 #7
0
 def leaves(self):
     if self in T_PRIMITIVE:
         yield ".", self
     else:
         for k, v in self.__dict__.items():
             for p, t in v.leaves():
                 yield concat_field(k, p), t
예제 #8
0
    def denormalized(self):
        """
        THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
        THE DENORMALIZED PERSPECITVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
        """
        output = [{
            "table": concat_field(c.es_index, table),
            "name": name,
            "cardinality": c.cardinality,
            "es_column": c.es_column,
            "es_index": c.es_index,
            "last_updated": c.last_updated,
            "count": c.count,
            "nested_path": c.nested_path,
            "type": c.type
        } for tname, css in self.data.items() for cname, cs in css.items()
                  for c in cs for table, name in c.names.items()]
        #+[
        #     {
        #         "table": tname,
        #         "name": "_id",
        #         "nested_path": ["."],
        #         "type": "string"
        #     }
        #     for tname, _ in self.data.items()
        # ]
        if not self.meta_schema:
            self.meta_schema = get_schema_from_list("meta\\.columns", output)

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        return ListContainer("meta\\.columns",
                             data=output,
                             schema=self.meta_schema)
예제 #9
0
    def _insert(self, collection):
        for nested_path, details in collection.items():
            active_columns = wrap(list(details.active_columns))
            rows = details.rows
            num_rows = len(rows)
            table_name = concat_field(self.name, nested_path)

            if table_name == self.name:
                # DO NOT REQUIRE PARENT OR ORDER COLUMNS
                meta_columns = [GUID, UID]
            else:
                meta_columns = [UID, PARENT, ORDER]

            all_columns = meta_columns + active_columns.es_column  # ONLY THE PRIMITIVE VALUE COLUMNS
            command = ConcatSQL([
                SQL_INSERT,
                quote_column(table_name),
                sql_iso(sql_list(map(quote_column, all_columns))), SQL_VALUES,
                sql_list(
                    sql_iso(
                        sql_list(quote_value(row.get(c)) for c in all_columns))
                    for row in unwrap(rows))
            ])

            with self.db.transaction() as t:
                t.execute(command)
예제 #10
0
파일: meta.py 프로젝트: yoyogias2011/TUID
    def denormalized(self):
        """
        THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
        THE DENORMALIZED PERSPECITVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
        """
        with self.locker:
            self._update_meta()
            output = [
                {
                    "table": concat_field(c.es_index, untype_path(table)),
                    "name": untype_path(name),
                    "cardinality": c.cardinality,
                    "es_column": c.es_column,
                    "es_index": c.es_index,
                    "last_updated": c.last_updated,
                    "count": c.count,
                    "nested_path": [unnest_path(n) for n in c.nested_path],
                    "es_type": c.es_type,
                    "type": c.jx_type
                } for tname, css in self.data.items()
                for cname, cs in css.items() for c in cs
                if c.jx_type not in STRUCT  # and c.es_column != "_id"
                for table, name in c.names.items()
            ]

        from jx_python.containers.list_usingPythonList import ListContainer
        return ListContainer(self.name,
                             data=output,
                             schema=jx_base.Schema("meta.columns",
                                                   SIMPLE_METADATA_COLUMNS))
예제 #11
0
 def leaves(self, prefix):
     full_name = concat_field(self.nested_path, prefix)
     return set(
         c for c in self.snowflake.namespace.columns.find(
             self.snowflake.fact_name) for k in [c.name]
         if startswith_field(k, full_name) and k != GUID or k == full_name
         if c.jx_type not in [OBJECT, EXISTS])
예제 #12
0
    def _add_column(self, column):
        cname = column.name
        if column.jx_type == "nested":
            # WE ARE ALSO NESTING
            self._nest_column(column, [cname] + column.nested_path)

        table = concat_field(self.fact_name, column.nested_path[0])

        try:
            with self.namespace.db.transaction() as t:
                t.execute("ALTER TABLE" + quote_column(table) + "ADD COLUMN" +
                          quote_column(column.es_column) + column.es_type)
            self.namespace.columns.add(column)
        except Exception as e:
            if "duplicate column name" in e:
                # THIS HAPPENS WHEN MULTIPLE THREADS ARE ASKING FOR MORE COLUMNS TO STORE DATA
                # THIS SHOULD NOT BE A PROBLEM SINCE THE THREADS BOTH AGREE THE COLUMNS SHOULD EXIST
                # BUT, IT WOULD BE NICE TO MAKE LARGER TRANSACTIONS SO THIS NEVER HAPPENS
                # CONFIRM THE COLUMN EXISTS IN LOCAL DATA STRUCTURES
                for c in self.namespace.columns:
                    if c.es_column == column.es_column:
                        break
                else:
                    Log.error("Did not add column {{column}]",
                              column=column.es_column,
                              cause=e)
            else:
                Log.error("Did not add column {{column}]",
                          column=column.es_column,
                          cause=e)
예제 #13
0
def get_pull(column):
    if column.nested_path[0] == ".":
        return concat_field("fields", literal_field(column.es_column))
    else:
        depth = len(split_field(column.nested_path[0]))
        rel_name = split_field(column.es_column)[depth:]
        return join_field(["_inner"] + rel_name)
예제 #14
0
파일: setop.py 프로젝트: yoyogias2011/TUID
def get_pull(column):
    if column.nested_path[0] == ".":
        return concat_field("fields", literal_field(column.es_column))
    else:
        depth = len(split_field(column.nested_path[0]))
        rel_name = split_field(column.es_column)[depth:]
        return join_field(["_inner"] + rel_name)
예제 #15
0
 def remove_facts(self, fact_name):
     paths = self.ns.columns._snowflakes[fact_name]
     if paths:
         with self.db.transaction() as t:
             for p in paths:
                 full_name = concat_field(fact_name, p[0])
                 t.execute("DROP TABLE "+quote_column(full_name))
         self.ns.columns.remove_table(fact_name)
예제 #16
0
    def leaves(self):
        output = set(
            concat_field(name, leaf)
            for name, child_schema in self.more.items()
            for leaf in child_schema.leaves)
        if self.element.type is not None:
            output.add('.')

        return output
예제 #17
0
    def leaves(self):
        output = set(
            concat_field(name, leaf)
            for name, child_schema in self.more.items()
            for leaf in child_schema.leaves
        )
        if self.element.type is not None:
            output.add('.')

        return output
예제 #18
0
def get_nested_path(typed_path):
    # CONSTRUCT THE nested_path FROM THE typed_path
    path = split_field(typed_path)
    parent = "."
    nested_path = (parent, )
    for i, p in enumerate(path[:-1]):
        if p == ARRAY_KEY:
            step = concat_field(parent, join_field(path[0:i + 1]))
            nested_path = (step, ) + nested_path
    return nested_path
예제 #19
0
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if is_text(edge):
        if edge.endswith(".*"):
            prefix = edge[:-2]
            if schema:
                output = wrap([
                    {  # BECASUE THIS IS A GROUPBY, EARLY SPLIT INTO LEAVES WORKS JUST FINE
                        "name": concat_field(prefix, literal_field(relative_field(untype_path(c.name), prefix))),
                        "put": {"name": literal_field(untype_path(c.name))},
                        "value": jx_expression(c.es_column, schema=schema),
                        "allowNulls": True,
                        "domain": {"type": "default"}
                    }
                    for c in schema.leaves(prefix)
                ])
                return output
            else:
                return wrap([{
                    "name": untype_path(prefix),
                    "put": {"name": literal_field(untype_path(prefix))},
                    "value": LeavesOp(Variable(prefix)),
                    "allowNulls": True,
                    "dim":dim_index,
                    "domain": {"type": "default"}
                }])

        return wrap([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit)
        }])
    else:
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not is_text(edge.value):
            Log.error("You must name compound edges: {{edge}}",  edge= edge)

        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim":dim_index,
            "domain": {"type": "default"}
        }])
예제 #20
0
파일: query.py 프로젝트: rv404674/TUID
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if isinstance(edge, text_type):
        if edge.endswith(".*"):
            prefix = edge[:-2]
            if schema:
                output = wrap([
                    {
                        "name": concat_field(prefix, literal_field(relative_field(untype_path(c.names["."]), prefix))),
                        "put": {"name": literal_field(untype_path(c.names["."]))},
                        "value": jx_expression(c.es_column, schema=schema),
                        "allowNulls": True,
                        "domain": {"type": "default"}
                    }
                    for c in schema.leaves(prefix)
                ])
                return output
            else:
                return wrap([{
                    "name": untype_path(prefix),
                    "put": {"name": literal_field(untype_path(prefix))},
                    "value": jx_expression(prefix, schema=schema),
                    "allowNulls": True,
                    "dim":dim_index,
                    "domain": {"type": "default"}
                }])

        return wrap([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit)
        }])
    else:
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not isinstance(edge.value, text_type):
            Log.error("You must name compound edges: {{edge}}",  edge= edge)

        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim":dim_index,
            "domain": {"type": "default"}
        }])
예제 #21
0
    def map_to_sql(self, var=""):
        """
        RETURN A MAP FROM THE RELATIVE AND ABSOLUTE NAME SPACE TO COLUMNS 
        """
        origin = self.nested_path[0]
        if startswith_field(var, origin) and origin != var:
            var = relative_field(var, origin)
        fact_dict = {}
        origin_dict = {}
        for k, cs in self.map.items():
            for c in cs:
                if c.type not in STRUCT:
                    if (startswith_field(get_property_name(k), var)):
                        if c.names[origin] in origin_dict:
                            origin_dict[c.names[origin]].append(c)
                        else:
                            origin_dict[c.names[origin]] = [c]

                        if origin != c.nested_path[0]:
                            if c.names["."] in fact_dict:
                                fact_dict[c.names["."]].append(c)
                            else:
                                fact_dict[c.names["."]] = [c]
                    elif origin == var:
                        if concat_field(var, c.names[origin]) in origin_dict:
                            origin_dict[concat_field(
                                var, c.names[origin])].append(c)
                        else:
                            origin_dict[concat_field(var,
                                                     c.names[origin])] = [c]

                        if origin != c.nested_path[0]:
                            if c.names["."] in fact_dict:
                                fact_dict[concat_field(var,
                                                       c.names["."])].append(c)
                            else:
                                fact_dict[concat_field(var,
                                                       c.names["."])] = [c]

        return set_default(origin_dict, fact_dict)
예제 #22
0
    def _add_column(self, column):
        cname = column.names["."]
        if column.type == "nested":
            # WE ARE ALSO NESTING
            self._nest_column(column, [cname] + column.nested_path)

        table = concat_field(self.fact, column.nested_path[0])

        self.db.execute("ALTER TABLE " + quote_column(table) + " ADD COLUMN " +
                        quote_column(column.es_column) + " " +
                        sql_types[column.type])

        self.add_column_to_schema(column)
예제 #23
0
    def _add_column(self, column):
        cname = column.name
        if column.jx_type == "nested":
            # WE ARE ALSO NESTING
            self._nest_column(column, [cname] + column.nested_path)

        table = concat_field(self.fact_name, column.nested_path[0])

        with self.namespace.db.transaction() as t:
            t.execute("ALTER TABLE" + quote_column(table) + "ADD COLUMN" +
                      quote_column(column.es_column) + " " + column.es_type)

        self.namespace.columns.add(column)
예제 #24
0
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(
            query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(name=es_index_name,
                              url=None,
                              query_path=['.'],
                              timestamp=Date.now())
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now(
            ) - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.es_index + "." + c.es_column for c in columns
                                if not c.last_updated
                            ])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}",
                      table=table_name,
                      column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)
예제 #25
0
    def _drop_column(self, column):
        # DROP COLUMN BY RENAMING IT, WITH __ PREFIX TO HIDE IT
        cname = column.name
        if column.jx_type == "nested":
            # WE ARE ALSO NESTING
            self._nest_column(column, [cname] + column.nested_path)

        table = concat_field(self.fact_name, column.nested_path[0])

        with self.namespace.db.transaction() as t:
            t.execute("ALTER TABLE" + quote_column(table) + "RENAME COLUMN" +
                      quote_column(column.es_column) + " TO " +
                      quote_column("__" + column.es_column))
        self.namespace.columns.remove(column)
예제 #26
0
    def _inner(schema, parent_name, indent):
        more_lines = []
        for k, v in schema.items():
            full_name = concat_field(parent_name, k)
            details = indent+"* "+_md_code(full_name)
            if v.type:
                details += " - "+_md_italic(v.type)
            else:
                Log.error("{{full_name}} is missing type", full_name=full_name)
            if v.description:
                details += " " + v.description
            more_lines.append(details)

            if v.type in ["object", "array", "nested"]:
                more_lines.extend(_inner(v.properties, full_name, indent+"  "))
        return more_lines
예제 #27
0
    def _inner(schema, parent_name, indent):
        more_lines = []
        for k,v in schema.items():
            full_name = concat_field(parent_name, k)
            details = indent+"* "+_md_code(full_name)
            if v.type:
                details += " - "+_md_italic(v.type)
            else:
                Log.error("{{full_name}} is missing type", full_name=full_name)
            if v.description:
                details += " " + v.description
            more_lines.append(details)

            if v.type in ["object", "array", "nested"]:
                more_lines.extend(_inner(v.properties, full_name, indent+"  "))
        return more_lines
예제 #28
0
def inject_secrets(config):
    """
    INJECT THE SECRETS INTO THE CONFIGURATION
    :param config: CONFIG DATA

    ************************************************************************
    ** ENSURE YOU HAVE AN ENVIRONMENT VARIABLE SET:
    ** TASKCLUSTER_ROOT_URL = https://community-tc.services.mozilla.com
    ************************************************************************
    """
    with Timer("get secrets"):
        options = taskcluster.optionsFromEnvironment()
        secrets = taskcluster.Secrets(options)
        acc = Data()
        for s in listwrap(SECRET_NAMES):
            acc[s] = secrets.get(concat_field(SECRET_PREFIX, s))['secret']
        set_default(config, acc)
예제 #29
0
    def get_parquet_metadata(self, path='.'):
        """
        OUTPUT PARQUET METADATA COLUMNS
        :param path: FOR INTERNAL USE
        :return: LIST OF SchemaElement
        """
        children = []
        for name, child_schema in sort_using_key(self.more.items(),
                                                 lambda p: p[0]):
            children.extend(
                child_schema.get_parquet_metadata(concat_field(path, name)))
        if self.element.type:
            children.append(self.element)

        return [
            parquet_thrift.SchemaElement(name=path, num_children=len(children))
        ] + children
예제 #30
0
    def get_parquet_metadata(
        self,
        path='.'
    ):
        """
        OUTPUT PARQUET METADATA COLUMNS
        :param path: FOR INTERNAL USE
        :return: LIST OF SchemaElement
        """
        children = []
        for name, child_schema in sort_using_key(self.more.items(), lambda p: p[0]):
            children.extend(child_schema.get_parquet_metadata(concat_field(path, name)))

        if path == '.':
            return children
        else:
            self.element.num_children = len(children)
            return [self.element] + children
예제 #31
0
파일: main.py 프로젝트: mozilla/cia-tasks
def inject_secrets(config):
    """
    INJECT THE SECRETS INTO THE CONFIGURATION
    :param config: CONFIG DATA

    ************************************************************************
    ** ENSURE YOU HAVE AN ENVIRONMENT VARIABLE SET:
    ** TASKCLUSTER_ROOT_URL = https://community-tc.services.mozilla.com
    ************************************************************************
    """
    with Timer("get secrets"):
        secrets = taskcluster.Secrets(config.taskcluster)
        acc = Data()
        for s in listwrap(SECRET_NAMES):
            secret_name = concat_field(SECRET_PREFIX, s)
            Log.note("get secret named {{name|quote}}", name=secret_name)
            acc[s] = secrets.get(secret_name)["secret"]
        set_default(config, acc)
예제 #32
0
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(
                    name=es_index_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)
예제 #33
0
 def values(self, column_name, exclude_type=STRUCT):
     """
     RETURN ALL COLUMNS THAT column_name REFERS TO
     """
     column_name = unnest_path(column_name)
     columns = self.columns
     output = []
     for path in self.query_path:
         full_path = untype_path(concat_field(path, column_name))
         for c in columns:
             if c.jx_type in exclude_type:
                 continue
             # if c.cardinality == 0:
             #     continue
             if untype_path(c.name) == full_path:
                 output.append(c)
         if output:
             return output
     return []
예제 #34
0
 def values(self, column_name, exclude_type=STRUCT):
     """
     RETURN ALL COLUMNS THAT column_name REFERS TO
     """
     column_name = unnest_path(column_name)
     columns = self.columns
     output = []
     for path in self.query_path:
         full_path = untype_path(concat_field(path, column_name))
         for c in columns:
             if c.jx_type in exclude_type:
                 continue
             # if c.cardinality == 0:
             #     continue
             if untype_path(c.name) == full_path:
                 output.append(c)
         if output:
             return output
     return []
예제 #35
0
    def add_column(self, column):
        """
        ADD COLUMN, IF IT DOES NOT EXIST ALREADY
        """
        if column.name not in self.columns:
            self.columns[column.name] = {column}
        elif column.type not in [c.type for c in self.columns[column.name]]:
            self.columns[column.name].add(column)

        if column.type == "nested":
            nested_table_name = concat_field(self.name, column.name)
            # MAKE THE TABLE
            from jx_sqlite.query_table import QueryTable

            table = QueryTable(nested_table_name, self.db, exists=False)
            self.nested_tables[column.name] = table
        else:
            self.db.execute("ALTER TABLE " + quote_table(self.name) +
                            " ADD COLUMN " + _quote_column(column) + " " +
                            column.type)
예제 #36
0
    def add(self, full_name, repetition_type, type):
        """
        :param full_name: dot delimited path to the property (use dot (".") for none)
        :param repetition_type: one of OPTIONAL or NESTED (REQUIRED is not possible)
        :param json_type: the json type to store
        :return:
        """
        base_name = self.element.name
        simple_name = relative_field(full_name, base_name)
        path = split_field(simple_name)
        output = self

        if len(path) == 0:
            return output._add_one('.', full_name, repetition_type, type)
        else:
            fname = base_name
            for p in path[:-1]:
                fname = concat_field(fname, p)
                n = output.more.get(p)
                output = n or output._add_one(p, fname, OPTIONAL, object)

            return output._add_one(path[-1], full_name, repetition_type, type)
예제 #37
0
    def convert(self, expr):
        """
        ADD THE ".$value" SUFFIX TO ALL VARIABLES
        """
        if isinstance(expr, Expression):
            vars_ = expr.vars()
            rename = {v: concat_field(v, "$value") for v in vars_}
            return expr.map(rename)

        if expr is True or expr == None or expr is False:
            return expr
        elif Math.is_number(expr):
            return expr
        elif expr == ".":
            return "."
        elif is_keyword(expr):
            #TODO: LOOKUP SCHEMA AND ADD ALL COLUMNS WITH THIS PREFIX
            return expr + ".$value"
        elif isinstance(expr, basestring):
            Log.error("{{name|quote}} is not a valid variable name", name=expr)
        elif isinstance(expr, Date):
            return expr
        elif isinstance(expr, QueryOp):
            return self._convert_query(expr)
        elif isinstance(expr, Mapping):
            if expr["from"]:
                return self._convert_query(expr)
            elif len(expr) >= 2:
                #ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
                return wrap({
                    name: self.convert(value)
                    for name, value in expr.items()
                })
            else:
                # ASSUME SINGLE-CLAUSE EXPRESSION
                k, v = expr.items()[0]
                return self.converter_map.get(k, self._convert_bop)(k, v)
        elif isinstance(expr, (list, set, tuple)):
            return wrap([self.convert(value) for value in expr])
예제 #38
0
    def add(self, full_name, repetition_type, type):
        """
        :param full_name: dot delimited path to the property (use dot (".") for none)
        :param repetition_type: one of OPTIONAL or NESTED (REQUIRED is not possible)
        :param json_type: the json type to store
        :return:
        """
        base_name = self.element.name
        simple_name = relative_field(full_name, base_name)
        path = split_field(simple_name)
        output = self

        if len(path) == 0:
            return output._add_one('.', full_name, repetition_type, type)
        else:
            fname = base_name
            for p in path[:-1]:
                fname = concat_field(fname, p)
                n = output.more.get(p)
                output = n or output._add_one(p, fname, OPTIONAL, object)

            return output._add_one(path[-1], full_name, repetition_type, type)
예제 #39
0
    def denormalized(self):
        """
        THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
        THE DENORMALIZED PERSPECITVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
        """
        output = [
            {
                "table": concat_field(c.es_index, table),
                "name": name,
                "cardinality": c.cardinality,
                "es_column": c.es_column,
                "es_index": c.es_index,
                "last_updated": c.last_updated,
                "count": c.count,
                "nested_path": c.nested_path,
                "type": c.type
            }
            for tname, css in self.data.items()
            for cname, cs in css.items()
            for c in cs
            for table, name in c.names.items()
        ]
        #+[
        #     {
        #         "table": tname,
        #         "name": "_id",
        #         "nested_path": ["."],
        #         "type": "string"
        #     }
        #     for tname, _ in self.data.items()
        # ]
        if not self.meta_schema:
            self.meta_schema = get_schema_from_list("meta\\.columns", output)

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        return ListContainer("meta\\.columns", data=output, schema=self.meta_schema)
예제 #40
0
    def convert(self, expr):
        """
        ADD THE ".$value" SUFFIX TO ALL VARIABLES
        """
        if isinstance(expr, Expression):
            vars_ = expr.vars()
            rename = {v: concat_field(v, "$value") for v in vars_}
            return expr.map(rename)

        if expr is True or expr == None or expr is False:
            return expr
        elif Math.is_number(expr):
            return expr
        elif expr == ".":
            return "."
        elif is_variable_name(expr):
            #TODO: LOOKUP SCHEMA AND ADD ALL COLUMNS WITH THIS PREFIX
            return expr + ".$value"
        elif isinstance(expr, basestring):
            Log.error("{{name|quote}} is not a valid variable name", name=expr)
        elif isinstance(expr, Date):
            return expr
        elif isinstance(expr, QueryOp):
            return self._convert_query(expr)
        elif isinstance(expr, Mapping):
            if expr["from"]:
                return self._convert_query(expr)
            elif len(expr) >= 2:
                #ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
                return wrap({name: self.convert(value) for name, value in expr.items()})
            else:
                # ASSUME SINGLE-CLAUSE EXPRESSION
                k, v = expr.items()[0]
                return self.converter_map.get(k, self._convert_bop)(k, v)
        elif isinstance(expr, (list, set, tuple)):
            return wrap([self.convert(value) for value in expr])
예제 #41
0
파일: meta.py 프로젝트: rv404674/TUID
 def name(self):
     return concat_field(self.snowflake.name, self.query_path[0])
예제 #42
0
def es_setop(es, query):
    schema = query.frum.schema
    query_path = schema.query_path[0]

    split_select = {".": ESSelect('.')}

    def get_select(path):
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select


    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    get_select('.').use_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select('.').use_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": put_index, "child": "."},
                    "pull": get_pull_source(".")
                })
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select('.').use_source = True
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(decode_property(n) for n in split_field(c.name))
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": "."},
                                    "pull": lambda row: row._id
                                })
                            elif c.jx_type == NESTED:
                                get_select('.').use_source = True
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))}
                                })
                        else:
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(untype_path(relative_field(c.name, schema.query_path[0])), s_column)
                            pull = accumulate_nested_doc(c_nested_path, Variable(relative_field(s_column, unnest_path(c_nested_path))))
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child
                                },
                                "pull": pull
                            })
            else:
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            put_index += 1
        else:
            split_scripts = split_expression_by_path(select.value, schema, lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {"script": text_type(Painless[first(script)].partial_eval().to_es_script(schema))}
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
                put_index += 1

    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select('.').use_source:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    with Timer("call to ES", silent=True) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    # Log.note("{{output}}", output=T)

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
예제 #43
0
def get_pull(column):
    if column.nested_path[0] == ".":
        return concat_field("fields", literal_field(column.es_column))
    else:
        rel_name = relative_field(column.es_column, column.nested_path[0])
        return concat_field("_inner", rel_name)
예제 #44
0
def parse_properties(parent_index_name, parent_name, esProperties):
    """
    RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT
    """
    from pyLibrary.queries.meta import Column

    columns = FlatList()
    for name, property in esProperties.items():
        index_name = parent_index_name
        column_name = concat_field(parent_name, name)

        if property.type == "nested" and property.properties:
            # NESTED TYPE IS A NEW TYPE DEFINITION
            # MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
            self_columns = parse_properties(index_name, column_name, property.properties)
            for c in self_columns:
                c.nested_path = [column_name] + c.nested_path
            columns.extend(self_columns)
            columns.append(Column(
                es_index=index_name,
                es_column=column_name,
                names={".": column_name},
                type="nested",
                nested_path=ROOT_PATH
            ))

            continue

        if property.properties:
            child_columns = parse_properties(index_name, column_name, property.properties)
            columns.extend(child_columns)
            columns.append(Column(
                names={".": column_name},
                es_index=index_name,
                es_column=column_name,
                nested_path=ROOT_PATH,
                type="source" if property.enabled == False else "object"
            ))

        if property.dynamic:
            continue
        if not property.type:
            continue
        if property.type == "multi_field":
            property.type = property.fields[name].type  # PULL DEFAULT TYPE
            for i, (n, p) in enumerate(property.fields.items()):
                if n == name:
                    # DEFAULT
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        es_column=column_name,
                        name=column_name,
                        nested_path=ROOT_PATH,
                        type=p.type
                    ))
                else:
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        es_column=column_name + "\\." + n,
                        name=column_name + "\\." + n,
                        nested_path=ROOT_PATH,
                        type=p.type
                    ))
            continue

        if property.type in ["string", "boolean", "integer", "date", "long", "double"]:
            columns.append(Column(
                es_index=index_name,
                names={".": column_name},
                es_column=column_name,
                nested_path=ROOT_PATH,
                type=property.type
            ))
            if property.index_name and name != property.index_name:
                columns.append(Column(
                    es_index=index_name,
                    es_column=column_name,
                    names={".":column_name},
                    nested_path=ROOT_PATH,
                    type=property.type
                ))
        elif property.enabled == None or property.enabled == False:
            columns.append(Column(
                es_index=index_name,
                names={".": column_name},
                es_column=column_name,
                nested_path=ROOT_PATH,
                type="source" if property.enabled==False else "object"
            ))
        else:
            Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=query_path)

    return columns
예제 #45
0
    def _value_to_column(value, schema, path, counters, def_level):
        ptype = type(value)
        ntype, dtype, ltype, jtype, itype, byte_width = python_type_to_all_types[ptype]

        if jtype is NESTED:
            if schema.element.repetition_type != REPEATED:
                Log.error("Expecting {{path|quote}} to be repeated", path=path)

            new_path = path
            if not value:
                _none_to_column(schema, new_path, get_rep_level(counters), def_level)
            else:
                try:
                    new_schema = schema.more.get('.')

                    if not new_schema:
                        if schema.locked:
                            # DEFAULT TO REQUIRED ENTRIES
                            new_schema = schema
                            schema.element.repetition_type = REQUIRED
                        else:
                            new_path = path
                            new_value = value[0]
                            ptype = type(new_value)
                            new_schema = schema.add(
                                new_path,
                                OPTIONAL,
                                ptype
                            )
                            if new_value is None or python_type_to_json_type[ptype] in PRIMITIVE:
                                values[new_path] = []
                                reps[new_path] = [0] * counters[0]
                                defs[new_path] = [0] * counters[0]
                    for k, new_value in enumerate(value):
                        new_counters = counters + (k,)
                        _value_to_column(new_value, new_schema, new_path, new_counters, def_level+1)
                finally:
                    schema.element.repetition_type = REPEATED
        elif jtype is OBJECT:
            if value is None:
                if schema.element.repetition_type == REQUIRED:
                    Log.error("{{path|quote}} is required", path=path)
                _none_to_column(schema, path, get_rep_level(counters), def_level)
            else:
                if schema.element.repetition_type == REPEATED:
                    Log.error("Expecting {{path|quote}} to be repeated", path=path)

                if schema.element.repetition_type == REQUIRED:
                    new_def_level = def_level
                else:
                    counters = counters + (0,)
                    new_def_level = def_level+1

                for name, sub_schema in schema.more.items():
                    new_path = concat_field(path, name)
                    new_value = value.get(name, None)
                    _value_to_column(new_value, sub_schema, new_path, counters, new_def_level)

                for name in set(value.keys()) - set(schema.more.keys()):
                    if schema.locked:
                        Log.error("{{path}} is not allowed in the schema", path=path)
                    new_path = concat_field(path, name)
                    new_value = value.get(name, None)
                    ptype = type(new_value)
                    sub_schema = schema.add(
                        new_path,
                        REPEATED if isinstance(new_value, list) else OPTIONAL,
                        ptype
                    )
                    if python_type_to_json_type[ptype] in PRIMITIVE:
                        values[new_path] = []
                        reps[new_path] = [0] * counters[0]
                        defs[new_path] = [0] * counters[0]
                    _value_to_column(new_value, sub_schema, new_path, counters, new_def_level)
        else:
            if jtype is STRING:
                value = value.encode('utf8')
            merge_schema(schema, path, value)
            values[path].append(value)
            if schema.element.repetition_type == REQUIRED:
                reps[path].append(get_rep_level(counters))
                defs[path].append(def_level)
            else:
                reps[path].append(get_rep_level(counters))
                defs[path].append(def_level + 1)
예제 #46
0
def extract_rows(es, es_query, query):
    is_list = isinstance(query.select, list)
    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema
    columns = schema.columns
    leaf_columns = set(c.names["."] for c in columns if c.type not in STRUCT and (c.nested_path[0] == "." or c.es_column == c.nested_path[0]))
    nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    i = 0
    source = "fields"
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp):
            new_name_prefix = select.name + "\\." if select.name != "." else ""
            term = select.value.term
            if isinstance(term, Variable):

                if term.var == ".":
                    es_query.fields = None
                    source = "_source"
                    for cname, cs in schema.lookup.items():
                        for c in cs:
                            if c.type not in STRUCT and c.es_column != "_id":
                                new_name = new_name_prefix + literal_field(cname)
                                new_select.append({
                                    "name": new_name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": new_name, "index": i, "child": "."}
                                })
                                i += 1
                else:
                    prefix = term.var + "."
                    prefix_length = len(prefix)
                    for cname, cs in schema.lookup.items():
                        if cname.startswith(prefix):
                            suffix = cname[prefix_length:]
                            for c in cs:
                                if c.type not in STRUCT:
                                    if es_query.fields is not None:
                                        es_query.fields.append(c.es_column)
                                    new_name = new_name_prefix + literal_field(suffix)
                                    new_select.append({
                                        "name": new_name,
                                        "value": Variable(c.es_column),
                                        "put": {"name": new_name, "index": i, "child": "."}
                                    })
                                    i += 1

        elif isinstance(select.value, Variable):
            if select.value.var == ".":
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            elif select.value.var == "_id":
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "pull": "_id",
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            elif select.value.var in nested_columns or [c for c in nested_columns if c.startswith(select.value.var+".")]:
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            else:
                prefix = select.value.var + "."
                prefix_length = len(prefix)
                net_columns = [c for c in leaf_columns if c.startswith(prefix)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(encode_property(select.value.var))
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": i, "child": "."}
                    })
                    i += 1
                else:
                    # LEAVES OF OBJECT
                    for cname, cs in schema.lookup.items():
                        if cname.startswith(prefix):
                            for c in cs:
                                if c.type not in STRUCT:
                                    if es_query.fields is not None:
                                        es_query.fields.append(c.es_column)
                                    new_select.append({
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {"name": select.name, "index": i, "child": cname[prefix_length:]}
                                    })
                    i += 1
        else:
            es_query.script_fields[literal_field(select.name)] = {"script": select.value.to_ruby()}
            new_select.append({
                "name": select.name,
                "pull": "fields." + literal_field(select.name),
                "put": {"name": select.name, "index": i, "child": "."}
            })
            i += 1

    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = concat_field("_source", n.value.var)
        elif isinstance(n.value, Variable):
            n.pull = concat_field("fields", literal_field(encode_property(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
예제 #47
0
파일: setop.py 프로젝트: rv404674/TUID
def es_setop(es, query):
    schema = query.frum.schema

    es_query, filters = es_query_template(schema.query_path[0])
    nested_filter = None
    set_default(filters[0], query.where.partial_eval().to_esfilter(schema))
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.stored_fields = FlatList()

    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema
    # columns = schema.columns
    # nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp) and isinstance(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.names["."]), term.var))
                if c.jx_type == NESTED:
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                elif c.nested_path[0] != ".":
                    pass  # THE NESTED PARENT WILL CAPTURE THIS
                else:
                    es_query.stored_fields += [c.es_column]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif isinstance(select.value, Variable):
            s_column = select.value.var
            # LEAVES OF OBJECT
            leaves = schema.leaves(s_column)
            nested_selects = {}
            if leaves:
                if s_column == '.':
                    # PULL ALL SOURCE
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": put_index, "child": "."},
                        "pull": get_pull_source(".")
                    })
                elif any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    es_query.stored_fields = ["_source"]
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRT LEVEL PROPERTIES
                            jx_name = untype_path(c.names["."])
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        if len(c.nested_path) == 1:
                            jx_name = untype_path(c.names["."])
                            if c.jx_type == NESTED:
                                es_query.stored_fields = ["_source"]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                    "pull": get_pull_source(c.es_column)
                                })

                            else:
                                es_query.stored_fields += [c.es_column]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)}
                                })
                        else:
                            if not nested_filter:
                                where = filters[0].copy()
                                nested_filter = [where]
                                for k in filters[0].keys():
                                    filters[0][k] = None
                                set_default(
                                    filters[0],
                                    es_and([where, es_or(nested_filter)])
                                )

                            nested_path = c.nested_path[0]
                            if nested_path not in nested_selects:
                                where = nested_selects[nested_path] = Data()
                                nested_filter += [where]
                                where.nested.path = nested_path
                                where.nested.query.match_all = {}
                                where.nested.inner_hits._source = False
                                where.nested.inner_hits.stored_fields += [c.es_column]

                                child = relative_field(untype_path(c.names[schema.query_path[0]]), s_column)
                                pull = accumulate_nested_doc(nested_path, Variable(relative_field(s_column, unnest_path(nested_path))))
                                new_select.append({
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child
                                    },
                                    "pull": pull
                                })
                            else:
                                nested_selects[nested_path].nested.inner_hits.stored_fields += [c.es_column]
            else:
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            put_index += 1
        else:
            painless = select.value.partial_eval().to_es_script(schema)
            es_query.script_fields[literal_field(select.name)] = es_script(painless.script(schema))
            new_select.append({
                "name": select.name,
                "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                "put": {"name": select.name, "index": put_index, "child": "."}
            })
            put_index += 1

    for n in new_select:
        if n.pull:
            continue
        elif isinstance(n.value, Variable):
            if es_query.stored_fields[0] == "_source":
                es_query.stored_fields = ["_source"]
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter"):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
예제 #48
0
def _get_schema_from_list(frum, table_name, parent, nested_path, columns):
    """
    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param parent: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition
    :return:
    """

    for d in frum:
        row_type = python_type_to_json_type[d.__class__]

        if row_type != "object":
            # EXPECTING PRIMITIVE VALUE
            full_name = parent
            column = columns[full_name]
            if not column:
                column = Column(
                    name=concat_field(table_name, full_name),
                    es_column=full_name,
                    es_index=".",
                    es_type=d.__class__.__name__,
                    jx_type=None,  # WILL BE SET BELOW
                    last_updated=Date.now(),
                    nested_path=nested_path,
                )
                columns.add(column)
            column.es_type = _merge_python_type(column.es_type, d.__class__)
            column.jx_type = python_type_to_json_type[column.es_type]
        else:
            for name, value in d.items():
                full_name = concat_field(parent, name)
                column = columns[full_name]
                if not column:
                    column = Column(
                        name=concat_field(table_name, full_name),
                        es_column=full_name,
                        es_index=".",
                        es_type=value.__class__.__name__,
                        jx_type=None,  # WILL BE SET BELOW
                        last_updated=Date.now(),
                        nested_path=nested_path,
                    )
                    columns.add(column)
                if is_container(value):  # GET TYPE OF MULTIVALUE
                    v = list(value)
                    if len(v) == 0:
                        this_type = none_type.__name__
                    elif len(v) == 1:
                        this_type = v[0].__class__.__name__
                    else:
                        this_type = reduce(
                            _merge_python_type, (vi.__class__.__name__ for vi in value)
                        )
                else:
                    this_type = value.__class__.__name__
                column.es_type = _merge_python_type(column.es_type, this_type)
                column.jx_type = python_type_to_json_type[column.es_type]

                if this_type in {"object", "dict", "Mapping", "Data"}:
                    _get_schema_from_list(
                        [value], table_name, full_name, nested_path, columns
                    )
                elif this_type in {"list", "FlatList"}:
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(
                        value, table_name, full_name, newpath, columns
                    )
예제 #49
0
파일: deep.py 프로젝트: rv404674/TUID
def es_deepop(es, query):
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER:  ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es_query_template(query_path)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for i, f in enumerate(es_filters):
        script = AndOp("and", wheres[i]).partial_eval().to_esfilter(schema)
        set_default(f, script)

    if not wheres[1]:
        # WITHOUT NESTED CONDITIONS, WE MUST ALSO RETURN DOCS WITH NO NESTED RECORDS
        more_filter = {
            "and": [
                es_filters[0],
                {"missing": {"field": untype_path(query_path) + "." + EXISTS_TYPE}}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)

    # es_query.sort = jx_sort_to_es_sort(query.sort)
    map_to_es_columns = schema.map_to_es()
    # {c.names["."]: c.es_column for c in schema.leaves(".")}
    query_for_es = query.map(map_to_es_columns)
    es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema)

    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = FlatList()

    i = 0
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp) and isinstance(s.value.term, Variable):
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            leaves = schema.leaves(s.value.term.var)
            col_names = set()
            for c in leaves:
                if c.nested_path[0] == ".":
                    if c.jx_type == NESTED:
                        continue
                    es_query.fields += [c.es_column]
                c_name = untype_path(c.names[query_path])
                col_names.add(c_name)
                new_select.append({
                    "name": concat_field(s.name, c_name),
                    "nested_path": c.nested_path[0],
                    "put": {"name": concat_field(s.name, literal_field(c_name)), "index": i, "child": "."},
                    "pull": get_pull_function(c)
                })
                i += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                    n.put.name = n.name = n.name.lstrip(".")
                    col_names.add(n.name)
        elif isinstance(s.value, Variable):
            net_columns = schema.leaves(s.value.var)
            if not net_columns:
                new_select.append({
                    "name": s.name,
                    "nested_path": ".",
                    "put": {"name": s.name, "index": i, "child": "."},
                    "pull": NULL
                })
            else:
                for n in net_columns:
                    pull = get_pull_function(n)
                    if n.nested_path[0] == ".":
                        if n.jx_type == NESTED:
                            continue
                        es_query.fields += [n.es_column]

                    # WE MUST FIGURE OUT WHICH NAMESSPACE s.value.var IS USING SO WE CAN EXTRACT THE child
                    for np in n.nested_path:
                        c_name = untype_path(n.names[np])
                        if startswith_field(c_name, s.value.var):
                            child = relative_field(c_name, s.value.var)
                            break
                    else:
                        child = relative_field(untype_path(n.names[n.nested_path[0]]), s.value.var)

                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {
                            "name": s.name,
                            "index": i,
                            "child": child
                        }
                    })
            i += 1
        else:
            expr = s.value
            for v in expr.vars():
                for c in schema[v.var]:
                    if c.nested_path[0] == ".":
                        es_query.fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            pull_name = EXPRESSION_PREFIX + s.name
            map_to_local = MapToLocal(schema)
            pull = jx_expression_to_function(pull_name)
            post_expressions[pull_name] = compile_expression(expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []
    def get_more(please_stop):
        more.append(es_post(
            es,
            Data(
                query={"filtered": {"filter": more_filter}},
                fields=es_query.fields
            ),
            query.limit
        ))
    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    #</COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
예제 #50
0
    def __new__(cls, e=None, query=None, *args, **kwargs):
        e.allowNulls = coalesce(e.allowNulls, True)

        if e.value and e.domain.type == "default":
            # if query.groupby:
            #     return object.__new__(DefaultDecoder, e)

            if is_text(e.value):
                Log.error("Expecting Variable or Expression, not plain string")

            if is_op(e.value, LeavesOp):
                return object.__new__(ObjectDecoder)
            elif is_op(e.value, TupleOp):
                # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
                # JUST PULL THE FIELDS
                if not all(is_op(t, Variable) for t in e.value.terms):
                    Log.error("Can only handle variables in tuples")

                e.domain = Data(
                    dimension={"fields": e.value.terms}
                )
                return object.__new__(DimFieldListDecoder)

            elif is_op(e.value, Variable):
                schema = query.frum.schema
                cols = schema.leaves(e.value.var)
                if not cols:
                    return object.__new__(DefaultDecoder)
                if len(cols) != 1:
                    return object.__new__(ObjectDecoder)
                col = first(cols)
                limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

                if col.cardinality == None:
                    DEBUG and Log.warning(
                        "metadata for column {{name|quote}} (id={{id}}) is not ready",
                        name=concat_field(col.es_index, col.es_column),
                        id=id(col)
                    )
                    e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                    return object.__new__(DefaultDecoder)
                elif col.partitions == None:
                    e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                    return object.__new__(DefaultDecoder)
                else:
                    DEBUG and Log.note("id={{id}} has parts!!!", id=id(col))
                    if col.multi > 1 and len(col.partitions) < 10:
                        return object.__new__(MultivalueDecoder)

                    partitions = col.partitions[:limit:]
                    if e.domain.sort == -1:
                        partitions = list(reversed(sorted(partitions)))
                    else:
                        partitions = sorted(partitions)
                    e.domain = SimpleSetDomain(partitions=partitions, limit=limit)

            else:
                return object.__new__(DefaultDecoder)

        if e.value and e.domain.type in PARTITION:
            return object.__new__(SetDecoder)
        if isinstance(e.domain.dimension, Dimension):
            e.domain = e.domain.dimension.getDomain()
            return object.__new__(SetDecoder)
        if e.value and e.domain.type == "time":
            return object.__new__(TimeDecoder)
        if e.range:
            return object.__new__(GeneralRangeDecoder)
        if e.value and e.domain.type == "duration":
            return object.__new__(DurationDecoder)
        elif e.value and e.domain.type == "range":
            return object.__new__(RangeDecoder)
        elif not e.value and e.domain.dimension.fields:
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            fields = e.domain.dimension.fields
            if is_data(fields):
                Log.error("No longer allowed: All objects are expressions")
            else:
                return object.__new__(DimFieldListDecoder)
        elif not e.value and all(e.domain.partitions.where):
            return object.__new__(GeneralSetDecoder)
        else:
            Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)