示例#1
0
def doc_to_column(doc):
    """
    CONVERT A STORED COLUMN DOCUMENT TO A Column, REPAIRING KNOWN MISTAKES FIRST

    THE FIXES ARE APPLIED IN ORDER; LATER FIXES DEPEND ON EARLIER ONES

    :param doc: THE COLUMN DOCUMENT (PASSED THROUGH untyped() AND wrap())
    :return: Column, OR None WHEN THE DOCUMENT CAN NOT BE CONVERTED
    """
    try:
        doc = wrap(untyped(doc))

        # I HAVE MANAGED TO MAKE MANY MISTAKES WRITING COLUMNS TO ES. HERE ARE THE FIXES

        # FIX: MISSING last_updated IS BACKDATED BY ONE YEAR
        if not doc.last_updated:
            doc.last_updated = Date.now() - YEAR

        # FIX: MISSING es_type CAN ONLY BE INFERRED FOR OBJECT COLUMNS
        if doc.es_type == None:
            if doc.jx_type == OBJECT:
                doc.es_type = "object"
            else:
                Log.warning("{{doc}} has no es_type", doc=doc)

        # FIX: NESTED COLUMNS ALWAYS GET multi=1001
        doc.multi = 1001 if doc.es_type == "nested" else doc.multi

        # FIX: es_column ENDING WITH THE NESTED MARKER MUST BE DECLARED NESTED
        doc.nested_path = tuple(listwrap(doc.nested_path))
        if last(split_field(
                doc.es_column)) == NESTED_TYPE and doc.es_type != "nested":
            doc.es_type = "nested"
            doc.jx_type = NESTED
            doc.multi = 1001
            doc.last_updated = Date.now()

        # FIX: DROP THE LAST STEP WHEN "." IS THE SECOND-LAST STEP
        expected_nested_path = get_nested_path(doc.es_column)
        if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.':
            doc.nested_path = doc.nested_path[:-1]

        # FIX: MAKE nested_path AGREE WITH WHAT es_column IMPLIES
        if untype_path(doc.es_column) == doc.es_column:
            # es_column CARRIES NO TYPE MARKERS
            if doc.nested_path != (".", ):
                if doc.es_index in {"repo"}:
                    pass
                else:
                    Log.note("not expected")
                    doc.nested_path = expected_nested_path
        else:
            if doc.nested_path != expected_nested_path:
                doc.nested_path = expected_nested_path

        # FIX: es_column ENDING WITH THE EXISTS MARKER IS AN EXISTS COLUMN
        if last(split_field(doc.es_column)) == EXISTS_TYPE:
            doc.jx_type = EXISTS

        return Column(**doc)
    except Exception:
        # BEST-EFFORT CLEANUP: THE DOCUMENT THAT JUST FAILED CAN FAIL AGAIN HERE
        # (doc MAY STILL BE THE RAW INPUT, AND Column() MAY REJECT IT A SECOND
        # TIME).  THIS FUNCTION REPORTS FAILURE BY RETURNING None, SO THE
        # CLEANUP MUST NOT RAISE
        try:
            doc.nested_path = ["."]
            mark_as_deleted(Column(**doc))
        except Exception:
            Log.warning("could not mark bad column as deleted")
        return None
示例#2
0
def row_to_column(header, row):
    """
    BUILD A Column FROM ONE ROW, USING header AS THE PROPERTY NAMES

    :param header: PROPERTY NAMES, IN THE SAME ORDER AS row
    :param row: CELL VALUES; nested_path AND partitions CELLS ARE JSON-ENCODED
    :return: Column
    """
    json_encoded = ("nested_path", "partitions")
    properties = {}
    for key, cell in zip(header, row):
        if cell is not None and key in json_encoded:
            # ONLY NON-NULL CELLS OF THE JSON-ENCODED PROPERTIES ARE DECODED
            properties[key] = json2value(cell)
        else:
            properties[key] = cell
    return Column(**properties)
示例#3
0
    def get_or_create_facts(self, fact_name, uid=UID):
        """
        RETURN THE TABLE WITH THE GIVEN NAME; CREATE IT FIRST WHEN self.db DOES
        NOT KNOW ABOUT IT
        :param fact_name:  NAME FOR THE CENTRAL INDEX
        :param uid: name, or list of names, for the GUID (ONLY THE DEFAULT UID
                    IS HANDLED WHEN THE TABLE MUST BE CREATED)
        :return: QueryTable
        """
        if not self.db.about(fact_name):
            if uid != UID:
                Log.error("do not know how to handle yet")

            # REGISTER THE NEW SNOWFLAKE AND ITS _id COLUMN IN THE METADATA
            self.ns.columns._snowflakes[fact_name] = ["."]
            id_column = Column(
                name="_id",
                es_column="_id",
                es_index=fact_name,
                es_type=json_type_to_sqlite_type[STRING],
                jx_type=STRING,
                nested_path=['.'],
                multi=1,
                last_updated=Date.now()
            )
            self.ns.columns.add(id_column)

            # THEN CREATE THE TABLE ITSELF
            create_table = sql_create(
                fact_name,
                {UID: "INTEGER PRIMARY KEY", GUID: "TEXT"},
                unique=UID
            )
            with self.db.transaction() as txn:
                txn.execute(create_table)

        return QueryTable(fact_name, self)
示例#4
0
    def select(self, select):
        """
        APPLY A SELECT CLAUSE TO self.data AND RETURN THE RESULT

        :param select: ONE SELECT TERM, OR A LIST OF THEM; EACH HAS name AND value
        :return: self WHEN SELECTING "." AS ".", OTHERWISE A NEW ListContainer
        """
        terms = listwrap(select)

        # SELECTING THE WHOLE DOCUMENT KEEPS THE SCHEMA
        result_schema = None
        if len(terms) == 1:
            only = terms[0]
            if is_op(only.value, Variable) and only.value.var == ".":
                if only.name == ".":
                    return self
                result_schema = self.schema

        if is_list(select):
            # WHEN EVERY TERM IS A VARIABLE SELECTED UNDER ITS OWN NAME, THE
            # SCHEMA IS JUST THE MATCHING SUBSET OF COLUMNS
            plain_variables = all(
                is_op(s.value, Variable) and s.name == s.value.var
                for s in select
            )
            if plain_variables:
                wanted = {s.value.var for s in select}
                result_schema = Schema(
                    ".",
                    [c for c in self.schema.columns if c.name in wanted]
                )

            extractors = [
                (s.name, jx_expression_to_function(s.value)) for s in terms
            ]

            def build_row(d):
                row = Data()
                for name, extract in extractors:
                    row[name] = unwraplist(extract(to_data(d)))
                return unwrap(row)

            rows = [build_row(d) for d in self.data]
        else:
            extract = jx_expression_to_function(select.value)
            rows = [extract(d) for d in self.data]
            if is_op(select.value, Variable):
                # DESCRIBE THE SINGLE SELECTED COLUMN AS THE "." COLUMN
                source = first(
                    c for c in self.schema.columns
                    if c.name == select.value.var
                )
                properties = dict(**source)
                properties.update({"name": ".", "jx_type": NESTED, "es_type": "nested", "multi":1001, "cardinality":1})
                result_schema = Schema(
                    "from " + self.name, [Column(**properties)]
                )

        return ListContainer(
            "from " + self.name, data=rows, schema=result_schema
        )
示例#5
0
 def __init__(self, nested_path):
     """
     START WITH THE TWO BUILT-IN COLUMNS: "." (STORED AS _source) AND THE GUID

     :param nested_path: FULL NESTED PATH; ITS LAST ENTRY MUST BE "."
     """
     if nested_path[-1] != ".":
         Log.error("Expecting full nested path")

     # BOTH COLUMNS SHARE THE SAME INDEX AND NESTED PATH
     location = {"es_index": nested_path, "nested_path": nested_path}
     source_column = Column(
         name=".",
         jx_type=OBJECT,
         es_type=OBJECT,
         es_column="_source",
         **location
     )
     guid_column = Column(
         name=GUID,
         jx_type=STRING,
         es_type='TEXT',
         es_column=GUID,
         **location
     )

     self.namespace = {".": {source_column}, GUID: {guid_column}}
     self._columns = [source_column, guid_column]
     self.nested_path = nested_path
示例#6
0
    def __init__(self, db):
        """
        READ THE TABLES AND COLUMNS OF db AND RECORD THEM IN self._snowflakes
        AND self._columns

        :param db: DATABASE HANDLE; ONLY db.query(sql) IS USED HERE, AND ITS
                   RESULT MUST HAVE header AND data
        """
        self.db = db
        self._snowflakes = {}  # MAP FROM BASE TABLE TO LIST OF NESTED TABLES
        self._columns = ColumnList()

        # FIND ALL TABLES
        result = self.db.query(
            "SELECT * FROM sqlite_master WHERE type='table' ORDER BY name")
        # ONE DICT PER ROW, KEYED BY THE HEADER NAMES
        tables = wrap([{k: d[i]
                        for i, k in enumerate(result.header)}
                       for d in result.data])
        # NESTED PATH OF THE PREVIOUS TABLE; TABLES ARRIVE ORDERED BY NAME
        last_nested_path = []
        for table in tables:
            # TABLES PREFIXED WITH "__" ARE SKIPPED
            if table.name.startswith("__"):
                continue
            base_table, nested_path = tail_field(table.name)

            # FIND COMMON NESTED PATH SUFFIX
            # KEEP THE TAIL OF THE PREVIOUS PATH STARTING AT THE FIRST ENTRY
            # THAT PREFIXES THIS TABLE'S PATH; OTHERWISE START FROM NOTHING
            for i, p in enumerate(last_nested_path):
                if startswith_field(nested_path, p):
                    last_nested_path = last_nested_path[i:]
                    break
            else:
                last_nested_path = []

            full_nested_path = [nested_path] + last_nested_path
            # NOTE(review): THE DEFAULT STORED HERE IS A LIST OF PATH STRINGS, BUT
            # A TableDesc IS APPENDED TO THAT SAME LIST RIGHT AFTER - CONFIRM THE
            # MIXED CONTENT IS INTENDED
            nested_tables = self._snowflakes.setdefault(
                base_table, [nested_path] + last_nested_path)
            nested_tables.append(
                jx_base.TableDesc(name=table.name,
                                  nested_path=full_nested_path))

            # LOAD THE COLUMNS
            command = "PRAGMA table_info" + sql_iso(quote_column(table.name))
            details = self.db.query(command)

            for cid, name, dtype, notnull, dfft_value, pk in details.data:
                # COLUMNS PREFIXED WITH "__" ARE SKIPPED
                if name.startswith("__"):
                    continue
                cname, ctype = untyped_column(name)
                self._columns.add(
                    Column(
                        name=cname,  # I THINK COLUMNS HAVE THEIR FULL PATH
                        # PREFER THE TYPE ENCODED IN THE COLUMN NAME, FALL BACK
                        # TO THE DECLARED SQL TYPE
                        jx_type=coalesce(
                            ctype, {
                                "TEXT": "string",
                                "REAL": "number",
                                "INTEGER": "integer"
                            }.get(dtype)),
                        nested_path=full_nested_path,
                        es_type=dtype,
                        es_column=name,
                        es_index=table.name))
            last_nested_path = full_nested_path
示例#7
0
    def _load_from_database(self):
        """
        READ ALL TABLES AND THEIR COLUMNS FROM self.db, REGISTERING EACH TABLE'S
        NESTED PATH IN self._snowflakes AND EACH COLUMN THROUGH self.add()
        """
        # FIND ALL TABLES
        find_tables = sql_query({
            "from": "sqlite_master",
            "where": {
                "eq": {
                    "type": "table"
                }
            },
            "orderby": "name"
        })
        result = self.db.query(find_tables)
        tables = wrap([dict(zip(result.header, row)) for row in result.data])

        previous_path = ["."]
        for table in tables:
            # TABLES PREFIXED WITH "__" ARE SKIPPED
            if table.name.startswith("__"):
                continue
            base_table, nested_path = tail_field(table.name)

            # FIND COMMON NESTED PATH SUFFIX
            if nested_path == ".":
                previous_path = []
            else:
                # FIRST ENTRY OF THE PREVIOUS PATH THAT PREFIXES THIS ONE
                start = next(
                    (
                        i for i, p in enumerate(previous_path)
                        if startswith_field(nested_path, p)
                    ),
                    None
                )
                previous_path = [] if start is None else previous_path[start:]

            full_nested_path = [nested_path] + previous_path
            self._snowflakes[literal_field(base_table)] += [full_nested_path]

            # LOAD THE COLUMNS
            for _cid, name, dtype, _notnull, _default, _pk in self.db.about(table.name):
                # COLUMNS PREFIXED WITH "__" ARE SKIPPED
                if name.startswith("__"):
                    continue
                cname, ctype = untyped_column(name)
                jx_type = coalesce(sql_type_to_json_type.get(ctype), IS_NULL)
                self.add(
                    Column(
                        name=cname,
                        jx_type=jx_type,
                        nested_path=full_nested_path,
                        es_type=dtype,
                        es_column=name,
                        es_index=table.name,
                        last_updated=Date.now()
                    )
                )
            previous_path = full_nested_path
示例#8
0
def doc_to_column(doc):
    """
    CONVERT A STORED COLUMN DOCUMENT TO A Column, REPAIRING KNOWN MISTAKES FIRST

    THE REPAIRS ARE APPLIED IN ORDER; LATER REPAIRS DEPEND ON EARLIER ONES

    :param doc: THE COLUMN DOCUMENT (PASSED THROUGH untyped() AND wrap())
    :return: Column, OR None WHEN THE DOCUMENT CAN NOT BE CONVERTED
    """
    try:
        doc = wrap(untyped(doc))

        # MISSING last_updated IS BACKDATED BY ONE YEAR
        if not doc.last_updated:
            doc.last_updated = Date.now() - YEAR

        # MISSING es_type CAN ONLY BE INFERRED FOR OBJECT COLUMNS
        if doc.es_type == None:
            if doc.jx_type == OBJECT:
                doc.es_type = "object"
            else:
                Log.warning("{{doc}} has no es_type", doc=doc)

        # NESTED COLUMNS ALWAYS GET multi=1001
        doc.multi = 1001 if doc.es_type == "nested" else doc.multi

        # es_column ENDING WITH THE NESTED MARKER MUST BE DECLARED NESTED
        doc.nested_path = tuple(listwrap(doc.nested_path))
        if last(split_field(
                doc.es_column)) == NESTED_TYPE and doc.es_type != "nested":
            doc.es_type = "nested"
            doc.jx_type = NESTED
            doc.multi = 1001
            doc.last_updated = Date.now()

        # DROP THE LAST STEP WHEN "." IS THE SECOND-LAST STEP
        expected_nested_path = get_nested_path(doc.es_column)
        if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.':
            doc.nested_path = doc.nested_path[:-1]

        # MAKE nested_path AGREE WITH WHAT es_column IMPLIES
        if untype_path(doc.es_column) == doc.es_column:
            # es_column CARRIES NO TYPE MARKERS
            if doc.nested_path != (".", ):
                if doc.es_index in {"repo"}:
                    pass
                else:
                    Log.note("not expected")
                    doc.nested_path = expected_nested_path
        else:
            if doc.nested_path != expected_nested_path:
                doc.nested_path = expected_nested_path
        return Column(**doc)
    except Exception:
        # BEST-EFFORT CLEANUP: THE DOCUMENT THAT JUST FAILED CAN FAIL AGAIN HERE
        # (doc MAY STILL BE THE RAW INPUT, AND Column() MAY REJECT IT A SECOND
        # TIME).  THIS FUNCTION REPORTS FAILURE BY RETURNING None, SO THE
        # CLEANUP MUST NOT RAISE
        try:
            doc.nested_path = ["."]
            mark_as_deleted(Column(**doc))
        except Exception:
            Log.warning("could not mark bad column as deleted")
        return None
示例#9
0
def _get_schema_from_list(frum, table_name, prefix_path, nested_path, columns):
    """
    WALK THE RECORDS IN frum AND RECORD (OR WIDEN) THE TYPE OF EVERY PROPERTY
    FOUND, RECURSING INTO OBJECTS AND NESTED ARRAYS

    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param prefix_path: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition
    :return: None; columns IS UPDATED IN PLACE
    """

    def column_for(full_name):
        # LOOK UP THE COLUMN FOR full_name, REGISTERING AN "undefined" ONE WHEN
        # THE LOOKUP COMES BACK FALSY
        found = columns[full_name]
        if not found:
            found = Column(names={table_name: full_name},
                           es_column=full_name,
                           es_index=".",
                           type="undefined",
                           nested_path=nested_path)
            columns.add(found)
        return found

    for d in frum:
        row_type = _type_to_name[d.__class__]
        if row_type != "object":
            # THE ROW ITSELF IS A VALUE: IT BELONGS TO THE PARENT PATH
            column = column_for(join_field(prefix_path))
            column.type = _merge_type[column.type][row_type]
            continue

        for name, value in d.items():
            child_path = prefix_path + [name]
            column = column_for(join_field(child_path))

            if not isinstance(value, list):
                this_type = _type_to_name[value.__class__]
            elif not value:
                this_type = "undefined"
            else:
                # A LIST IS TYPED BY ITS FIRST ELEMENT; ONLY LISTS WITH MORE
                # THAN ONE OBJECT ARE CALLED "nested"
                this_type = _type_to_name[value[0].__class__]
                if len(value) > 1 and this_type == "object":
                    this_type = "nested"

            column.type = _merge_type[column.type][this_type]

            if this_type == "object":
                _get_schema_from_list([value], table_name, child_path,
                                      nested_path, columns)
            elif this_type == "nested":
                # PUSH THE NEW ARRAY PATH ON THE FRONT OF THE NESTED PATH
                outer_paths = listwrap(nested_path)
                deeper_path = unwraplist(
                    [join_field(split_field(outer_paths[0]) + [name])] +
                    outer_paths)
                _get_schema_from_list(value, table_name, child_path,
                                      deeper_path, columns)
示例#10
0
    def test_column_constraints(self):
        """
        A NESTED COLUMN WITH multi=2 CAN BE BUILT; AN INTEGER COLUMN ON THE SAME
        es_column IS REJECTED WHETHER multi IS 1, 0, OR NOT GIVEN
        """
        # THIS CONSTRUCTION IS NOT GUARDED: ANY EXCEPTION FAILS THE TEST
        Column(
            name="name",
            es_column="es_column.~N~",
            es_index="es_index",
            es_type="nested",
            jx_type=NESTED,
            cardinality=1,
            multi=2,
            nested_path=".",
            last_updated=Date.now(),
        )

        # EACH VARIATION ONLY CHANGES HOW multi IS SUPPLIED
        for variation in ({"multi": 1}, {"multi": 0}, {}):
            properties = dict(
                name="name",
                es_column="es_column.~N~",
                es_index="es_index",
                es_type="es_type",
                jx_type=INTEGER,
                nested_path=".",
                last_updated=Date.now(),
            )
            properties.update(variation)
            with self.assertRaises(Exception):
                Column(**properties)
示例#11
0
    def test_change_column_property(self):
        """
        SETTING multi TO None ON AN EXISTING COLUMN MUST RAISE
        """
        # NOTE(review): jx_type IS GIVEN NESTED_TYPE HERE, WHILE OTHER CODE PAIRS
        # es_type="nested" WITH NESTED - CONFIRM WHICH CONSTANT IS INTENDED
        properties = dict(
            name="name",
            es_column="es_column.~N~",
            es_index="es_index",
            es_type="nested",
            jx_type=NESTED_TYPE,
            multi=1001,
            nested_path=["."],
            last_updated=Date.now(),
        )
        row = Column(**properties)

        with self.assertRaises(Exception):
            row.multi = None
示例#12
0
    def _load_from_database(self):
        """
        READ ALL TABLES AND THEIR COLUMNS FROM self.db, REGISTERING EACH TABLE'S
        NESTED PATH IN self._snowflakes AND EACH COLUMN THROUGH self.add()
        """
        # FIND ALL TABLES
        result = self.db.query(
            "SELECT * FROM sqlite_master WHERE type='table' ORDER BY name")
        tables = wrap([dict(zip(result.header, row)) for row in result.data])

        previous_path = []
        for table in tables:
            # TABLES PREFIXED WITH "__" ARE SKIPPED
            if table.name.startswith("__"):
                continue
            base_table, nested_path = tail_field(table.name)

            # FIND COMMON NESTED PATH SUFFIX
            # (FIRST ENTRY OF THE PREVIOUS PATH THAT PREFIXES THIS ONE)
            start = next(
                (
                    i for i, p in enumerate(previous_path)
                    if startswith_field(nested_path, p)
                ),
                None
            )
            previous_path = [] if start is None else previous_path[start:]

            full_nested_path = [nested_path] + previous_path
            self._snowflakes[literal_field(base_table)] += [full_nested_path]

            # LOAD THE COLUMNS
            table_info = "PRAGMA table_info" + sql_iso(quote_column(table.name))
            details = self.db.query(table_info)

            for _cid, name, dtype, _notnull, _default, _pk in details.data:
                # COLUMNS PREFIXED WITH "__" ARE SKIPPED
                if name.startswith("__"):
                    continue
                cname, ctype = untyped_column(name)
                jx_type = coalesce(sql_type_to_json_type.get(ctype), IS_NULL)
                self.add(
                    Column(
                        name=cname,
                        jx_type=jx_type,
                        nested_path=full_nested_path,
                        es_type=dtype,
                        es_column=name,
                        es_index=table.name,
                        last_updated=Date.now()
                    )
                )
            previous_path = full_nested_path
示例#13
0
    def create_snowflake(self, fact_name, uid=UID):
        """
        CREATE THE FACT TABLE AND RETURN THE Facts BUILT ON IT
        :param fact_name:  NAME FOR THE CENTRAL FACTS
        :param uid: name, or list of names, for the GUID
        :return: Facts
        """
        self.add_table_to_schema(["."])

        # EVERY uid OTHER THAN THE BUILT-IN UID BECOMES A STRING COLUMN
        # NOTE(review): new_columns IS COLLECTED BUT NOT READ AGAIN; THE SQL BELOW
        # USES THE COLUMNS OF self.tables["."] - CONFIRM THAT IS INTENDED
        new_columns = []
        for u in listwrap(uid):
            if u == UID:
                continue
            c = Column(
                name=u,
                jx_type=STRING,
                es_column=typed_column(u, "string"),
                es_index=fact_name
            )
            self.add_column_to_schema(c)
            new_columns.append(c)

        # GUID AND UID FIRST, THEN THE SCHEMA COLUMNS, THEN THE PRIMARY KEY
        # OVER ALL OF THEM
        definitions = [quoted_GUID + " TEXT ", quoted_UID + " INTEGER"]
        definitions.extend(
            quote_column(c.es_column) + " " + json_type_to_sqlite_type[c.jx_type]
            for c in self.tables["."].schema.columns
        )
        key_columns = [quoted_GUID, quoted_UID]
        key_columns.extend(
            quote_column(c.es_column)
            for c in self.tables["."].schema.columns
        )
        definitions.append("PRIMARY KEY " + sql_iso(sql_list(key_columns)))

        command = (
            "CREATE TABLE " + quote_column(fact_name) +
            sql_iso(sql_list(definitions))
        )
        self.db.execute(command)

        snowflake = Snowflake(fact_name, self)
        return Facts(self, snowflake)
示例#14
0
    def create_or_replace_facts(self, fact_name, uid=UID):
        """
        REMOVE ANY EXISTING SNOWFLAKE OF THIS NAME, THEN CREATE THE FACT TABLE
        :param fact_name:  NAME FOR THE CENTRAL FACTS
        :param uid: name, or list of names, for the GUID
        :return: Facts
        """
        self.remove_snowflake(fact_name)
        self._snowflakes[fact_name] = ["."]

        # EVERY uid OTHER THAN THE BUILT-IN UID BECOMES A STRING COLUMN
        new_columns = []
        for u in listwrap(uid):
            if u == UID:
                continue
            c = Column(
                name=u,
                jx_type=mo_json.STRING,
                es_column=typed_column(u, json_type_to_sql_type[mo_json.STRING]),
                es_type=json_type_to_sqlite_type[mo_json.STRING],
                es_index=fact_name,
                last_updated=Date.now()
            )
            self.add_column_to_schema(c)
            new_columns.append(c)

        # GUID AND UID FIRST, THEN THE NEW COLUMNS, THEN THE PRIMARY KEY OVER
        # ALL OF THEM
        definitions = [quoted_GUID + " TEXT ", quoted_UID + " INTEGER"]
        definitions.extend(
            quote_column(c.es_column) + " " + c.es_type for c in new_columns
        )
        key_columns = [quoted_GUID, quoted_UID]
        key_columns.extend(quote_column(c.es_column) for c in new_columns)
        definitions.append("PRIMARY KEY " + sql_iso(sql_list(key_columns)))

        command = (
            "CREATE TABLE " + quote_column(fact_name) +
            sql_iso(sql_list(definitions))
        )

        with self.db.transaction() as txn:
            txn.execute(command)

        snowflake = Snowflake(fact_name, self)
        return Facts(self, snowflake)
示例#15
0
    def __init__(
        self,
        host,
        index,
        type=None,
        name=None,
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None
    ):
        """
        CONNECT TO AN ELASTICSEARCH INDEX AND PREPARE ITS METADATA

        :param host: NOT READ DIRECTLY HERE (PRESUMABLY CARRIED IN kwargs - CONFIRM)
        :param index: INDEX (OR ALIAS) NAME; ALSO THE DEFAULT FOR name
        :param type: NOT READ DIRECTLY HERE
        :param name: CONTAINER NAME, DEFAULTS TO index
        :param port: NOT READ DIRECTLY HERE
        :param read_only: True USES AN Alias, False ASKS THE Cluster FOR THE INDEX
        :param timeout: NOT READ DIRECTLY HERE
        :param wait_for_active_shards: NOT READ DIRECTLY HERE
        :param typed: EXPECTED TYPED MODE; None MEANS DETECT IT FROM THE COLUMNS
        :param kwargs: ALL SETTINGS, HANDED TO THE ELASTICSEARCH OBJECTS
        """
        Container.__init__(self)
        if not container.config.default:
            # THE FIRST CONTAINER BUILT BECOMES THE DEFAULT CONFIGURATION
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = coalesce(name, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
        else:
            cluster = elasticsearch.Cluster(kwargs=kwargs)
            self.es = cluster.get_index(read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        # THE INDEX IS TYPED WHEN ANY ABSOLUTE COLUMN IS THE EXISTS MARKER
        is_typed = any(
            c.es_column == EXISTS_TYPE for c in self.snowflake.columns
        )

        if typed != None and is_typed != typed:
            Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
        # SWITCH ON TYPED MODE WHEN NOT TOLD OTHERWISE
        self.typed = is_typed if typed == None else typed

        # NOTE(review): THIS TESTS THE typed ARGUMENT, NOT self.typed, SO THE
        # DEFAULT (None) ALWAYS TAKES THIS BRANCH - CONFIRM THAT IS INTENDED
        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {".": None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                # FOLLOW THE PARENT LINKS FROM v UP TO THE ROOT
                chain = []
                while v:
                    chain.append(v)
                    v = all_paths[v]
                return chain

            every_step = set(
                step
                for path in self.snowflake.query_paths
                for step in path
            )
            ordered_steps = sort_using_key(
                every_step, key=lambda p: len(split_field(p))
            )
            for step in sorted(ordered_steps):
                if step in all_paths:
                    continue
                # THE PARENT IS THE DEEPEST KNOWN PATH THAT PREFIXES THIS STEP
                best = '.'
                for candidate in all_paths.keys():
                    if startswith_field(step, candidate) and startswith_field(candidate, best):
                        best = candidate
                all_paths[step] = best

            for p in all_paths.keys():
                nested_path = nested_path_of(all_paths[p]) or ['.']
                self.namespace.meta.columns.add(Column(
                    name=p,
                    es_column=p,
                    es_index=self.name,
                    es_type=OBJECT,
                    jx_type=EXISTS,
                    nested_path=nested_path,
                    last_updated=Date.now()
                ))
def doc_to_column(doc):
    """
    CONVERT A STORED COLUMN DOCUMENT TO A Column, REPAIRING KNOWN MISTAKES FIRST

    THE FIXES ARE APPLIED IN ORDER; MOST OF THEM SET last_updated TO now WHEN
    THEY CHANGE SOMETHING

    :param doc: THE COLUMN DOCUMENT (PASSED THROUGH untyped() AND to_data())
    :return: Column, OR None WHEN THE DOCUMENT CAN NOT BE CONVERTED
    """
    # ONE TIMESTAMP SHARED BY THE FIXES BELOW AND BY mark_as_deleted()
    now = Date.now()
    try:
        doc = to_data(untyped(doc))

        # I HAVE MANAGED TO MAKE MANY MISTAKES WRITING COLUMNS TO ES. HERE ARE THE FIXES

        # FIX: MISSING last_updated IS BACKDATED BY ONE YEAR
        if not doc.last_updated:
            doc.last_updated = Date.now() - YEAR

        # FIX: MISSING es_type CAN ONLY BE INFERRED FOR OBJECT COLUMNS
        if doc.es_type == None:
            if doc.jx_type == OBJECT:
                doc.es_type = "object"
            else:
                Log.warning("{{doc}} has no es_type", doc=doc)

        # FIX: NESTED COLUMNS GET multi=1001, EVERYTHING ELSE DEFAULTS TO 1
        if doc.es_type == "nested":
            doc.multi = 1001
        if doc.multi == None:
            doc.multi = 1

        # FIX: es_column ENDING WITH THE NESTED MARKER
        if doc.es_column.endswith("." + NESTED_TYPE):
            if doc.jx_type == OBJECT:
                doc.jx_type = NESTED
                doc.last_updated = now
            # NOTE(review): THIS ASSIGNS "nested" TO A VALUE THAT IS ALREADY
            # "nested", SO ITS ONLY EFFECT IS TO SET last_updated; THE != CASE
            # IS HANDLED BY THE NEXT FIX - CONFIRM WHAT WAS INTENDED
            if doc.es_type == "nested":
                doc.es_type = "nested"
                doc.last_updated = now

        # FIX: es_column ENDING WITH THE NESTED MARKER MUST BE DECLARED NESTED
        doc.nested_path = tuple(listwrap(doc.nested_path))
        if last(split_field(
                doc.es_column)) == NESTED_TYPE and doc.es_type != "nested":
            doc.es_type = "nested"
            doc.jx_type = NESTED
            doc.multi = 1001
            doc.last_updated = now

        # FIX: DROP THE LAST STEP WHEN "." IS THE SECOND-LAST STEP
        expected_nested_path = get_nested_path(doc.es_column)
        if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.':
            doc.nested_path = doc.nested_path[:-1]
            doc.last_updated = now

        # FIX: MAKE nested_path AGREE WITH WHAT es_column IMPLIES
        if untype_path(doc.es_column) == doc.es_column:
            # es_column CARRIES NO TYPE MARKERS
            if doc.nested_path != (".", ):
                if doc.es_index in {"repo"}:
                    pass
                else:
                    Log.note("not expected")
                    doc.nested_path = expected_nested_path
                    doc.last_updated = now
        else:
            if doc.nested_path != expected_nested_path:
                doc.nested_path = expected_nested_path
                doc.last_updated = now

        # FIX: es_column ENDING WITH THE EXISTS MARKER IS AN EXISTS COLUMN
        if last(split_field(doc.es_column)) == EXISTS_TYPE:
            if doc.jx_type != EXISTS:
                doc.jx_type = EXISTS
                doc.last_updated = now

            if doc.cardinality == None:
                doc.cardinality = 1
                doc.last_updated = now

        # FIX: STRUCT COLUMNS ONLY HAVE CARDINALITY 0 OR 1
        if doc.jx_type in STRUCT:
            if doc.cardinality not in [0, 1]:
                doc.cardinality = 1  # DO NOT KNOW IF EXISTS OR NOT
                doc.last_updated = now

        return Column(**doc)
    except Exception as e:
        # BEST EFFORT: TRY TO MARK THE BAD DOCUMENT AS DELETED, AND IGNORE ANY
        # FAILURE TO DO SO; EITHER WAY THE RESULT IS None
        try:
            mark_as_deleted(Column(**doc), now)
        except Exception:
            pass
        return None
示例#17
0
    def update(self, command):
        """
        UPDATE THE ROWS OF THIS TABLE THAT MATCH command.where

        STEPS: REJECT DEEP UPDATES, ADD COLUMNS FOR NEW PROPERTIES, REPLACE THE
        ROWS OF ANY NESTED VALUES, THEN RUN ONE UPDATE STATEMENT FOR THE
        REMAINING (NON-NESTED) PROPERTIES

        :param command:  EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT
        """
        command = wrap(command)
        clear_columns = set(listwrap(command['clear']))

        # REJECT DEEP UPDATES
        touched_columns = command.set.keys() | clear_columns
        for c in self.schema.columns:
            if c.name in touched_columns and len(c.nested_path) > 1:
                Log.error("Deep update not supported")

        # ADD NEW COLUMNS
        where = jx_expression(command.where) or TRUE
        _vars = where.vars()
        # MAP EACH VARIABLE IN THE FILTER TO ITS (NON-STRUCT) DATABASE COLUMN
        _map = {
            v: c.es_column
            for v in _vars for c in self.columns.get(v, Null)
            if c.jx_type not in STRUCT
        }
        where_sql = where.map(_map).to_sql(self.schema)[0].sql.b
        new_columns = set(command.set.keys()) - set(
            c.name for c in self.schema.columns)
        for new_column_name in new_columns:
            nested_value = command.set[new_column_name]
            ctype = get_jx_type(nested_value)
            column = Column(name=new_column_name,
                            jx_type=ctype,
                            es_index=self.name,
                            # json_type_to_sqlite_type IS A MAPPING (IT IS
                            # SUBSCRIPTED EVERYWHERE ELSE), SO IT IS SUBSCRIPTED
                            # HERE TOO RATHER THAN CALLED
                            es_type=json_type_to_sqlite_type[ctype],
                            es_column=typed_column(new_column_name, ctype),
                            last_updated=Date.now())
            self.add_column(column)

        # UPDATE THE NESTED VALUES
        # NOTE(review): nested_tables, abs_schema AND schema ARE NOT DEFINED IN
        # THIS METHOD - CONFIRM THEY EXIST AT MODULE LEVEL, OTHERWISE THIS
        # BRANCH CAN NOT RUN
        for nested_column_name, nested_value in command.set.items():
            if get_jx_type(nested_value) == "nested":
                nested_table_name = concat_field(self.name, nested_column_name)
                nested_table = nested_tables[nested_column_name]
                self_primary_key = sql_list(
                    quote_column(c.es_column) for u in self.uid
                    for c in self.columns[u])
                extra_key_name = UID + text(len(self.uid))
                extra_key = [e
                             for e in nested_table.columns[extra_key_name]][0]

                # DELETE THE EXISTING NESTED ROWS THAT BELONG TO THE MATCHED PARENTS
                sql_command = (
                    SQL_DELETE + SQL_FROM + quote_column(nested_table.name) +
                    SQL_WHERE + "EXISTS" +
                    sql_iso(SQL_SELECT + SQL_ONE + SQL_FROM +
                            sql_alias(quote_column(nested_table.name), "n") +
                            SQL_INNER_JOIN +
                            sql_iso(SQL_SELECT + self_primary_key + SQL_FROM +
                                    quote_column(abs_schema.fact) + SQL_WHERE +
                                    where_sql) + " t ON " +
                            SQL_AND.join(
                                quote_column("t", c.es_column) + SQL_EQ +
                                quote_column("n", c.es_column)
                                for u in self.uid for c in self.columns[u])))
                self.db.execute(sql_command)

                # INSERT NEW RECORDS
                if not nested_value:
                    continue

                doc_collection = {}
                for d in listwrap(nested_value):
                    nested_table.flatten(d,
                                         Data(),
                                         doc_collection,
                                         path=nested_column_name)

                prefix = SQL_INSERT + quote_column(nested_table.name) + sql_iso(
                    sql_list([self_primary_key] + [quote_column(extra_key)] + [
                        quote_column(c.es_column)
                        for c in doc_collection.get(".", Null).active_columns
                    ]))

                # BUILD THE PARENT TABLES
                parent = (SQL_SELECT + self_primary_key + SQL_FROM +
                          quote_column(abs_schema.fact) + SQL_WHERE +
                          jx_expression(command.where).to_sql(schema))

                # BUILD THE RECORDS
                children = SQL_UNION_ALL.join(
                    SQL_SELECT +
                    sql_alias(quote_value(i), extra_key.es_column) +
                    SQL_COMMA + sql_list(
                        sql_alias(quote_value(row[c.name]),
                                  quote_column(c.es_column))
                        for c in doc_collection.get(".", Null).active_columns)
                    for i, row in enumerate(
                        doc_collection.get(".", Null).rows))

                # EVERY MATCHED PARENT IS PAIRED WITH EVERY NEW CHILD RECORD
                sql_command = (prefix + SQL_SELECT + sql_list([
                    quote_column("p", c.es_column) for u in self.uid
                    for c in self.columns[u]
                ] + [quote_column("c", extra_key)] + [
                    quote_column("c", c.es_column)
                    for c in doc_collection.get(".", Null).active_columns
                ]) + SQL_FROM + sql_iso(parent) + " p" + SQL_INNER_JOIN +
                               sql_iso(children) + " c" + SQL_ON + SQL_TRUE)

                self.db.execute(sql_command)

                # THE CHILD COLUMNS COULD HAVE EXPANDED
                # ADD COLUMNS TO SELF
                for n, cs in nested_table.columns.items():
                    for c in cs:
                        column = Column(name=c.name,
                                        jx_type=c.jx_type,
                                        es_type=c.es_type,
                                        es_index=c.es_index,
                                        es_column=c.es_column,
                                        nested_path=[nested_column_name] +
                                        c.nested_path,
                                        last_updated=Date.now())
                        if c.name not in self.columns:
                            self.columns[column.name] = {column}
                        elif c.jx_type not in [
                                c.jx_type for c in self.columns[c.name]
                        ]:
                            self.columns[column.name].add(column)

        # ONE STATEMENT FOR THE NON-NESTED, TOP-LEVEL COLUMNS: ASSIGN THE
        # NON-NULL VALUES FROM command.set, THEN NULL THE CLEARED COLUMNS
        # NOTE(review): A CLEARED COLUMN IS ONLY NULLED WHEN command.set ALSO HAS
        # A NON-NULL VALUE FOR IT - CONFIRM THIS CONDITION IS NOT INVERTED
        command = ConcatSQL(
            SQL_UPDATE, quote_column(self.name), SQL_SET,
            sql_list([
                quote_column(c.es_column) + SQL_EQ +
                quote_value(get_if_type(v, c.jx_type))
                for c in self.schema.columns
                if c.jx_type != NESTED and len(c.nested_path) == 1
                for v in [command.set[c.name]] if v != None
            ] + [
                quote_column(c.es_column) + SQL_EQ + SQL_NULL
                for c in self.schema.columns
                if (c.name in clear_columns and command.set[c.name] != None
                    and c.jx_type != NESTED and len(c.nested_path) == 1)
            ]), SQL_WHERE, where_sql)

        with self.db.transaction() as t:
            t.execute(command)
示例#18
0
        def _flatten(data,
                     uid,
                     parent_id,
                     order,
                     full_path,
                     nested_path,
                     row=None,
                     guid=None):
            """
            Recursively pull `data` apart into flat rows, grouped by nested path.

            Rows are appended to the enclosing `doc_collection` (looked up by
            `nested_path[0]`).  Columns that are not yet known are created here
            and the schema changes they imply are appended to the enclosing
            `required_changes` list.

            :param data: the data we are pulling apart
            :param uid: the uid we are giving this doc
            :param parent_id: the parent id of this (sub)doc
            :param order: the number of siblings before this one
            :param full_path: path to this (sub)doc
            :param nested_path: list of paths, deepest first
            :param row: we will be filling this; when falsy a new row is created
            :param guid: stored under the GUID key of a newly created row
            :return: None (all results are side effects on `row`,
                     `doc_collection`, `snowflake` and `required_changes`)
            """
            # `table` is used as the es_index of any column created below
            table = concat_field(self.name, nested_path[0])
            insertion = doc_collection[nested_path[0]]
            if not row:
                # NO ROW HANDED IN (OR IT IS EMPTY): START A NEW ONE AND REGISTER IT
                row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order}
                insertion.rows.append(row)

            if is_data(data):
                # PAIR EVERY LEAF WITH ITS PATH, PREFIXED BY full_path
                items = [(concat_field(full_path, k), v)
                         for k, v in wrap(data).leaves()]
            else:
                # PRIMITIVE VALUES
                items = [(full_path, data)]

            for cname, v in items:
                jx_type = get_jx_type(v)
                if jx_type is None:
                    # VALUES WITHOUT A jx TYPE ARE NOT STORED
                    continue

                # RE-FETCH: A PREVIOUS ITERATION MAY HAVE REBOUND `insertion`
                insertion = doc_collection[nested_path[0]]
                # LOOK FOR AN EXISTING COLUMN THAT CAN HOLD THIS VALUE
                if jx_type == NESTED:
                    # ANY STRUCT COLUMN WHOSE UNTYPED NAME MATCHES
                    c = first(cc for cc in insertion.active_columns +
                              snowflake.columns if cc.jx_type in STRUCT
                              and untyped_column(cc.name)[0] == cname)
                else:
                    # SAME NAME AND SAME jx TYPE
                    c = first(cc for cc in insertion.active_columns +
                              snowflake.columns
                              if cc.jx_type == jx_type and cc.name == cname)

                if isinstance(c, list):
                    Log.error("confused")

                if not c:
                    # NO SUCH COLUMN YET: CREATE IT AND RECORD THE SCHEMA CHANGE
                    # WHAT IS THE NESTING LEVEL FOR THIS PATH?
                    # NOTE(review): the deeper_nested_path computed by this loop is
                    # never read; the new Column below uses `nested_path` - confirm
                    deeper_nested_path = "."
                    for path in snowflake.query_paths:
                        if startswith_field(cname, path[0]) and len(
                                deeper_nested_path) < len(path):
                            deeper_nested_path = path

                    c = Column(name=cname,
                               jx_type=jx_type,
                               es_type=json_type_to_sqlite_type.get(
                                   jx_type, jx_type),
                               es_column=typed_column(
                                   cname, json_type_to_sql_type.get(jx_type)),
                               es_index=table,
                               cardinality=0,
                               nested_path=nested_path,
                               last_updated=Date.now())
                    if jx_type == NESTED:
                        # A NEW NESTED COLUMN BECOMES A NEW QUERY PATH
                        snowflake.query_paths.append(c.es_column)
                        required_changes.append({'nest': c})
                    else:
                        insertion.active_columns.add(c)
                        required_changes.append({"add": c})
                elif c.jx_type == NESTED and jx_type == OBJECT:
                    # ALWAYS PROMOTE OBJECTS TO NESTED
                    jx_type = NESTED
                    v = [v]
                elif len(c.nested_path) < len(nested_path):
                    # THE KNOWN COLUMN IS SHALLOWER THAN WHERE WE ARE NOW:
                    # REPLACE IT WITH A COLUMN AT THE CURRENT (DEEPER) NESTING
                    # NOTE(review): from_doc is None when the path is missing from
                    # doc_collection; the attribute access below would then fail - confirm
                    from_doc = doc_collection.get(c.nested_path[0], None)
                    column = c.es_column
                    from_doc.active_columns.remove(c)
                    snowflake._remove_column(c)
                    required_changes.append({"nest": c})
                    deep_c = Column(name=cname,
                                    jx_type=jx_type,
                                    es_type=json_type_to_sqlite_type.get(
                                        jx_type, jx_type),
                                    es_column=typed_column(
                                        cname,
                                        json_type_to_sql_type.get(jx_type)),
                                    es_index=table,
                                    nested_path=nested_path,
                                    last_updated=Date.now())
                    snowflake._add_column(deep_c)
                    snowflake._drop_column(c)
                    # NOTE(review): second remove of the same column (see above);
                    # confirm the container tolerates removing an absent item
                    from_doc.active_columns.remove(c)

                    # COPY VALUES ALREADY COLLECTED FOR THE OLD COLUMN INTO NEW
                    # ROWS OF THE CURRENT TABLE, PARENTED BY THE ROW THEY CAME FROM
                    for r in from_doc.rows:
                        r1 = unwrap(r)
                        if column in r1:
                            row1 = {
                                UID: self.container.next_uid(),
                                PARENT: r1["__id__"],
                                ORDER: 0,
                                column: r1[column]
                            }
                            insertion.rows.append(row1)
                elif len(c.nested_path) > len(nested_path):
                    # THE KNOWN COLUMN IS DEEPER THAN WHERE WE ARE NOW: START A ROW
                    # IN THE DEEPER TABLE; `row` IS REBOUND, SO THIS AND LATER
                    # VALUES OF THIS LOOP ARE WRITTEN TO THE NEW ROW
                    insertion = doc_collection[c.nested_path[0]]
                    row = {
                        UID: self.container.next_uid(),
                        PARENT: uid,
                        ORDER: order
                    }
                    insertion.rows.append(row)

                # BE SURE TO NEST VALUES, IF NEEDED
                if jx_type == NESTED:
                    deeper_nested_path = [cname] + nested_path
                    if not doc_collection.get(cname):
                        doc_collection[cname] = Data(active_columns=Queue(),
                                                     rows=[])
                    # EACH ELEMENT IS A CHILD DOC: NEW uid, PARENTED BY THIS DOC,
                    # ORDERED BY ITS POSITION IN THE LIST
                    for i, r in enumerate(v):
                        child_uid = self.container.next_uid()
                        _flatten(r, child_uid, uid, i, cname,
                                 deeper_nested_path)
                elif jx_type == OBJECT:
                    # OBJECTS ARE FLATTENED INTO THE SAME ROW
                    _flatten(v,
                             uid,
                             parent_id,
                             order,
                             cname,
                             nested_path,
                             row=row)
                elif c.jx_type:
                    # PRIMITIVE: STORE UNDER THE COLUMN'S es_column
                    row[c.es_column] = v
示例#19
0
                    )
                elif this_type in {"list", "FlatList"}:
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(
                        value, table_name, full_name, newpath, columns
                    )


METADATA_COLUMNS = (
    [
        Column(
            name=c,
            es_index="meta.columns",
            es_column=c,
            es_type="keyword",
            jx_type=STRING,
            last_updated=Date.now(),
            nested_path=ROOT_PATH,
        )
        for c in [
            "name",
            "es_type",
            "jx_type",
            "nested_path",
            "es_column",
            "es_index",
            "partitions",
        ]
    ]
    + [
示例#20
0
    def _set_op(self, query):
        """
        Run a set-operation `query` and return the result in `query.format`.

        Steps:
          1. collect the columns referenced by the select clauses
          2. build the SQL select clauses for every (step, sub_table) pair of
             `self.snowflake.tables`, plus the sort clauses
          3. run the statement and reassemble the flat result rows into
             documents with `_accumulate_nested`
          4. package the documents as "cube", "table" or (default) "list"

        :param query: the query; this method reads its `select`, `sort`,
                      `where`, `limit` and `format` properties
        :return: Data with `meta.format` set to "cube", "table" or "list"
        """
        # GET LIST OF SELECTED COLUMNS
        vars_ = UNION([
            v.var for select in listwrap(query.select)
            for v in select.value.vars()
        ])
        schema = self.schema
        known_vars = schema.keys()

        # MAP FROM NESTED PATH TO THE COLUMNS REQUESTED FROM THAT PATH
        active_columns = {".": set()}
        for v in vars_:
            for c in schema.leaves(v):
                nest = c.nested_path[0]
                active_columns.setdefault(nest, set()).add(c)

        # ANY VARS MENTIONED WITH NO COLUMNS?
        for v in vars_:
            if not any(startswith_field(cname, v) for cname in known_vars):
                active_columns["."].add(
                    Column(name=v,
                           jx_type=IS_NULL,
                           es_column=".",
                           es_index=".",
                           es_type='NULL',
                           nested_path=["."],
                           last_updated=Date.now()))

        # EVERY COLUMN, AND THE INDEX IT TAKES UP
        index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
        index_to_uid = {}  # FROM NESTED PATH TO THE INDEX OF UID
        sql_selects = [
        ]  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
        nest_to_alias = {
            nested_path[0]: "__" + unichr(ord('a') + i) + "__"
            for i, nested_path in enumerate(self.snowflake.query_paths)
        }

        sorts = []
        if query.sort:
            for select in query.sort:
                col = SQLang[select.value].to_sql(schema)[0]
                for t, sql in col.sql.items():
                    json_type = sql_type_to_json_type[t]
                    if json_type in STRUCT:
                        continue
                    column_number = len(sql_selects)
                    # SQL HAS ABS TABLE REFERENCE
                    column_alias = _make_column_name(column_number)
                    sql_selects.append(sql_alias(sql, column_alias))
                    # THE IS NULL TERM IS ADDED FOR BOTH SORT DIRECTIONS
                    sorts.append(quote_column(column_alias) + SQL_IS_NULL)
                    if select.sort == -1:
                        sorts.append(quote_column(column_alias) + " DESC")
                    else:
                        sorts.append(quote_column(column_alias))

        primary_doc_details = Data()
        # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH
        # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED
        # NOTE(review): this one list object is passed to every
        # nested_doc_details and ColumnMapping below while insert() keeps
        # mutating it - confirm the aliasing is intended
        nested_path = []
        for step, sub_table in self.snowflake.tables:
            nested_path.insert(0, step)
            nested_doc_details = {
                "sub_table": sub_table,
                "children": [],
                "index_to_column": {},
                "nested_path": nested_path
            }

            # INSERT INTO TREE
            if not primary_doc_details:
                primary_doc_details = nested_doc_details
            else:

                # NOTE(review): place() never returns True, so the early exit
                # on `if place(c)` cannot fire and the new node is appended to
                # every matching node that is visited - confirm intended
                def place(parent_doc_details):
                    if startswith_field(step,
                                        parent_doc_details['nested_path'][0]):
                        for c in parent_doc_details['children']:
                            if place(c):
                                return True
                        parent_doc_details['children'].append(
                            nested_doc_details)

                place(primary_doc_details)

            alias = nested_doc_details['alias'] = nest_to_alias[step]

            # WE ALWAYS ADD THE UID
            column_number = index_to_uid[step] = nested_doc_details[
                'id_coord'] = len(sql_selects)
            sql_select = quote_column(alias, UID)
            sql_selects.append(
                sql_alias(sql_select, _make_column_name(column_number)))
            if step != ".":
                # ID AND ORDER FOR CHILD TABLES
                index_to_column[column_number] = ColumnMapping(
                    sql=sql_select,
                    type="number",
                    nested_path=nested_path,
                    column_alias=_make_column_name(column_number))
                column_number = len(sql_selects)
                sql_select = quote_column(alias, ORDER)
                sql_selects.append(
                    sql_alias(sql_select, _make_column_name(column_number)))
                index_to_column[column_number] = ColumnMapping(
                    sql=sql_select,
                    type="number",
                    nested_path=nested_path,
                    column_alias=_make_column_name(column_number))

            # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM
            if step not in active_columns:
                continue

            # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE
            si = 0  # OUTPUT COLUMN POSITION (push_column) OF THE SELECT CLAUSE
            for select in listwrap(query.select):
                try:
                    column_number = len(sql_selects)
                    select.pull = get_column(column_number)
                    db_columns = SQLang[select.value].partial_eval().to_sql(
                        schema)

                    for column in db_columns:
                        for t, unsorted_sql in column.sql.items():
                            json_type = sql_type_to_json_type[t]
                            if json_type in STRUCT:
                                continue
                            column_number = len(sql_selects)
                            column_alias = _make_column_name(column_number)
                            sql_selects.append(
                                sql_alias(unsorted_sql, column_alias))
                            if startswith_field(schema.path, step) and is_op(
                                    select.value, LeavesOp):
                                # ONLY FLATTEN primary_nested_path AND PARENTS, NOT CHILDREN
                                leaf_name = get_property_name(
                                    concat_field(select.name, column.name))
                                mapping = ColumnMapping(
                                    push_name=literal_field(leaf_name),
                                    push_child=".",
                                    push_column_name=leaf_name,
                                    push_column=si,
                                    pull=get_column(column_number),
                                    sql=unsorted_sql,
                                    type=json_type,
                                    column_alias=column_alias,
                                    nested_path=nested_path)
                                index_to_column[
                                    column_number] = nested_doc_details[
                                        'index_to_column'][
                                            column_number] = mapping
                                # EVERY LEAF GETS ITS OWN OUTPUT COLUMN
                                si += 1
                            else:
                                mapping = ColumnMapping(
                                    push_name=select.name,
                                    push_child=column.name,
                                    push_column_name=select.name,
                                    push_column=si,
                                    pull=get_column(column_number),
                                    sql=unsorted_sql,
                                    type=json_type,
                                    column_alias=column_alias,
                                    nested_path=nested_path)
                                index_to_column[
                                    column_number] = nested_doc_details[
                                        'index_to_column'][
                                            column_number] = mapping
                finally:
                    # ADVANCE TO THE NEXT OUTPUT COLUMN, EVEN ON ERROR
                    si += 1

        where_clause = BooleanOp(query.where).partial_eval().to_sql(
            schema, boolean=True)[0].sql.b
        unsorted_sql = self._make_sql_for_one_nest_in_set_op(
            ".", sql_selects, where_clause, active_columns, index_to_column)

        # AFTER THE REQUESTED SORT, ORDER BY THE UID COLUMN OF EVERY TABLE
        for n, _ in self.snowflake.tables:
            sorts.append(quote_column(COLUMN + text(index_to_uid[n])))

        ordered_sql = ConcatSQL(
            (SQL_SELECT, SQL_STAR, SQL_FROM,
             sql_iso(unsorted_sql), SQL_ORDERBY, sql_list(sorts), SQL_LIMIT,
             quote_value(query.limit)))
        result = self.db.query(ordered_sql)

        def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id,
                               parent_id_coord):
            """
            :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
            :param row: CURRENT ROW BEING EXTRACTED
            :param nested_doc_details: {
                    "nested_path": wrap_nested_path(nested_path),
                    "index_to_column": map from column number to column details
                    "children": all possible direct descendants' nested_doc_details
                 }
            :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
            :param parent_id_coord: the column number for the parent id (so we can extract from each row)
            :return: the nested property (usually an array)
            """
            # THE LOCALS BELOW ARE DELIBERATELY NOT CALLED `relative_field`:
            # ASSIGNING TO THAT NAME WOULD MAKE IT LOCAL TO THIS WHOLE FUNCTION
            # AND THE CALL TO THE relative_field() HELPER FURTHER DOWN WOULD FAIL
            previous_doc_id = None
            doc = Null
            output = []
            id_coord = nested_doc_details['id_coord']

            while True:
                doc_id = row[id_coord]

                if doc_id == None or (parent_id_coord is not None and
                                      row[parent_id_coord] != parent_doc_id):
                    rows.append(
                        row
                    )  # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc)
                    return output

                if doc_id != previous_doc_id:
                    # FIRST ROW OF A NEW DOC: BUILD IT FROM THIS TABLE'S COLUMNS
                    previous_doc_id = doc_id
                    doc = Null
                    curr_nested_path = nested_doc_details['nested_path'][0]
                    index_to_column = nested_doc_details[
                        'index_to_column'].items()
                    for i, c in index_to_column:
                        value = row[i]
                        if is_list(query.select) or is_op(
                                query.select.value, LeavesOp):
                            # ASSIGN INNER PROPERTIES
                            rel_field = concat_field(
                                c.push_name, c.push_child)
                        else:  # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                            rel_field = c.push_child

                        if rel_field == ".":
                            if exists(value):
                                doc = value
                        elif exists(value):
                            if doc is Null:
                                doc = Data()
                            doc[rel_field] = value

                for child_details in nested_doc_details['children']:
                    # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS
                    child_id = row[child_details['id_coord']]
                    if child_id is not None:
                        nested_value = _accumulate_nested(
                            rows, row, child_details, doc_id, id_coord)
                        if nested_value != None:
                            push_name = child_details['nested_path'][0]
                            if is_list(query.select) or is_op(
                                    query.select.value, LeavesOp):
                                # ASSIGN INNER PROPERTIES
                                child_field = relative_field(
                                    push_name, curr_nested_path)
                            else:  # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                                child_field = "."

                            if child_field == ".":
                                doc = unwraplist(nested_value)
                            else:
                                doc[child_field] = unwraplist(nested_value)

                output.append(doc)

                try:
                    row = rows.pop()
                except IndexError:
                    # NO MORE ROWS
                    return output

        # ONLY MAPPINGS WITH A push_name END UP IN THE OUTPUT
        cols = tuple(
            [i for i in index_to_column.values() if i.push_name != None])
        rows = list(reversed(unwrap(result.data)))
        if rows:
            row = rows.pop()
            data = _accumulate_nested(rows, row, primary_doc_details, None,
                                      None)
        else:
            data = result.data

        if query.format == "cube":
            if is_list(query.select) or is_op(query.select.value, LeavesOp):
                # ONE LIST OF VALUES PER OUTPUT COLUMN NAME
                num_rows = len(data)
                temp_data = {
                    c.push_column_name: [None] * num_rows
                    for c in cols
                }
                for rownum, d in enumerate(data):
                    for c in cols:
                        temp_data[c.push_column_name][rownum] = d[c.push_name]
                return Data(meta={"format": "cube"},
                            data=temp_data,
                            edges=[{
                                "name": "rownum",
                                "domain": {
                                    "type": "rownum",
                                    "min": 0,
                                    "max": num_rows,
                                    "interval": 1
                                }
                            }])
            else:
                # SINGLE VALUE SELECT: THE DOCS THEMSELVES ARE THE COLUMN VALUES
                num_rows = len(data)
                map_index_to_name = {
                    c.push_column: c.push_column_name
                    for c in cols
                }
                temp_data = [data]

                return Data(meta={"format": "cube"},
                            data={
                                n: temp_data[c]
                                for c, n in map_index_to_name.items()
                            },
                            edges=[{
                                "name": "rownum",
                                "domain": {
                                    "type": "rownum",
                                    "min": 0,
                                    "max": num_rows,
                                    "interval": 1
                                }
                            }])

        elif query.format == "table":
            if is_list(query.select) or is_op(query.select.value, LeavesOp):
                # HEADER IS POSITIONED BY push_column
                column_names = [None] * (max(c.push_column for c in cols) + 1)
                for c in cols:
                    column_names[c.push_column] = c.push_column_name

                temp_data = []
                for d in data:
                    row = [None] * len(column_names)
                    for c in cols:
                        row[c.push_column] = d[c.push_name]
                    temp_data.append(row)

                return Data(meta={"format": "table"},
                            header=column_names,
                            data=temp_data)
            else:
                column_names = listwrap(query.select).name
                return Data(meta={"format": "table"},
                            header=column_names,
                            data=[[d] for d in data])

        else:
            # ANY OTHER FORMAT IS RETURNED AS "list"
            if is_list(query.select) or is_op(query.select.value, LeavesOp):
                temp_data = []
                for d in data:
                    row = {}
                    for c in cols:
                        row[c.push_column_name] = d[c.push_name]
                    temp_data.append(row)
                return Data(meta={"format": "list"}, data=temp_data)
            else:
                return Data(meta={"format": "list"}, data=data)
示例#21
0
    def update(self, command):
        """
        :param command:  EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT
        """
        command = wrap(command)

        # REJECT DEEP UPDATES
        touched_columns = command.set.keys() | set(listwrap(command['clear']))
        for c in self.get_leaves():
            if c.name in touched_columns and c.nested_path and len(
                    c.name) > len(c.nested_path[0]):
                Log.error("Deep update not supported")

        # ADD NEW COLUMNS
        where = jx_expression(command.where)
        _vars = where.vars()
        _map = {
            v: c.es_column
            for v in _vars for c in self.columns.get(v, Null)
            if c.jx_type not in STRUCT
        }
        where_sql = where.map(_map).to_sql(schema)
        new_columns = set(command.set.keys()) - set(self.columns.keys())
        for new_column_name in new_columns:
            nested_value = command.set[new_column_name]
            ctype = get_type(nested_value)
            column = Column(name=new_column_name,
                            jx_type=ctype,
                            es_index=self.facts.snowflake.fact_name,
                            es_type=json_type_to_sqlite_type(ctype),
                            es_column=typed_column(new_column_name, ctype),
                            last_updated=Date.now())
            self.add_column(column)

        # UPDATE THE NESTED VALUES
        for nested_column_name, nested_value in command.set.items():
            if get_type(nested_value) == "nested":
                nested_table_name = concat_field(
                    self.facts.snowflake.fact_name, nested_column_name)
                nested_table = nested_tables[nested_column_name]
                self_primary_key = sql_list(
                    quote_column(c.es_column) for u in self.uid
                    for c in self.columns[u])
                extra_key_name = UID_PREFIX + "id" + text_type(len(self.uid))
                extra_key = [e
                             for e in nested_table.columns[extra_key_name]][0]

                sql_command = ("DELETE" + SQL_FROM +
                               quote_column(nested_table.name) + SQL_WHERE +
                               "EXISTS (" + "\nSELECT 1 " + SQL_FROM +
                               quote_column(nested_table.name) + " n" +
                               SQL_INNER_JOIN + "(" + SQL_SELECT +
                               self_primary_key + SQL_FROM +
                               quote_column(abs_schema.fact) + SQL_WHERE +
                               where_sql + "\n) t ON " +
                               SQL_AND.join("t." + quote_column(c.es_column) +
                                            " = n." + quote_column(c.es_column)
                                            for u in self.uid
                                            for c in self.columns[u]) + ")")
                self.db.execute(sql_command)

                # INSERT NEW RECORDS
                if not nested_value:
                    continue

                doc_collection = {}
                for d in listwrap(nested_value):
                    nested_table.flatten(d,
                                         Data(),
                                         doc_collection,
                                         path=nested_column_name)

                prefix = "INSERT INTO " + quote_column(
                    nested_table.name
                ) + sql_iso(
                    sql_list([self_primary_key] + [quote_column(extra_key)] + [
                        quote_column(c.es_column)
                        for c in doc_collection.get(".", Null).active_columns
                    ]))

                # BUILD THE PARENT TABLES
                parent = (SQL_SELECT + self_primary_key + SQL_FROM +
                          quote_column(abs_schema.fact) + SQL_WHERE +
                          jx_expression(command.where).to_sql(schema))

                # BUILD THE RECORDS
                children = SQL_UNION_ALL.join(
                    SQL_SELECT + quote_value(i) + " " +
                    quote_column(extra_key.es_column) + "," + sql_list(
                        quote_value(row[c.name]) + " " +
                        quote_column(c.es_column)
                        for c in doc_collection.get(".", Null).active_columns)
                    for i, row in enumerate(
                        doc_collection.get(".", Null).rows))

                sql_command = (prefix + SQL_SELECT + sql_list([
                    join_column("p", c.es_column) for u in self.uid
                    for c in self.columns[u]
                ] + [join_column("c", extra_key)] + [
                    join_column("c", c.es_column)
                    for c in doc_collection.get(".", Null).active_columns
                ]) + SQL_FROM + sql_iso(parent) + " p" + SQL_INNER_JOIN +
                               sql_iso(children) + " c" + " ON " + SQL_TRUE)

                self.db.execute(sql_command)

                # THE CHILD COLUMNS COULD HAVE EXPANDED
                # ADD COLUMNS TO SELF
                for n, cs in nested_table.columns.items():
                    for c in cs:
                        column = Column(name=c.name,
                                        jx_type=c.jx_type,
                                        es_type=c.es_type,
                                        es_index=c.es_index,
                                        es_column=c.es_column,
                                        nested_path=[nested_column_name] +
                                        c.nested_path,
                                        last_updated=Date.now())
                        if c.name not in self.columns:
                            self.columns[column.name] = {column}
                        elif c.jx_type not in [
                                c.jx_type for c in self.columns[c.name]
                        ]:
                            self.columns[column.name].add(column)

        command = (
            "UPDATE " + quote_column(abs_schema.fact) + " SET " + sql_list([
                quote_column(c) + "=" + quote_value(get_if_type(v, c.jx_type))
                for k, v in command.set.items() if get_type(v) != "nested"
                for c in self.columns[k]
                if c.jx_type != "nested" and len(c.nested_path) == 1
            ] + [
                quote_column(c) + "=" + SQL_NULL
                for k in listwrap(command['clear']) if k in self.columns
                for c in self.columns[k]
                if c.jx_type != "nested" and len(c.nested_path) == 1
            ]) + SQL_WHERE + where_sql)

        self.db.execute(command)
示例#22
0
    """
    return column.es_index + "|" + column.es_column


META_COLUMNS_DESC = TableDesc(
    name=META_COLUMNS_NAME,
    url=None,
    query_path=ROOT_PATH,
    last_updated=Date.now(),
    columns=to_data(
        [
            Column(
                name=c,
                es_index=META_COLUMNS_NAME,
                es_column=c,
                es_type="keyword",
                jx_type=STRING,
                last_updated=Date.now(),
                nested_path=ROOT_PATH,
                multi=1,
            )
            for c in [
                "name",
                "es_type",
                "jx_type",
                "es_column",
                "es_index",
                "partitions",
            ]
        ]
        + [
            Column(
示例#23
0
    def query_metadata(self, query):
        """
        Answer a query against the column metadata of this container.

        Only one simple filter is accepted: "eq" on `table`, or "eq" on `name`.
        Queries with edges or groupby are rejected through Log.error.

        :param query: the query; its 'from' is replaced with this container
                      before it is wrapped as a QueryOp
        :return: Data in the requested format ("cube", "table", anything else
                 gives "list"); each record has the fields
                 table, name, type and nested_path
        """
        frum, query['from'] = query['from'], self
        schema = self.sf.tables["."].schema
        query = QueryOp.wrap(query, schema)
        columns = self.sf.columns
        where = query.where
        table_name = None
        column_name = None

        if query.edges or query.groupby:
            Log.error("Aggregates(groupby or edge) are not supported")

        if where.op == "eq" and where.lhs.var == "table":
            table_name = mo_json.json2value(where.rhs.json)
        elif where.op == "eq" and where.lhs.var == "name":
            column_name = mo_json.json2value(where.rhs.json)
        else:
            Log.error("Only simple filters are expected like: \"eq\" on table and column name")

        # RELATIVE TABLE NAMES, AND THE MATCHING ABSOLUTE NAMES (SAME ORDER)
        # THE ORIGINAL LOOP ZIPPED OVER AN UNDEFINED NAME `t`; THE RELATIVE
        # NAMES ARE THE ONLY SEQUENCE THAT LINES UP WITH `tables`
        relative_names = list(self.tables.keys())
        tables = [concat_field(self.sf.fact_name, i) for i in relative_names]

        metadata = []
        if columns[-1].es_column != GUID:
            # ENSURE THE GUID COLUMN IS REPORTED
            columns.append(Column(
                name=GUID,
                jx_type=STRING,
                es_column=GUID,
                es_index=self.sf.fact_name,
                nested_path=["."]
            ))

        for tname, table in zip(relative_names, tables):
            if table_name != None and table_name != table:
                continue

            for col in columns:
                cname, ctype = untyped_column(col.es_column)
                if column_name != None and column_name != cname:
                    continue

                metadata.append((table, relative_field(col.name, tname), col.type, unwraplist(col.nested_path)))

        header = ["table", "name", "type", "nested_path"]
        if query.format == "cube":
            num_rows = len(metadata)
            # TRANSPOSE THE ROWS INTO ONE SEQUENCE PER HEADER
            temp_data = dict(zip(header, zip(*metadata)))
            return Data(
                meta={"format": "cube"},
                data=temp_data,
                edges=[{
                    "name": "rownum",
                    "domain": {
                        "type": "rownum",
                        "min": 0,
                        "max": num_rows,
                        "interval": 1
                    }
                }]
            )
        elif query.format == "table":
            return Data(
                meta={"format": "table"},
                header=header,
                data=metadata
            )
        else:
            return Data(
                meta={"format": "list"},
                data=[dict(zip(header, r)) for r in metadata]
            )
示例#24
0
def doc_to_column(doc):
    """
    Build a Column from a stored document.

    :param doc: the document; it is passed through untyped() and then wrap()
    :return: Column constructed with the resulting properties as keyword arguments
    """
    properties = wrap(untyped(doc))
    return Column(**properties)
示例#25
0
def _get_schema_from_list(frum, table_name, parent, nested_path, columns):
    """
    Scan the records of a list and register (or widen) a column for every
    property encountered, descending into objects and nested lists.

    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param parent: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition
    :return: None; `columns` is updated in place
    """

    def _column_for(full_name, sample):
        # RETURN THE KNOWN COLUMN, OR REGISTER A NEW ONE TYPED AFTER sample
        found = columns[full_name]
        if not found:
            found = Column(
                name=concat_field(table_name, full_name),
                es_column=full_name,
                es_index=".",
                es_type=sample.__class__.__name__,
                jx_type=None,  # SET BY THE CALLER, ONCE THE TYPE IS MERGED
                last_updated=Date.now(),
                nested_path=nested_path,
            )
            columns.add(found)
        return found

    for record in frum:
        if python_type_to_json_type[record.__class__] != "object":
            # PRIMITIVE VALUE: IT BELONGS TO THE PARENT PATH ITSELF
            column = _column_for(parent, record)
            column.es_type = _merge_python_type(column.es_type, record.__class__)
            column.jx_type = python_type_to_json_type[column.es_type]
            continue

        for name, value in record.items():
            full_name = concat_field(parent, name)
            column = _column_for(full_name, value)

            if not is_container(value):
                this_type = value.__class__.__name__
            else:
                # MULTIVALUE: MERGE THE TYPES OF ALL THE MEMBERS
                members = list(value)
                if len(members) == 0:
                    this_type = none_type.__name__
                elif len(members) == 1:
                    this_type = members[0].__class__.__name__
                else:
                    this_type = reduce(
                        _merge_python_type, (m.__class__.__name__ for m in value)
                    )

            column.es_type = _merge_python_type(column.es_type, this_type)
            column.jx_type = python_type_to_json_type[column.es_type]

            if this_type in {"object", "dict", "Mapping", "Data"}:
                # SAME NESTING LEVEL, ONE STEP DEEPER IN THE PATH
                _get_schema_from_list(
                    [value], table_name, full_name, nested_path, columns
                )
            elif this_type in {"list", "FlatList"}:
                # A NEW NESTING LEVEL: PUT ITS PATH IN FRONT OF THE OTHERS
                outer = listwrap(nested_path)
                deeper = join_field(split_field(outer[0]) + [name])
                _get_schema_from_list(
                    value, table_name, full_name, unwraplist([deeper] + outer), columns
                )
示例#26
0
    :param column:
    :return: Elasticsearch id for column
    """
    return column.es_index + "|" + column.es_column


META_COLUMNS_DESC = TableDesc(name=META_COLUMNS_NAME,
                              url=None,
                              query_path=ROOT_PATH,
                              last_updated=Date.now(),
                              columns=wrap([
                                  Column(
                                      name=c,
                                      es_index=META_COLUMNS_NAME,
                                      es_column=c,
                                      es_type="keyword",
                                      jx_type=STRING,
                                      last_updated=Date.now(),
                                      nested_path=ROOT_PATH,
                                  ) for c in [
                                      "name",
                                      "es_type",
                                      "jx_type",
                                      "nested_path",
                                      "es_column",
                                      "es_index",
                                      "partitions",
                                  ]
                              ] + [
                                  Column(
                                      name=c,
示例#27
0
文件: meta.py 项目: yoyogias2011/TUID
                if this_type == "object":
                    _get_schema_from_list([value], table_name, full_name,
                                          nested_path, columns)
                elif this_type == "nested":
                    np = listwrap(nested_path)
                    newpath = unwraplist(
                        [join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(value, table_name, full_name,
                                          newpath, columns)


METADATA_COLUMNS = ([
    Column(names={".": c},
           es_index="meta.columns",
           es_column=c,
           es_type="string",
           nested_path=ROOT_PATH)
    for c in ["es_type", "jx_type", "nested_path", "es_column", "es_index"]
] + [
    Column(es_index="meta.columns",
           names={".": c},
           es_column=c,
           es_type="object",
           nested_path=ROOT_PATH) for c in ["names", "partitions"]
] + [
    Column(names={".": c},
           es_index="meta.columns",
           es_column=c,
           es_type="long",
           nested_path=ROOT_PATH) for c in ["count", "cardinality", "multi"]
示例#28
0
def _get_schema_from_list(
    frum,  # The list
    table_name,  # Name of the table this list holds records for
    parent,  # parent path
    nested_path,  # each nested array, in reverse order
    columns,  # map from full name to column definition
    native_type_to_json_type  # dict from storage type name to json type name
):
    """
    Scan the records in `frum` and register (or widen) a Column in `columns`
    for every property found, recursing into objects and nested lists.

    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param parent: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition
    :param native_type_to_json_type: dict from storage type name to json type name
    :return: None; `columns` is updated in place
    """
    for d in frum:
        # NOTE(review): THE ROW TYPE IS LOOKED UP IN THE MODULE-LEVEL
        # python_type_to_json_type, NOT IN native_type_to_json_type - CONFIRM
        row_type = python_type_to_json_type[d.__class__]

        if row_type != "object":
            # EXPECTING PRIMITIVE VALUE
            full_name = parent
            column = columns[full_name]
            if not column:
                column = Column(
                    name=concat_field(table_name, full_name),
                    es_column=full_name,
                    es_index=".",
                    es_type=d.__class__.__name__,
                    jx_type=None,  # WILL BE SET BELOW
                    last_updated=Date.now(),
                    nested_path=nested_path,
                )
                columns.add(column)
            column.es_type = _merge_python_type(column.es_type, d.__class__)
            column.jx_type = native_type_to_json_type[column.es_type]
        else:
            for name, value in d.items():
                full_name = concat_field(parent, name)
                column = columns[full_name]
                if not column:
                    column = Column(
                        name=concat_field(table_name, full_name),
                        es_column=full_name,
                        es_index=".",
                        es_type=value.__class__.__name__,
                        jx_type=None,  # WILL BE SET BELOW
                        last_updated=Date.now(),
                        nested_path=nested_path,
                    )
                    columns.add(column)
                if is_container(value):  # GET TYPE OF MULTIVALUE
                    v = list(value)
                    if len(v) == 0:
                        this_type = none_type.__name__
                    elif len(v) == 1:
                        this_type = v[0].__class__.__name__
                    else:
                        # ITERATE THE MATERIALIZED LIST, NOT value A SECOND TIME
                        this_type = reduce(_merge_python_type,
                                           (vi.__class__.__name__
                                            for vi in v))
                else:
                    this_type = value.__class__.__name__
                column.es_type = _merge_python_type(column.es_type, this_type)
                column.jx_type = native_type_to_json_type[column.es_type]

                if this_type in {"object", "dict", "Mapping", "Data"}:
                    _get_schema_from_list([value], table_name, full_name,
                                          nested_path, columns,
                                          native_type_to_json_type)
                elif this_type in {"list", "FlatList"}:
                    # A NEW NESTING LEVEL: PUT ITS PATH IN FRONT OF THE OTHERS
                    np = listwrap(nested_path)
                    newpath = unwraplist(
                        [join_field(split_field(np[0]) + [name])] + np)
                    # THE TYPE MAP IS A REQUIRED ARGUMENT; WITHOUT IT THE
                    # RECURSION FAILED WITH TypeError ON ANY NESTED LIST
                    _get_schema_from_list(value, table_name, full_name,
                                          newpath, columns,
                                          native_type_to_json_type)
    def __init__(
            self,
            host,
            index,  # THE NAME OF THE SNOWFLAKE (IF WRITING)
            alias=None,  # THE NAME OF THE SNOWFLAKE (FOR READING)
            type=None,
            name=None,  # THE FULL NAME OF THE TABLE (THE NESTED PATH INTO THE SNOWFLAKE)
            port=9200,
            read_only=True,
            timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
            wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
            typed=None,
            kwargs=None):
        """
        Set up the container: resolve the alias, open the index (or alias),
        decide whether the index is typed, and register the existence columns.

        Most parameters are not read directly here; the settings are taken
        from `kwargs`, which is stored as self.settings and handed to the
        metadata and elasticsearch objects.
        NOTE(review): presumably a decorator fills `kwargs` from the other
        arguments - it is not visible in this block, confirm.

        :param read_only: True to open an Alias, False to open the index
                          through a Cluster
        :param typed: None to detect typed mode from the columns; otherwise
                      it must match what is detected (Log.error on mismatch)
        :param kwargs: all the settings
        """
        Container.__init__(self)
        # THE FIRST CONTAINER CREATED BECOMES THE DEFAULT CONFIGURATION
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.edges = Data()  # SET EARLY, SO OTHER PROCESSES CAN REQUEST IT
        self.worker = None
        self.settings = kwargs
        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        # FIRST OF alias, index, name THAT IS GIVEN, RESOLVED BY THE METADATA
        self.name = name = self._namespace._find_alias(
            coalesce(alias, index, name))
        if read_only:
            self.es = elasticsearch.Alias(alias=name,
                                          index=None,
                                          kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(
                read_only=read_only, kwargs=kwargs)

        self._ensure_max_result_window_set(name)
        self.settings.type = self.es.settings.type
        self.stats = QueryStats(self.es.cluster)

        # NOTE(review): self.snowflake IS NOT ASSIGNED IN THIS METHOD;
        # PRESUMABLY A PROPERTY OF THE CLASS - CONFIRM
        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        # THE INDEX IS TYPED WHEN ANY COLUMN IS THE EXISTS_TYPE MARKER
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error(
                    "Expecting given typed {{typed}} to match {{is_typed}}",
                    typed=typed,
                    is_typed=is_typed)
            self.typed = typed

        # NOTE(review): THIS TESTS THE typed ARGUMENT, NOT self.typed, SO IT
        # ALSO RUNS WHEN typed IS None AND THE INDEX WAS DETECTED AS TYPED
        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {'.': None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                # WALK UP THE TREE FROM v TO THE ROOT, DEEPEST PATH FIRST
                if v == '.':
                    return ('.', )
                return (v, ) + nested_path_of(all_paths[v])

            # EVERY STEP OF EVERY QUERY PATH, SHORTEST FIRST, SO PARENTS ARE
            # IN all_paths BEFORE THEIR CHILDREN
            query_paths = sort_using_key(set(
                step for path in self.snowflake.query_paths for step in path),
                                         key=lambda p: len(split_field(p)))
            for step in query_paths:
                if step in all_paths:
                    continue
                else:
                    # THE PARENT IS THE DEEPEST KNOWN PATH THAT PREFIXES step
                    best = '.'
                    for candidate in all_paths.keys():
                        if startswith_field(step, candidate):
                            if startswith_field(candidate, best):
                                best = candidate
                    all_paths[step] = best
            for p in all_paths.keys():
                if p == ".":
                    nested_path = ('.', )
                else:
                    # DROP THE FIRST ENTRY (p ITSELF); KEEP THE ANCESTORS
                    nested_path = nested_path_of(p)[1:]

                jx_type = (OBJECT if p == "." else NESTED)
                # ONE EXISTENCE COLUMN PER PATH: OBJECT FOR THE ROOT, NESTED OTHERWISE
                self.namespace.meta.columns.add(
                    Column(name=p,
                           es_column=p,
                           es_index=self.name,
                           es_type=jx_type,
                           jx_type=jx_type,
                           cardinality=1,
                           nested_path=nested_path,
                           multi=1001 if jx_type is NESTED else 1,
                           last_updated=Date.now()))
示例#30
0
def doc_to_column(doc):
    """
    Build a Column from a stored document, supplying a default `last_updated`.

    :param doc: the document; it is passed through untyped() first
    :return: Column made from the document, with last_updated defaulted by
             set_default() to one YEAR before now
    """
    plain = untyped(doc)
    properties = set_default(plain, {"last_updated": Date.now() - YEAR})
    return Column(**wrap(properties))
示例#31
0
        def _flatten(data,
                     uid,
                     parent_id,
                     order,
                     full_path,
                     nested_path,
                     row=None,
                     guid=None):
            """
            Pull `data` apart into rows, one collection of rows per nested
            path, creating columns (and recording the required schema
            changes) for any property not seen before.

            Relies on doc_collection, snowflake and required_changes from the
            enclosing scope.

            :param data: the data we are pulling apart
            :param uid: the uid we are giving this doc
            :param parent_id: the parent id of this (sub)doc
            :param order: the number of siblings before this one
            :param full_path: path to this (sub)doc
            :param nested_path: list of paths, deepest first
            :param row: we will be filling this
            :param guid: stored under GUID when a new row is started
            :return:
            """
            # NAME OF THE TABLE FOR THIS NESTING LEVEL; USED AS es_index OF NEW COLUMNS
            table = concat_field(self.name, nested_path[0])
            insertion = doc_collection[nested_path[0]]
            if not row:
                # NO ROW GIVEN: START A NEW ONE FOR THIS (SUB)DOC
                row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order}
                insertion.rows.append(row)

            if isinstance(data, Mapping):
                # ONE ITEM PER LEAF, NAMED RELATIVE TO full_path
                items = ((concat_field(full_path, k), v)
                         for k, v in wrap(data).leaves())
            else:
                # PRIMITIVE VALUES
                items = [(full_path, data)]

            for cname, v in items:
                value_type = get_type(v)
                if value_type is None:
                    continue

                # FIND THE EXISTING COLUMN FOR THIS NAME AND TYPE, IF ANY
                if value_type == NESTED:
                    c = unwraplist([
                        cc for cc in snowflake.columns if cc.jx_type in STRUCT
                        and untyped_column(cc.name) == cname
                    ])
                else:
                    c = unwraplist([
                        cc for cc in snowflake.columns
                        if cc.jx_type == value_type and cc.name == cname
                    ])

                insertion = doc_collection[nested_path[0]]
                if not c:
                    # WHAT IS THE NESTING LEVEL FOR THIS PATH?
                    # NOTE(review): deeper_nested_path IS NOT READ AGAIN IN THIS
                    # BRANCH, AND len() COMPARES THE INITIAL "." WITH A path - CONFIRM INTENT
                    deeper_nested_path = "."
                    for path in snowflake.query_paths:
                        if startswith_field(cname, path[0]) and len(
                                deeper_nested_path) < len(path):
                            deeper_nested_path = path

                    # NEW COLUMN AT THE CURRENT NESTING LEVEL
                    c = Column(name=cname,
                               jx_type=value_type,
                               es_type=json_type_to_sqlite_type.get(
                                   value_type, value_type),
                               es_column=typed_column(
                                   cname,
                                   json_type_to_sql_type.get(value_type)),
                               es_index=table,
                               nested_path=nested_path,
                               last_updated=Date.now())
                    if value_type == "nested":
                        snowflake.query_paths.append(c.es_column)
                        required_changes.append({'nest': (c, nested_path)})
                    else:
                        snowflake.columns.append(c)
                        required_changes.append({"add": c})

                        # INSIDE IF BLOCK BECAUSE WE DO NOT WANT IT TO ADD WHAT WE columns.get() ALREADY
                        insertion.active_columns.add(c)
                elif c.jx_type == "nested" and value_type == "object":
                    # COLUMN IS ALREADY NESTED: TREAT THE SINGLE OBJECT AS A ONE-ELEMENT LIST
                    value_type = "nested"
                    v = [v]
                elif len(c.nested_path) < len(nested_path):
                    # EXISTING COLUMN IS SHALLOWER THAN THIS VALUE: REPLACE IT WITH A DEEPER ONE
                    from_doc = doc_collection.get(c.nested_path[0], None)
                    column = c.es_column
                    from_doc.active_columns.remove(c)
                    snowflake._remove_column(c)
                    required_changes.append({"nest": (c, nested_path)})
                    deep_c = Column(name=cname,
                                    jx_type=value_type,
                                    es_type=json_type_to_sqlite_type.get(
                                        value_type, value_type),
                                    es_column=typed_column(
                                        cname,
                                        json_type_to_sql_type.get(value_type)),
                                    es_index=table,
                                    nested_path=nested_path,
                                    last_updated=Date.now())
                    snowflake._add_column(deep_c)
                    snowflake._drop_column(c)
                    # NOTE(review): c WAS ALREADY REMOVED FROM active_columns ABOVE;
                    # IF active_columns IS A set THIS SECOND remove RAISES KeyError - CONFIRM
                    from_doc.active_columns.remove(c)

                    # COPY THE VALUES ALREADY COLLECTED AT THE SHALLOW LEVEL INTO CHILD ROWS
                    for r in from_doc.rows:
                        r1 = unwrap(r)
                        if column in r1:
                            row1 = {
                                UID: self.container.next_uid(),
                                PARENT: r1["__id__"],
                                ORDER: 0,
                                column: r1[column]
                            }
                            insertion.rows.append(row1)
                elif len(c.nested_path) > len(nested_path):
                    # EXISTING COLUMN IS DEEPER THAN THIS VALUE: PUT THE VALUE IN A NEW CHILD ROW
                    insertion = doc_collection[c.nested_path[0]]
                    row = {
                        UID: self.container.next_uid(),
                        PARENT: uid,
                        ORDER: order
                    }
                    insertion.rows.append(row)

                # BE SURE TO NEST VALUES, IF NEEDED
                if value_type == "nested":
                    # MARK EXISTENCE IN THE PARENT ROW, THEN FLATTEN EACH CHILD ONE LEVEL DEEPER
                    row[c.es_column] = "."
                    deeper_nested_path = [cname] + nested_path
                    insertion = doc_collection.get(cname, None)
                    if not insertion:
                        insertion = doc_collection[cname] = Data(
                            active_columns=set(), rows=[])
                    for i, r in enumerate(v):
                        child_uid = self.container.next_uid()
                        _flatten(r, child_uid, uid, i, cname,
                                 deeper_nested_path)
                elif value_type == "object":
                    # SAME NESTING LEVEL: KEEP FILLING THE SAME ROW
                    row[c.es_column] = "."
                    _flatten(v,
                             uid,
                             parent_id,
                             order,
                             cname,
                             nested_path,
                             row=row)
                elif c.jx_type:
                    # PRIMITIVE VALUE: STORE IT AND MARK THE COLUMN AS USED
                    insertion.active_columns.add(c)
                    row[c.es_column] = v