def _build_list_sql(self, db, first, batch_size):
    # TODO: ENSURE THE LAST COLUMN IS THE id
    if first:
        dim = len(self._extract.field)
        where = SQL_OR.join(
            sql_iso(sql_and(
                quote_column(f) + ineq(i, e, dim) + db.quote_value(Date(v) if t == "time" else v)
                for e, (f, v, t) in enumerate(zip(
                    self._extract.field[0:i + 1],
                    first,
                    self._extract.type[0:i + 1]
                ))
            ))
            for i in range(dim)
        )
    else:
        where = SQL_TRUE

    selects = []
    for t, f in zip(self._extract.type, self._extract.field):
        if t == "time":
            selects.append("CAST" + sql_iso(sql_alias(quote_column(f), SQL("DATETIME(6)"))))
        else:
            selects.append(quote_column(f))

    sql = (
        SQL_SELECT + sql_list(selects) +
        SQL_FROM + self.settings.snowflake.fact_table +
        SQL_WHERE + where +
        SQL_ORDERBY + sql_list(quote_column(f) for f in self._extract.field) +
        SQL_LIMIT + db.quote_value(batch_size)
    )
    return sql
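# A sketch of the WHERE clause built above, for hypothetical key fields
# ["a", "id"] and first = (1, 3), assuming ineq() emits "=" for the leading
# terms of each disjunct and an inequality for its final term:
#
#     (a > 1) OR (a = 1 AND id > 3)
#
# This is keyset pagination: each batch resumes the ordered scan strictly
# after the last row returned by the previous batch.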
def _sort2sql(self, sort):
    """
    RETURN ORDER BY CLAUSE
    """
    if not sort:
        return ""
    return SQL_ORDERBY + sql_list([
        quote_column(o.field) + (" DESC" if o.sort == -1 else "")
        for o in sort
    ])
def get_sql(self, get_ids):
    sql = self._compose_sql(get_ids)

    # ORDERING
    sort = []
    ordering = []
    for ci, c in enumerate(self.columns):
        if c.sort:
            sort.append(quote_column(c.column_alias) + SQL_IS_NOT_NULL)
            sort.append(quote_column(c.column_alias))
            ordering.append(ci)

    union_all_sql = SQL_UNION_ALL.join(sql)
    union_all_sql = (
        SQL_SELECT + SQL_STAR +
        SQL_FROM + sql_alias(sql_iso(union_all_sql), quote_column('a')) +
        SQL_ORDERBY + sql_list(sort)
    )
    return union_all_sql
def update(self, query):
    self.db.execute(
        """
        UPDATE {{table_name}}
        SET {{assignment}}
        {{where}}
        """, {
            "table_name": query["from"],
            "assignment": ",".join(
                quote_column(k) + "=" + quote_value(v)
                for k, v in query.set.items()
            ),
            "where": self._where2sql(query.where)
        }
    )
def _subquery(self, query, isolate=True, stacked=False):
    if isinstance(query, text_type):
        return quote_column(query), None
    if query.name:  # IT WOULD BE SAFER TO WRAP TABLE REFERENCES IN A TYPED OBJECT (Cube, MAYBE?)
        return quote_column(query.name), None

    if query.edges:
        # RETURN A CUBE
        sql, post = self._grouped(query, stacked)
    else:
        select = listwrap(query.select)
        if select[0].aggregate != "none":
            sql, post = self._aggop(query)
        else:
            sql, post = self._setop(query)

    if isolate:
        return "(\n" + sql + "\n) a\n", post
    else:
        return sql, post
def extract(self, db, start_point, first_value, data, please_stop):
    Log.note(
        "Starting scan of {{table}} at {{id}} and sending to batch {{start_point}}",
        table=self.settings.snowflake.fact_table,
        id=first_value,
        start_point=start_point
    )

    id = quote_column(self._extract.field.last())
    ids = (
        SQL_SELECT + id +
        SQL_FROM + self.settings.snowflake.fact_table +
        SQL_WHERE + id + " in " + sql_iso(sql_list(map(db.quote_value, data)))
    )
    sql = self.schema.get_sql(ids)

    with Timer("Sending SQL"):
        cursor = db.query(sql, stream=True, row_tuples=True)

    extract = self.settings.extract
    fact_table = self.settings.snowflake.fact_table

    with TempFile() as temp_file:
        parent_etl = None
        for s in start_point:
            parent_etl = {"id": s, "source": parent_etl}
        parent_etl["revision"] = get_git_revision()
        parent_etl["machine"] = machine_metadata

        def append(value, i):
            """
            :param value: THE DOCUMENT TO ADD
            :return: PleaseStop
            """
            temp_file.append(convert.value2json({
                fact_table: elasticsearch.scrub(value),
                "etl": {
                    "id": i,
                    "source": parent_etl,
                    "timestamp": Date.now()
                }
            }))

        with Timer("assemble data"):
            self.construct_docs(cursor, append, please_stop)

        # WRITE TO S3
        s3_file_name = ".".join(map(text_type, start_point))
        with Timer("write to destination {{filename}}", param={"filename": s3_file_name}):
            if not isinstance(self.settings.destination, text_type):
                destination = self.bucket.get_key(s3_file_name, must_exist=False)
                destination.write_lines(temp_file)
            else:
                destination = File(self.settings.destination)
                destination.write(convert.value2json(
                    [convert.json2value(o) for o in temp_file],
                    pretty=True
                ))
                return False

    # NOTIFY SQS
    now = Date.now()
    self.notify.add({
        "bucket": self.settings.destination.bucket,
        "key": s3_file_name,
        "timestamp": now.unix,
        "date/time": now.format()
    })

    # SUCCESS!!
    File(extract.last).write(convert.value2json([start_point, first_value]))
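# For example (hypothetical), start_point = (1, 2) builds the nested ETL
# lineage {"id": 2, "source": {"id": 1, "source": None}}, with the git
# revision and machine metadata attached to the outermost node; every
# document in the batch then carries this provenance in its "etl" property.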
def _compose_sql(self, get_ids):
    """
    :param get_ids: SQL to get the ids, and used to select the documents returned
    :return: ONE SELECT STATEMENT PER NESTED PATH
    """
    sql = []
    for nested_path in self.all_nested_paths:
        # MAKE THE REQUIRED JOINS
        sql_joins = []

        for i, curr_join in enumerate(self.nested_path_to_join[nested_path[0]]):
            curr_join = wrap(curr_join)
            rel = curr_join.join_columns[0]
            if i == 0:
                sql_joins.append(
                    SQL_FROM +
                    sql_alias(sql_iso(get_ids), quote_column(rel.referenced.table.alias))
                )
            elif curr_join.children:
                full_name = quote_column(rel.table.name, rel.table.schema)
                sql_joins.append(
                    SQL_JOIN +
                    sql_alias(full_name, quote_column(rel.table.alias)) +
                    SQL_ON +
                    sql_and(
                        quote_column(const_col.column.name, rel.table.alias) +
                        "=" +
                        quote_column(const_col.referenced.column.name, rel.referenced.table.alias)
                        for const_col in curr_join.join_columns
                    )
                )
            else:
                full_name = quote_column(rel.referenced.table.name, rel.referenced.table.schema)
                sql_joins.append(
                    SQL_LEFT_JOIN +
                    sql_alias(full_name, quote_column(rel.referenced.table.alias)) +
                    SQL_ON +
                    sql_and(
                        quote_column(const_col.referenced.column.name, rel.referenced.table.alias) +
                        "=" +
                        quote_column(const_col.column.name, rel.table.alias)
                        for const_col in curr_join.join_columns
                    )
                )

        # ONLY SELECT WHAT WE NEED, NULL THE REST
        selects = []
        not_null_column_seen = False
        for ci, c in enumerate(self.columns):
            if c.column_alias[1:] != text_type(ci):
                Log.error("expecting consistency")
            if c.nested_path[0] == nested_path[0]:
                s = sql_alias(
                    quote_column(c.column.column.name, c.table_alias),
                    quote_column(c.column_alias)
                )
                if s == None:
                    Log.error("bug")
                selects.append(s)
                not_null_column_seen = True
            elif startswith_field(nested_path[0], c.path):
                # PARENT ID REFERENCES
                if c.column.is_id:
                    s = sql_alias(
                        quote_column(c.column.column.name, c.table_alias),
                        quote_column(c.column_alias)
                    )
                    selects.append(s)
                    not_null_column_seen = True
                else:
                    selects.append(sql_alias(SQL_NULL, quote_column(c.column_alias)))
            else:
                selects.append(sql_alias(SQL_NULL, quote_column(c.column_alias)))

        if not_null_column_seen:
            sql.append(SQL_SELECT + sql_list(selects) + "".join(sql_joins))
    return sql
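# Shape of the full query that get_sql() assembles from these branches
# (hypothetical two-table snowflake; every branch selects the same aliased
# column list c0, c1, ..., with NULL standing in for columns that do not
# belong to that branch's nested path):
#
#     SELECT * FROM (
#         SELECT t0.id AS c0, t0.about AS c1, NULL AS c2 FROM ...              -- fact rows
#         UNION ALL
#         SELECT t0.id AS c0, NULL AS c1, t1.name AS c2 FROM ... LEFT JOIN ... -- nested rows
#     ) a
#     ORDER BY c0 IS NOT NULL, c0
#
# The shared aliases keep the UNION ALL columns aligned, and the ordering
# groups each fact row with its nested rows for document assembly.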
def follow_paths(position, path, nested_path, done_relations, no_nested_docs):
    if position.name in self.settings.exclude:
        return
    if DEBUG:
        Log.note("Trace {{path}}", path=path)
    if position.name != "__ids__":
        # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR IF IT FAILS)
        self.db.query(
            "SELECT * FROM " +
            quote_column(position.name, position.schema) +
            " LIMIT 1"
        )

    if position.name in reference_all_tables:
        no_nested_docs = True
    if position.name in reference_only_tables:
        return

    curr_join_list = copy(nested_path_to_join[nested_path[0]])

    # INNER OBJECTS
    referenced_tables = list(jx.groupby(
        jx.filter(
            relations,
            {"eq": {"table.name": position.name, "table.schema": position.schema}}
        ),
        "constraint.name"
    ))
    for g, constraint_columns in referenced_tables:
        g = unwrap(g)
        constraint_columns = deepcopy(constraint_columns)
        if g["constraint.name"] in done_relations:
            continue
        if any(cc for cc in constraint_columns if cc.referenced.table.name in self.settings.exclude):
            continue
        done_relations.add(g["constraint.name"])

        many_to_one_joins = nested_path_to_join[nested_path[0]]
        index = len(many_to_one_joins)
        alias = "t" + text_type(index)
        for c in constraint_columns:
            c.referenced.table.alias = alias
            c.table = position
        many_to_one_joins.append({
            "join_columns": constraint_columns,
            "path": path,
            "nested_path": nested_path
        })

        # referenced_table_path = join_field(split_field(path) + ["/".join(constraint_columns.referenced.table.name)])
        # HANDLE THE COMMON *id SUFFIX
        name = []
        for a, b in zip(constraint_columns.column.name, constraint_columns.referenced.table.name):
            if a.startswith(b):
                name.append(b)
            elif a.endswith("_id"):
                name.append(a[:-3])
            else:
                name.append(a)
        referenced_column_path = join_field(split_field(path) + ["/".join(name)])
        col_pointer_name = relative_field(referenced_column_path, nested_path[0])

        # insert into nested1 VALUES (100, 10, 'aaa', -1);
        # id.about.time.nested1 .ref=10
        # id.about.time.nested1 .ref.name
        for col in columns:
            if col.table.name == constraint_columns[0].referenced.table.name and col.table.schema == constraint_columns[0].referenced.table.schema:
                col_full_name = concat_field(col_pointer_name, literal_field(col.column.name))

                if col.is_id and col.table.name == fact_table.name and col.table.schema == fact_table.schema:
                    # ALWAYS SHOW THE ID OF THE FACT
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": True,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name
                    })
                elif col.column.name == constraint_columns[0].column.name:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name if self.settings.show_foreign_keys else None
                    })
                elif col.is_id:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name if self.settings.show_foreign_keys else None
                    })
                elif col.reference:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_pointer_name if not self.settings.show_foreign_keys else col_full_name  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                    })
                elif col.include:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name
                    })

        if position.name in reference_only_tables:
            continue

        todo.append(Data(
            position=copy(constraint_columns[0].referenced.table),
            path=referenced_column_path,
            nested_path=nested_path,
            done_relations=copy(done_relations),
            no_nested_docs=no_nested_docs
        ))

    # NESTED OBJECTS
    if not no_nested_docs:
        for g, constraint_columns in jx.groupby(
            jx.filter(
                relations,
                {"eq": {"referenced.table.name": position.name, "referenced.table.schema": position.schema}}
            ),
            "constraint.name"
        ):
            g = unwrap(g)
            constraint_columns = deepcopy(constraint_columns)
            if g["constraint.name"] in done_relations:
                continue
            done_relations.add(g["constraint.name"])

            many_table = set(constraint_columns.table.name)
            if not (many_table - self.settings.exclude):
                continue

            referenced_column_path = join_field(split_field(path) + ["/".join(many_table)])
            new_nested_path = [referenced_column_path] + nested_path
            all_nested_paths.append(new_nested_path)

            # if new_path not in self.settings.include:
            #     Log.note("Exclude nested path {{path}}", path=new_path)
            #     continue
            one_to_many_joins = nested_path_to_join[referenced_column_path] = copy(curr_join_list)
            index = len(one_to_many_joins)
            alias = "t" + text_type(index)
            for c in constraint_columns:
                c.table.alias = alias
                c.referenced.table = position
            one_to_many_joins.append(set_default({}, g, {
                "children": True,
                "join_columns": constraint_columns,
                "path": path,
                "nested_path": nested_path
            }))

            # insert into nested1 VALUES (100, 10, 'aaa', -1);
            # id.about.time.nested1 .ref=10
            # id.about.time.nested1 .ref.name
            for col in columns:
                if col.table.name == constraint_columns[0].table.name and col.table.schema == constraint_columns[0].table.schema:
                    col_full_name = join_field(
                        split_field(referenced_column_path)[len(split_field(new_nested_path[0])):] +
                        [literal_field(col.column.name)]
                    )

                    if col.column.name == constraint_columns[0].column.name:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    elif col.is_id:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    else:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name if col.include else None
                        })

            todo.append(Data(
                position=constraint_columns[0].table,
                path=referenced_column_path,
                nested_path=new_nested_path,
                done_relations=copy(done_relations),
                no_nested_docs=no_nested_docs
            ))
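# NOTE: each `todo` entry mirrors follow_paths()' parameter list; a driver
# loop (outside this function) presumably pops each Data and calls
# follow_paths() again, giving a breadth-first walk of the relational graph
# out from the fact table.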
def _esfilter2sqlwhere(db, esfilter):
    """
    CONVERT ElasticSearch FILTER TO SQL FILTER
    db - REQUIRED TO PROPERLY QUOTE VALUES AND COLUMN NAMES
    """
    esfilter = wrap(esfilter)

    if esfilter is True:
        return SQL_TRUE
    elif esfilter["and"]:
        return sql_iso(SQL_AND.join([esfilter2sqlwhere(db, a) for a in esfilter["and"]]))
    elif esfilter["or"]:
        return sql_iso(SQL_OR.join([esfilter2sqlwhere(db, a) for a in esfilter["or"]]))
    elif esfilter["not"]:
        return SQL_NOT + sql_iso(esfilter2sqlwhere(db, esfilter["not"]))
    elif esfilter.term:
        return sql_iso(SQL_AND.join([
            quote_column(col) + SQL("=") + quote_value(val)
            for col, val in esfilter.term.items()
        ]))
    elif esfilter.terms:
        for col, v in esfilter.terms.items():
            if len(v) == 0:
                return "FALSE"
            try:
                int_list = convert.value2intlist(v)
                has_null = False
                for vv in v:
                    if vv == None:
                        has_null = True
                        break
                if int_list:
                    filter = int_list_packer(col, int_list)
                    if has_null:
                        return esfilter2sqlwhere(db, {"or": [{"missing": col}, filter]})
                    elif 'terms' in filter and set(filter['terms'].get(col, [])) == set(int_list):
                        return quote_column(col) + " in " + quote_list(int_list)
                    else:
                        return esfilter2sqlwhere(db, filter)
                else:
                    if has_null:
                        return esfilter2sqlwhere(db, {"missing": col})
                    else:
                        return "false"
            except Exception as e:
                e = Except.wrap(e)
                pass
            return quote_column(col) + " in " + quote_list(v)
    elif esfilter.script:
        return sql_iso(esfilter.script)
    elif esfilter.range:
        name2sign = {
            "gt": SQL(">"),
            "gte": SQL(">="),
            "lte": SQL("<="),
            "lt": SQL("<")
        }

        def single(col, r):
            min = coalesce(r["gte"], r[">="])
            max = coalesce(r["lte"], r["<="])
            if min != None and max != None:
                # SPECIAL CASE (BETWEEN)
                sql = quote_column(col) + SQL(" BETWEEN ") + quote_value(min) + SQL_AND + quote_value(max)
            else:
                sql = SQL_AND.join(
                    quote_column(col) + name2sign[sign] + quote_value(value)
                    for sign, value in r.items()
                )
            return sql

        terms = [single(col, ranges) for col, ranges in esfilter.range.items()]
        if len(terms) == 1:
            output = terms[0]
        else:
            output = sql_iso(SQL_AND.join(terms))
        return output
    elif esfilter.missing:
        if isinstance(esfilter.missing, text_type):
            return sql_iso(quote_column(esfilter.missing) + SQL_IS_NULL)
        else:
            return sql_iso(quote_column(esfilter.missing.field) + SQL_IS_NULL)
    elif esfilter.exists:
        if isinstance(esfilter.exists, text_type):
            return sql_iso(quote_column(esfilter.exists) + SQL_IS_NOT_NULL)
        else:
            return sql_iso(quote_column(esfilter.exists.field) + SQL_IS_NOT_NULL)
    elif esfilter.match_all:
        return SQL_TRUE
    elif esfilter.instr:
        return sql_iso(SQL_AND.join([
            "instr" + sql_iso(quote_column(col) + ", " + quote_value(val)) + ">0"
            for col, val in esfilter.instr.items()
        ]))
    else:
        Log.error("Can not convert esfilter to SQL: {{esfilter}}", esfilter=esfilter)
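# A few sample translations (hypothetical column names), to show the mapping:
#
#     {"term": {"a": 1}}                        ->  (a=1)
#     {"and": [{"term": {"a": 1}},
#              {"exists": "b"}]}                ->  ((a=1) AND (b IS NOT NULL))
#     {"range": {"x": {"gte": 1, "lte": 10}}}   ->  x BETWEEN 1 AND 10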
def _setop(self, query):
    """
    NO AGGREGATION, SIMPLE LIST COMPREHENSION
    """
    if isinstance(query.select, list):
        # RETURN BORING RESULT SET
        selects = FlatList()
        for s in listwrap(query.select):
            if isinstance(s.value, Mapping):
                for k, v in s.value.items():
                    selects.append(sql_alias(v, quote_column(s.name + "." + k)))
            elif isinstance(s.value, list):
                for i, ss in enumerate(s.value):
                    selects.append(sql_alias(ss, quote_column(s.name + "," + str(i))))
            else:
                selects.append(sql_alias(s.value, quote_column(s.name)))

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            {{sort}}
            {{limit}}
        """, {
            "selects": SQL(",\n".join(selects)),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where),
            "limit": self._limit2sql(query.limit),
            "sort": self._sort2sql(query.sort)
        })

        def post_process(sql):
            result = self.db.query(sql)
            for s in listwrap(query.select):
                if isinstance(s.value, Mapping):
                    for r in result:
                        r[s.name] = {}
                        for k, v in s.value.items():
                            r[s.name][k] = r[s.name + "." + k]
                            r[s.name + "." + k] = None
                elif isinstance(s.value, list):
                    # REWRITE AS TUPLE
                    for r in result:
                        r[s.name] = tuple(r[s.name + "," + str(i)] for i, ss in enumerate(s.value))
                        for i, ss in enumerate(s.value):
                            r[s.name + "," + str(i)] = None
            expand_json(result)
            return result

        return sql, post_process  # RETURN BORING RESULT SET
    else:
        # RETURN LIST OF VALUES
        if query.select.value == ".":
            select = "*"
        else:
            name = query.select.name
            select = sql_alias(query.select.value, quote_column(name))

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            {{sort}}
            {{limit}}
        """, {
            "selects": SQL(select),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where),
            "limit": self._limit2sql(query.limit),
            "sort": self._sort2sql(query.sort)
        })

        if query.select.value == ".":
            def post(sql):
                result = self.db.query(sql)
                expand_json(result)
                return result

            return sql, post
        else:
            return sql, lambda sql: [r[name] for r in self.db.query(sql)]  # RETURNING LIST OF VALUES
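# Sketch of the flattening contract above (hypothetical select clauses):
#
#     {"name": "p", "value": {"x": "a", "y": "b"}}  ->  columns `p.x`, `p.y`,
#         reassembled by post_process() into r["p"] = {"x": ..., "y": ...}
#     {"name": "q", "value": ["a", "b"]}            ->  columns `q,0`, `q,1`,
#         reassembled into the tuple r["q"] = (..., ...)
#
# In both cases the scratch columns are nulled out after reassembly.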
def _aggop(self, query):
    """
    SINGLE ROW RETURNED WITH AGGREGATES
    """
    if isinstance(query.select, list):
        # RETURN SINGLE OBJECT WITH AGGREGATES
        for s in query.select:
            if s.aggregate not in aggregates:
                Log.error("Expecting all columns to have an aggregate: {{select}}", select=s)

        selects = FlatList()
        for s in query.select:
            selects.append(sql_alias(aggregates[s.aggregate].replace("{{code}}", s.value), quote_column(s.name)))

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
        """, {
            "selects": SQL(",\n".join(selects)),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where)
        })

        return sql, lambda sql: self.db.column(sql)[0]  # RETURNING SINGLE OBJECT WITH AGGREGATE VALUES
    else:
        # RETURN SINGLE VALUE
        s0 = query.select
        if s0.aggregate not in aggregates:
            Log.error("Expecting all columns to have an aggregate: {{select}}", select=s0)

        select = sql_alias(aggregates[s0.aggregate].replace("{{code}}", s0.value), quote_column(s0.name))

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
        """, {
            "selects": SQL(select),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where)
        })

        def post(sql):
            result = self.db.column_query(sql)
            return result[0][0]

        return sql, post  # RETURN SINGLE VALUE
def _grouped(self, query, stacked=False):
    select = listwrap(query.select)

    # RETURN SINGLE OBJECT WITH AGGREGATES
    for s in select:
        if s.aggregate not in aggregates:
            Log.error("Expecting all columns to have an aggregate: {{select}}", select=s)

    selects = FlatList()
    groups = FlatList()
    edges = query.edges
    for e in edges:
        if e.domain.type != "default":
            Log.error("domain of type {{type}} not supported, yet", type=e.domain.type)
        groups.append(e.value)
        selects.append(sql_alias(e.value, quote_column(e.name)))

    for s in select:
        selects.append(sql_alias(aggregates[s.aggregate].replace("{{code}}", s.value), quote_column(s.name)))

    sql = expand_template("""
        SELECT
            {{selects}}
        FROM
            {{table}}
        {{where}}
        GROUP BY
            {{groups}}
    """, {
        "selects": SQL(",\n".join(selects)),
        "groups": SQL(",\n".join(groups)),
        "table": self._subquery(query["from"])[0],
        "where": self._where2sql(query.where)
    })

    def post_stacked(sql):
        # RETURN IN THE USUAL DATABASE RESULT SET FORMAT
        return self.db.query(sql)

    def post(sql):
        # FIND OUT THE default DOMAIN SIZES
        result = self.db.column_query(sql)
        num_edges = len(edges)
        for e, edge in enumerate(edges):
            domain = edge.domain
            if domain.type == "default":
                domain.type = "set"
                parts = set(result[e])
                domain.partitions = [{"index": i, "value": p} for i, p in enumerate(parts)]
                domain.map = {p: i for i, p in enumerate(parts)}
            else:
                Log.error("Do not know what to do here, yet")

        # FILL THE DATA CUBE
        maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)]
        cubes = FlatList()
        for c, s in enumerate(select):
            data = Matrix(*[len(e.domain.partitions) + (1 if e.allow_nulls else 0) for e in edges])
            for rownum, value in enumerate(result[c + num_edges]):
                coord = [m[r[rownum]] for m, r in maps]
                data[coord] = value
            cubes.append(data)

        if isinstance(query.select, list):
            return cubes
        else:
            return cubes[0]

    return sql, post if not stacked else post_stacked
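# Sketch of the SQL this emits for a hypothetical query
# {"from": "t", "edges": [{"name": "x", "value": "x"}],
#  "select": {"name": "total", "value": "y", "aggregate": "sum"}},
# assuming the aggregates table maps "sum" to "SUM({{code}})":
#
#     SELECT x AS `x`, SUM(y) AS `total`
#     FROM t
#     GROUP BY x
#
# post() then discovers the distinct x values from the result columns, turns
# each "default" domain into a "set" domain, and packs the aggregate values
# into a Matrix cube indexed by those partitions.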