def fix(source_key, rownum, line, source, sample_only_filter, sample_size):
    """
    Prepare one JSON line for indexing.

    :param source_key: key of the source block (passed through to _shorten for logging)
    :param rownum: zero-based row number within the source
    :param line: raw JSON text for one record
    :param source: source descriptor (used by _shorten)
    :param sample_only_filter: optional jx filter; when the first row matches, only that row is kept
    :param sample_size: sampling fraction denominator (default 0.01 when None)
    :return: (row, no_more_data) TUPLE WHERE row IS {"value":<data structure>} OR {"json":<text line>}
    """
    value = json2value(line)

    if rownum == 0:
        # first row: always normalize, and possibly stop after it (sampling)
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(source_key, value, source)
        value = _fix(value)
        if sample_only_filter and Random.int(
                int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter(
                    [value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            row = {"value": value}
            # True -> caller stops reading; we keep only this sampled row
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        # oversized record: trim before normalizing
        _shorten(source_key, value, source)
        value = _fix(value)
    elif '"resource_usage":' in line:
        # records with resource_usage need normalization too
        value = _fix(value)

    row = {"value": value}
    return row, False
def update(self, command):
    """
    Apply a jx-style update command to the column metadata.

    ``command.where`` selects columns; ``command.clear`` lists properties to
    blank (the special name "." removes the column entirely);
    ``command.set`` assigns new property values.
    """
    try:
        command = wrap(command)
        eq = command.where.eq
        if eq.es_index:
            # fast path: restrict candidates by index/name, then match remaining eq terms
            columns = self.find(eq.es_index, eq.name)
            columns = [
                c for c in columns
                if all(get_attr(c, k) == v for k, v in eq.items())
            ]
        else:
            # general path: filter the full column list
            columns = list(self)
            columns = jx.filter(columns, command.where)

        # iterate a copy so columns.remove() below is safe
        for col in list(columns):
            for k in command["clear"]:
                if k == ".":
                    columns.remove(col)
                else:
                    col[k] = None

            for k, v in command.set.items():
                col[k] = v
    except Exception as e:
        Log.error("should not happen", cause=e)
def filter(self, where):
    """
    Filter the rows of this cube with the given jx `where` clause.

    Only implemented for a one-dimensional cube whose single edge has an
    "index" domain; any other shape is reported as unexpected.
    """
    if len(self.edges) == 1 and self.edges[0].domain.type == "index":
        # USE THE STANDARD LIST FILTER
        from jx_python import jx

        # dict.values() is a non-subscriptable view on Python 3;
        # materialize before indexing (also works on Python 2)
        return jx.filter(list(self.data.values())[0].cube, where)
    else:
        # FILTER DOES NOT ALTER DIMENSIONS, JUST WHETHER THERE ARE VALUES IN THE CELLS
        Log.unexpected("Incomplete")
def filter(self, where):
    """
    Filter the rows of this cube with the given jx `where` clause.

    Only implemented for a one-dimensional cube whose single edge has an
    "index" domain; any other shape is reported as unexpected.
    """
    if len(self.edges) == 1 and self.edges[0].domain.type == "index":
        # USE THE STANDARD LIST FILTER
        from jx_python import jx

        # dict.values() is a non-subscriptable view on Python 3;
        # materialize before indexing (also works on Python 2)
        return jx.filter(list(self.data.values())[0].cube, where)
    else:
        # FILTER DOES NOT ALTER DIMENSIONS, JUST WHETHER THERE ARE VALUES IN THE CELLS
        Log.unexpected("Incomplete")
def fix(rownum, line, source, sample_only_filter, sample_size):
    """
    Prepare one JSON line for indexing into ES.

    :param rownum: zero-based row number within the source
    :param line: raw JSON text for one record
    :param source: source descriptor; source.name selects special handling
    :param sample_only_filter: optional jx filter; when the first row matches, only that row is kept
    :param sample_size: sampling fraction denominator (default 0.01 when None)
    :return: (row, no_more_data) tuple; row is {"id":..., "value":...} or {"id":..., "json":...}
    """
    # ES SCHEMA IS STRICTLY TYPED, USE "code" FOR TEXT IDS
    line = line.replace('{"id": "bb"}', '{"code": "bb"}').replace(
        '{"id": "tc"}', '{"code": "tc"}')

    # ES SCHEMA IS STRICTLY TYPED, THE SUITE OBJECT CAN NOT BE HANDLED
    if source.name.startswith("active-data-test-result"):
        # "suite": {"flavor": "plain-chunked", "name": "mochitest"}
        found = strings.between(line, '"suite": {', '}')
        if found:
            suite_json = '{' + found + "}"
            if suite_json:
                # replace the suite object with a single string (fullname preferred)
                suite = mo_json.json2value(suite_json)
                suite = convert.value2json(coalesce(suite.fullname, suite.name))
                line = line.replace(suite_json, suite)

    if source.name.startswith("active-data-codecoverage"):
        # keep only records with actual coverage; skip the rest
        d = convert.json2value(line)
        if d.source.file.total_covered > 0:
            return {"id": d._id, "json": line}, False
        else:
            return None, False

    if rownum == 0:
        # first row: always decode and normalize; possibly stop after it (sampling)
        value = mo_json.json2value(line)
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
        if sample_only_filter and Random.int(
                int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter(
                    [value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            # True -> caller stops reading; we keep only this sampled row
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        # oversized record: decode, trim, normalize
        value = mo_json.json2value(line)
        _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    elif line.find('"resource_usage":') != -1:
        value = mo_json.json2value(line)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    else:
        # FAST
        _id = strings.between(line, "\"_id\": \"", "\"")  # AVOID DECODING JSON
        row = {"id": _id, "json": line}

    return row, False
def update(self, command):
    """
    Apply a jx-style update command to the column store.

    ``command.where`` selects columns (with fast paths for eq-on-es_index);
    ``command.clear`` lists properties to blank ("." removes the column);
    ``command.set`` assigns new property values.  Mutations of self.data are
    guarded by self.locker.
    """
    self.dirty = True
    try:
        command = wrap(command)
        eq = command.where.eq
        if eq.es_index:
            all_columns = self.data.get(eq.es_index, {}).values()
            if len(eq) == 1:
                # FASTEST
                with self.locker:
                    columns = [c for cs in all_columns for c in cs]
            elif eq.es_column and len(eq) == 2:
                # FASTER
                with self.locker:
                    columns = [
                        c for cs in all_columns for c in cs
                        if c.es_column == eq.es_column
                    ]
            else:
                # SLOWER
                with self.locker:
                    columns = [
                        c for cs in all_columns for c in cs
                        if all(c[k] == v
                               for k, v in eq.items())  # THIS LINE IS VERY SLOW
                    ]
        else:
            columns = list(self)
            columns = jx.filter(columns, command.where)

        with self.locker:
            for col in columns:
                for k in command["clear"]:
                    if k == ".":
                        # remove the column entirely; prune empty containers
                        lst = self.data[col.es_index]
                        cols = lst[col.names['.']]
                        cols.remove(col)
                        if len(cols) == 0:
                            del lst[col.names['.']]
                            if len(lst) == 0:
                                del self.data[col.es_index]
                    else:
                        col[k] = None

                for k, v in command.set.items():
                    col[k] = v
    except Exception as e:
        Log.error("should not happen", cause=e)
def fix(rownum, line, source, sample_only_filter, sample_size):
    """
    Prepare one JSON line for indexing; strips oversized repo diffs.

    :param rownum: zero-based row number within the source
    :param line: raw JSON text for one record
    :param source: source descriptor (used by _shorten)
    :param sample_only_filter: optional jx filter; when the first row matches, only that row is kept
    :param sample_size: sampling fraction denominator (default 0.01 when None)
    :return: (row, no_more_data) tuple where row is {"value": <data>}
    """
    value = json2value(line)

    if value._id.startswith(("tc.97", "96", "bb.27")):
        # AUG 24, 25 2017 - included full diff with repo; too big to index
        try:
            data = json2value(line)
            repo = data.repo
            repo.etl = None
            repo.branch.last_used = None
            repo.branch.description = None
            repo.branch.etl = None
            repo.branch.parent_name = None
            repo.children = None
            repo.parents = None
            if repo.changeset.diff or data.build.repo.changeset.diff:
                Log.error("no diff allowed")
            else:
                assertAlmostEqual(minimize_repo(repo), repo)
        except Exception as e:
            if CAN_NOT_DECODE_JSON in e:
                raise e
            # repo is not already minimized: strip it down and rewrite the line
            # NOTE(review): `value` (parsed above) is NOT re-parsed after this
            # rewrite, so the returned value may still carry the full repo —
            # confirm whether that is intended
            data.repo = minimize_repo(repo)
            data.build.repo = minimize_repo(data.build.repo)
            line = value2json(data)
    else:
        pass

    if rownum == 0:
        # first row: always normalize, and possibly stop after it (sampling)
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        value = _fix(value)
        if sample_only_filter and Random.int(
                int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter(
                    [value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            row = {"value": value}
            # True -> caller stops reading; we keep only this sampled row
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        _shorten(value, source)
        value = _fix(value)
    elif line.find('"resource_usage":') != -1:
        value = _fix(value)

    row = {"value": value}
    return row, False
def update(self, command):
    """
    Apply a jx-style update command to the column store.

    ``command.where`` selects columns (fast paths for eq-on-es_index,
    including wholesale deletion of an index's columns);
    ``command.clear`` lists properties to blank ("." marks the column deleted
    and removes it); ``command.set`` assigns new property values.  Changed
    columns are queued on self.todo for background persistence.
    """
    self.dirty = True
    try:
        command = wrap(command)
        DEBUG and Log.note(
            "Update {{timestamp}}: {{command|json}}",
            command=command,
            timestamp=Date(command["set"].last_updated),
        )
        eq = command.where.eq
        if eq.es_index:
            if len(eq) == 1:
                if unwraplist(command.clear) == ".":
                    # drop ALL columns for this index in one operation
                    d = self.data
                    i = eq.es_index
                    with self.locker:
                        cols = d[i]
                        del d[i]
                    for c in cols:
                        mark_as_deleted(c)
                        self.todo.add(c)
                    return

                # FASTEST
                all_columns = self.data.get(eq.es_index, {}).values()
                with self.locker:
                    columns = [c for cs in all_columns for c in cs]
            elif eq.es_column and len(eq) == 2:
                # FASTER
                all_columns = self.data.get(eq.es_index, {}).values()
                with self.locker:
                    columns = [
                        c for cs in all_columns for c in cs
                        if c.es_column == eq.es_column
                    ]
            else:
                # SLOWER
                all_columns = self.data.get(eq.es_index, {}).values()
                with self.locker:
                    columns = [
                        c for cs in all_columns for c in cs
                        if all(c[k] == v
                               for k, v in eq.items())  # THIS LINE IS VERY SLOW
                    ]
        else:
            columns = list(self)
            columns = jx.filter(columns, command.where)

        with self.locker:
            for col in columns:
                DEBUG and Log.note(
                    "update column {{table}}.{{column}}",
                    table=col.es_index,
                    column=col.es_column,
                )
                for k in command["clear"]:
                    if k == ".":
                        # delete the column; prune empty containers
                        mark_as_deleted(col)
                        self.todo.add(col)
                        lst = self.data[col.es_index]
                        cols = lst[col.name]
                        cols.remove(col)
                        if len(cols) == 0:
                            del lst[col.name]
                            if len(lst) == 0:
                                del self.data[col.es_index]
                        break
                    else:
                        col[k] = None
                else:
                    # DID NOT DELETE COLUMNM ("."), CONTINUE TO SET PROPERTIES
                    for k, v in command.set.items():
                        col[k] = v
                    self.todo.add(col)

    except Exception as e:
        Log.error("should not happen", cause=e)
def follow_paths(position, path, nested_path, done_relations, no_nested_docs):
    """
    Walk foreign-key relations outward from `position` (a table), emitting
    output columns and queueing further tables on `todo`.

    Many-to-one relations ("inner objects") are inlined into the current
    document path; one-to-many relations ("nested objects") open a new
    nested path.  Closure over: self, relations, columns, fact_table,
    reference_only_tables, reference_all_tables, nested_path_to_join,
    all_nested_paths, output_columns, todo.

    :param position: table descriptor currently being expanded
    :param path: document path accumulated so far
    :param nested_path: list of nested paths, innermost first
    :param done_relations: set of constraint names already followed
    :param no_nested_docs: when True, do not follow one-to-many relations
    """
    if position.name in self.settings.exclude:
        return
    if self.path_not_allowed(path):
        return
    if DEBUG:
        Log.note("Trace {{path}}", path=path)

    if position.name != "__ids__":
        # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR WHEN IF IT FAILS)
        self.db.query(
            ConcatSQL(
                SQL_SELECT,
                SQL_STAR,
                SQL_FROM,
                quote_column(position.schema, position.name),
                SQL_LIMIT,
                SQL_ONE,
            ))

    if position.name in reference_all_tables:
        no_nested_docs = True
    if position.name in reference_only_tables:
        return

    curr_join_list = copy(nested_path_to_join[nested_path[0]])

    ###############################################################################
    # INNER OBJECTS
    ###############################################################################
    referenced_tables = list(
        sort_using_key(
            jx.groupby(
                jx.filter(
                    relations,
                    {
                        "eq": {
                            "table.name": position.name,
                            "table.schema": position.schema,
                        }
                    },
                ),
                "constraint.name",
            ),
            key=lambda p: first(p[1]).column.name,
        ))
    for g, constraint_columns in referenced_tables:
        g = unwrap(g)
        constraint_columns = deepcopy(constraint_columns)
        if g["constraint.name"] in done_relations:
            continue
        if any(cc for cc in constraint_columns
               if cc.referenced.table.name in self.settings.exclude):
            continue

        done_relations.add(g["constraint.name"])

        many_to_one_joins = nested_path_to_join[nested_path[0]]
        index = len(many_to_one_joins)
        alias = "t" + text(index)
        for c in constraint_columns:
            c.referenced.table.alias = alias
            c.table = position
        many_to_one_joins.append({
            "join_columns": constraint_columns,
            "path": path,
            "nested_path": nested_path,
        })

        # HANDLE THE COMMON *id SUFFIX
        name = []
        for cname, tname in zip(
                constraint_columns.column.name,
                constraint_columns.referenced.table.name,
        ):
            if cname.startswith(tname):
                name.append(tname)
            elif cname.endswith("_id"):
                name.append(cname[:-3])
            else:
                name.append(cname)

        relation_string = many_to_one_string(constraint_columns[0])
        step = "/".join(name)
        if len(constraint_columns) == 1:
            # single-column relations may be renamed via settings
            step = self.name_relations.get(relation_string, step)

        referenced_column_path = concat_field(path, step)
        if self.path_not_allowed(referenced_column_path):
            continue

        if referenced_column_path in reference_only_tables:
            continue

        col_pointer_name = relative_field(referenced_column_path,
                                          nested_path[0])
        for col in columns:
            if (col.table.name == constraint_columns[0].referenced.table.name
                    and col.table.schema ==
                    constraint_columns[0].referenced.table.schema):
                col_full_name = concat_field(col_pointer_name,
                                             literal_field(col.column.name))

                if (col.is_id and len(nested_path) == 1
                        and col.table.name == fact_table.name
                        and col.table.schema == fact_table.schema):
                    # ALWAYS SHOW THE ID OF THE FACT
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text(c_index),
                        "column": col,
                        "sort": True,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name,
                    })
                elif col.column.name == constraint_columns[0].column.name:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name
                        if self.settings.show_foreign_keys else None,
                    })
                elif col.is_id:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name
                        if self.settings.show_foreign_keys else None,
                    })
                elif col.reference:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_pointer_name
                        if not self.settings.show_foreign_keys else
                        col_full_name,  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                    })
                elif col.include:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name,
                    })

        if position.name in reference_only_tables:
            continue

        todo.append(
            Data(
                position=copy(constraint_columns[0].referenced.table),
                path=referenced_column_path,
                nested_path=nested_path,
                done_relations=copy(done_relations),
                no_nested_docs=no_nested_docs,
            ))

    ###############################################################################
    # NESTED OBJECTS
    ###############################################################################
    if not no_nested_docs:
        nesting_tables = list(
            sort_using_key(
                jx.groupby(
                    jx.filter(
                        relations,
                        {
                            "eq": {
                                "referenced.table.name": position.name,
                                "referenced.table.schema": position.schema,
                            }
                        },
                    ),
                    "constraint.name",
                ),
                key=lambda p: [(r.table.name, r.column.name)
                               for r in [first(p[1])]][0],
            ))

        for g, constraint_columns in nesting_tables:
            g = unwrap(g)
            constraint_columns = deepcopy(constraint_columns)
            if g["constraint.name"] in done_relations:
                continue
            done_relations.add(g["constraint.name"])

            many_table = set(constraint_columns.table.name)
            if not (many_table - self.settings.exclude):
                continue

            relation_string = one_to_many_string(constraint_columns[0])
            step = "/".join(many_table)
            if len(constraint_columns) == 1:
                step = self.name_relations.get(relation_string, step)

            referenced_column_path = concat_field(path, step)
            if self.path_not_allowed(referenced_column_path):
                continue

            new_nested_path = [referenced_column_path] + nested_path
            all_nested_paths.append(new_nested_path)

            if referenced_column_path in nested_path_to_join:
                Log.error(
                    "{{path}} already exists, try adding entry to name_relations",
                    path=referenced_column_path,
                )
            one_to_many_joins = nested_path_to_join[
                referenced_column_path] = copy(curr_join_list)
            index = len(one_to_many_joins)
            alias = "t" + text(index)
            for c in constraint_columns:
                c.table.alias = alias
                c.referenced.table = position
            one_to_many_joins.append(
                set_default(
                    {},
                    g,
                    {
                        "children": True,
                        "join_columns": constraint_columns,
                        "path": path,
                        "nested_path": nested_path,
                    },
                ))

            for col in columns:
                if (col.table.name == constraint_columns[0].table.name
                        and col.table.schema ==
                        constraint_columns[0].table.schema):
                    col_full_name = join_field(
                        split_field(referenced_column_path)
                        [len(split_field(new_nested_path[0])):] +
                        [literal_field(col.column.name)])

                    if col.column.name == constraint_columns[0].column.name:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name
                            if self.settings.show_foreign_keys else None,
                        })
                    elif col.is_id:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name
                            if self.settings.show_foreign_keys else None,
                        })
                    else:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name if col.include else None,
                        })

            todo.append(
                Data(
                    position=constraint_columns[0].table,
                    path=referenced_column_path,
                    nested_path=new_nested_path,
                    done_relations=copy(done_relations),
                    no_nested_docs=no_nested_docs,
                ))
def _scan_database(self):
    """
    Scan information_schema to discover relations, tables and columns,
    then walk all foreign-key paths from the fact table, producing:
    self.all_nested_paths, self.nested_path_to_join, self.columns.
    """
    # GET ALL RELATIONS
    raw_relations = self.db.query(
        """
        SELECT
            table_schema,
            table_name,
            referenced_table_schema,
            referenced_table_name,
            referenced_column_name,
            constraint_name,
            column_name,
            ordinal_position
        FROM
            information_schema.key_column_usage
        WHERE
            referenced_column_name IS NOT NULL
        """,
        param=self.settings.database,
    )

    if not raw_relations:
        Log.error("No relations in the database")

    # add user-declared relations of the form "schema.table.col -> schema.table.col"
    for r in self.settings.add_relations:
        try:
            lhs, rhs = map(strings.trim, r.split("->"))
            lhs = lhs.split(".")
            if len(lhs) == 2:
                # schema omitted: use the configured default schema
                lhs = [self.settings.database.schema] + lhs
            rhs = rhs.split(".")
            if len(rhs) == 2:
                rhs = [self.settings.database.schema] + rhs
            to_add = Data(
                ordinal_position=1,  # CAN ONLY HANDLE 1-COLUMN RELATIONS
                table_schema=lhs[0],
                table_name=lhs[1],
                column_name=lhs[2],
                referenced_table_schema=rhs[0],
                referenced_table_name=rhs[1],
                referenced_column_name=rhs[2],
            )

            # CHECK IF EXISTING
            if jx.filter(raw_relations, {"eq": to_add}):
                Log.note("Relation {{relation}} already exists", relation=r)
                continue

            to_add.constraint_name = Random.hex(20)
            raw_relations.append(to_add)
        except Exception as e:
            Log.error("Could not parse {{line|quote}}", line=r, cause=e)

    # reshape flat rows into nested relation records
    relations = jx.select(
        raw_relations,
        [
            {"name": "constraint.name", "value": "constraint_name"},
            {"name": "table.schema", "value": "table_schema"},
            {"name": "table.name", "value": "table_name"},
            {"name": "column.name", "value": "column_name"},
            {"name": "referenced.table.schema", "value": "referenced_table_schema"},
            {"name": "referenced.table.name", "value": "referenced_table_name"},
            {"name": "referenced.column.name", "value": "referenced_column_name"},
            {"name": "ordinal_position", "value": "ordinal_position"},
        ],
    )

    # GET ALL TABLES
    raw_tables = self.db.query("""
        SELECT
            t.table_schema,
            t.table_name,
            c.constraint_name,
            c.constraint_type,
            k.column_name,
            k.ordinal_position
        FROM
            information_schema.tables t
        LEFT JOIN
            information_schema.table_constraints c on c.table_name=t.table_name AND c.table_schema=t.table_schema and (constraint_type='UNIQUE' or constraint_type='PRIMARY KEY')
        LEFT JOIN
            information_schema.key_column_usage k on k.constraint_name=c.constraint_name AND k.table_name=t.table_name and k.table_schema=t.table_schema
        ORDER BY
            t.table_schema,
            t.table_name,
            c.constraint_name,
            k.ordinal_position,
            k.column_name
        """)

    # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING
    tables = UniqueIndex(keys=["name", "schema"])
    for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]):
        c = wrap(list(c))
        best_index = Null
        is_referenced = False
        is_primary = False
        for g, w in jx.groupby(c, "constraint_name"):
            if not g.constraint_name:
                continue
            w = list(w)
            ref = False
            for r in relations:
                if (r.table.name == t.table_name
                        and r.table.schema == t.table_schema
                        and r.constraint.name == g.constraint_name):
                    ref = True
            is_prime = w[0].constraint_type == "PRIMARY"

            reasons_this_one_is_better = [
                best_index == None,  # WE DO NOT HAVE A CANDIDATE YET
                is_prime and not is_primary,  # PRIMARY KEYS ARE GOOD TO HAVE
                is_primary == is_prime and ref
                and not is_referenced,  # REFERENCED UNIQUE TUPLES ARE GOOD TOO
                is_primary == is_prime and ref == is_referenced
                and len(w) < len(best_index),  # THE SHORTER THE TUPLE, THE BETTER
            ]
            if any(reasons_this_one_is_better):
                is_primary = is_prime
                is_referenced = ref
                best_index = w

        tables.add({
            "name": t.table_name,
            "schema": t.table_schema,
            "id": [b.column_name for b in best_index],
        })

    fact_table = tables[self.settings.fact_table, self.settings.database.schema]
    ids_table = {
        "alias": "t0",
        "name": "__ids__",
        "schema": fact_table.schema,
        "id": fact_table.id,
    }
    # synthetic relation linking the virtual __ids__ table to the fact table
    relations.extend(
        wrap({
            "constraint": {"name": "__link_ids_to_fact_table__"},
            "table": ids_table,
            "column": {"name": c},
            "referenced": {"table": fact_table, "column": {"name": c}},
            "ordinal_position": i,
        }) for i, c in enumerate(fact_table.id))
    tables.add(ids_table)

    # GET ALL COLUMNS
    raw_columns = self.db.query("""
        SELECT
            column_name,
            table_schema,
            table_name,
            ordinal_position,
            data_type
        FROM
            information_schema.columns
        """)

    reference_only_tables = [
        r.split(".")[0] for r in self.settings.reference_only
        if len(r.split(".")) == 2
    ]
    reference_all_tables = [
        r.split(".")[0] for r in self.settings.reference_only
        if len(r.split(".")) == 1
    ]

    foreign_column_table_schema_triples = {(r.column.name, r.table.name,
                                            r.table.schema)
                                           for r in relations}
    referenced_column_table_schema_triples = {(
        r.referenced.column.name,
        r.referenced.table.name,
        r.referenced.table.schema,
    ) for r in relations}
    related_column_table_schema_triples = (
        foreign_column_table_schema_triples
        | referenced_column_table_schema_triples)

    # classify every column: include (output it), reference (stands for the
    # row), foreign (points at another row)
    columns = UniqueIndex(["column.name", "table.name", "table.schema"])
    for c in raw_columns:
        if c.table_name in reference_only_tables:
            if c.table_name + "." + c.column_name in self.settings.reference_only:
                include = True
                reference = True
                foreign = False
            elif c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            else:
                include = False
                reference = False
                foreign = False
        elif c.table_name in reference_all_tables:
            # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED
            if c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = True
                foreign = False
            elif (
                    c.column_name,
                    c.table_name,
                    c.table_schema,
            ) in foreign_column_table_schema_triples:
                include = False
                reference = False
                foreign = True
            else:
                include = True
                reference = False
                foreign = False
        elif c.column_name in tables[(c.table_name, c.table_schema)].id:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        elif (
                c.column_name,
                c.table_name,
                c.table_schema,
        ) in foreign_column_table_schema_triples:
            include = False
            reference = False
            foreign = True
        elif (
                c.column_name,
                c.table_name,
                c.table_schema,
        ) in referenced_column_table_schema_triples:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        else:
            include = True
            reference = False
            foreign = False

        rel = {
            "column": {"name": c.column_name, "type": c.data_type},
            "table": {"name": c.table_name, "schema": c.table_schema},
            "ordinal_position": c.ordinal_position,
            "is_id": c.column_name in tables[(c.table_name, c.table_schema)].id,
            "include": include,  # TRUE IF THIS COLUMN IS OUTPUTTED
            "reference": reference,  # TRUE IF THIS COLUMN REPRESENTS THE ROW
            "foreign": foreign,  # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW
        }
        columns.add(rel)

    # ITERATE OVER ALL PATHS
    todo = FlatList()
    output_columns = FlatList()
    nested_path_to_join = {}
    all_nested_paths = [["."]]

    def follow_paths(position, path, nested_path, done_relations,
                     no_nested_docs):
        # expand one table: inline many-to-one relations, open nested paths
        # for one-to-many relations, and queue further tables on `todo`
        if position.name in self.settings.exclude:
            return
        if self.path_not_allowed(path):
            return
        if DEBUG:
            Log.note("Trace {{path}}", path=path)

        if position.name != "__ids__":
            # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR WHEN IF IT FAILS)
            self.db.query(
                ConcatSQL(
                    SQL_SELECT,
                    SQL_STAR,
                    SQL_FROM,
                    quote_column(position.schema, position.name),
                    SQL_LIMIT,
                    SQL_ONE,
                ))

        if position.name in reference_all_tables:
            no_nested_docs = True
        if position.name in reference_only_tables:
            return

        curr_join_list = copy(nested_path_to_join[nested_path[0]])

        ###############################################################################
        # INNER OBJECTS
        ###############################################################################
        referenced_tables = list(
            sort_using_key(
                jx.groupby(
                    jx.filter(
                        relations,
                        {
                            "eq": {
                                "table.name": position.name,
                                "table.schema": position.schema,
                            }
                        },
                    ),
                    "constraint.name",
                ),
                key=lambda p: first(p[1]).column.name,
            ))
        for g, constraint_columns in referenced_tables:
            g = unwrap(g)
            constraint_columns = deepcopy(constraint_columns)
            if g["constraint.name"] in done_relations:
                continue
            if any(cc for cc in constraint_columns
                   if cc.referenced.table.name in self.settings.exclude):
                continue

            done_relations.add(g["constraint.name"])

            many_to_one_joins = nested_path_to_join[nested_path[0]]
            index = len(many_to_one_joins)
            alias = "t" + text(index)
            for c in constraint_columns:
                c.referenced.table.alias = alias
                c.table = position
            many_to_one_joins.append({
                "join_columns": constraint_columns,
                "path": path,
                "nested_path": nested_path,
            })

            # HANDLE THE COMMON *id SUFFIX
            name = []
            for cname, tname in zip(
                    constraint_columns.column.name,
                    constraint_columns.referenced.table.name,
            ):
                if cname.startswith(tname):
                    name.append(tname)
                elif cname.endswith("_id"):
                    name.append(cname[:-3])
                else:
                    name.append(cname)

            relation_string = many_to_one_string(constraint_columns[0])
            step = "/".join(name)
            if len(constraint_columns) == 1:
                step = self.name_relations.get(relation_string, step)

            referenced_column_path = concat_field(path, step)
            if self.path_not_allowed(referenced_column_path):
                continue

            if referenced_column_path in reference_only_tables:
                continue

            col_pointer_name = relative_field(referenced_column_path,
                                              nested_path[0])
            for col in columns:
                if (col.table.name ==
                        constraint_columns[0].referenced.table.name
                        and col.table.schema ==
                        constraint_columns[0].referenced.table.schema):
                    col_full_name = concat_field(
                        col_pointer_name, literal_field(col.column.name))

                    if (col.is_id and len(nested_path) == 1
                            and col.table.name == fact_table.name
                            and col.table.schema == fact_table.schema):
                        # ALWAYS SHOW THE ID OF THE FACT
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": True,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name,
                        })
                    elif col.column.name == constraint_columns[0].column.name:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name
                            if self.settings.show_foreign_keys else None,
                        })
                    elif col.is_id:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name
                            if self.settings.show_foreign_keys else None,
                        })
                    elif col.reference:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_pointer_name
                            if not self.settings.show_foreign_keys else
                            col_full_name,  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                        })
                    elif col.include:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name,
                        })

            if position.name in reference_only_tables:
                continue

            todo.append(
                Data(
                    position=copy(constraint_columns[0].referenced.table),
                    path=referenced_column_path,
                    nested_path=nested_path,
                    done_relations=copy(done_relations),
                    no_nested_docs=no_nested_docs,
                ))

        ###############################################################################
        # NESTED OBJECTS
        ###############################################################################
        if not no_nested_docs:
            nesting_tables = list(
                sort_using_key(
                    jx.groupby(
                        jx.filter(
                            relations,
                            {
                                "eq": {
                                    "referenced.table.name": position.name,
                                    "referenced.table.schema": position.schema,
                                }
                            },
                        ),
                        "constraint.name",
                    ),
                    key=lambda p: [(r.table.name, r.column.name)
                                   for r in [first(p[1])]][0],
                ))

            for g, constraint_columns in nesting_tables:
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                done_relations.add(g["constraint.name"])

                many_table = set(constraint_columns.table.name)
                if not (many_table - self.settings.exclude):
                    continue

                relation_string = one_to_many_string(constraint_columns[0])
                step = "/".join(many_table)
                if len(constraint_columns) == 1:
                    step = self.name_relations.get(relation_string, step)

                referenced_column_path = concat_field(path, step)
                if self.path_not_allowed(referenced_column_path):
                    continue

                new_nested_path = [referenced_column_path] + nested_path
                all_nested_paths.append(new_nested_path)

                if referenced_column_path in nested_path_to_join:
                    Log.error(
                        "{{path}} already exists, try adding entry to name_relations",
                        path=referenced_column_path,
                    )
                one_to_many_joins = nested_path_to_join[
                    referenced_column_path] = copy(curr_join_list)
                index = len(one_to_many_joins)
                alias = "t" + text(index)
                for c in constraint_columns:
                    c.table.alias = alias
                    c.referenced.table = position
                one_to_many_joins.append(
                    set_default(
                        {},
                        g,
                        {
                            "children": True,
                            "join_columns": constraint_columns,
                            "path": path,
                            "nested_path": nested_path,
                        },
                    ))

                for col in columns:
                    if (col.table.name == constraint_columns[0].table.name
                            and col.table.schema ==
                            constraint_columns[0].table.schema):
                        col_full_name = join_field(
                            split_field(referenced_column_path)
                            [len(split_field(new_nested_path[0])):] +
                            [literal_field(col.column.name)])

                        if col.column.name == constraint_columns[0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name
                                if self.settings.show_foreign_keys else None,
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name
                                if self.settings.show_foreign_keys else None,
                            })
                        else:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if col.include else None,
                            })

                todo.append(
                    Data(
                        position=constraint_columns[0].table,
                        path=referenced_column_path,
                        nested_path=new_nested_path,
                        done_relations=copy(done_relations),
                        no_nested_docs=no_nested_docs,
                    ))

    # seed the walk at the virtual __ids__ table and drain the queue
    path = "."
    nested_path = [path]
    nested_path_to_join["."] = [{
        "path": path,
        "join_columns": [{"referenced": {"table": ids_table}}],
        "nested_path": nested_path,
    }]

    todo.append(
        Data(
            position=ids_table,
            path=path,
            nested_path=nested_path,
            done_relations=set(),
            no_nested_docs=False,
        ))

    while todo:
        item = todo.pop(0)
        follow_paths(**item)

    self.all_nested_paths = all_nested_paths
    self.nested_path_to_join = nested_path_to_join
    self.columns = output_columns
def _get_spot_prices_from_aws(self):
    """
    Return a ListContainer of spot price records, merging the local price-file
    cache with fresh history fetched from EC2 (since the most recent cached
    timestamp per instance_type/zone, capped at one week back).  Also reloads
    self.no_capacity and rewrites the price file with the last two days of
    prices.
    """
    with Timer("Read no capacity file"):
        try:
            # FILE IS LIST OF {instance_type, last_failure} OBJECTS
            content = self.no_capacity_file.read()
            self.no_capacity = dict(
                (r.instance_type, r.last_failure)
                for r in convert.json2value(
                    content, flexible=False, leaves=False))
        except Exception as e:
            # best-effort: missing/corrupt file means no known capacity failures
            self.no_capacity = {}

    with Timer("Read pricing file"):
        try:
            content = File(self.settings.price_file).read()
            cache = convert.json2value(content, flexible=False, leaves=False)
        except Exception as e:
            # best-effort: start with an empty cache
            cache = FlatList()

    cache = ListContainer(name=None, data=cache)
    # newest cached timestamp per (instance_type, availability_zone)
    most_recents = jx.run({
        "from": cache,
        "edges": ["instance_type", "availability_zone"],
        "select": {"value": "timestamp", "aggregate": "max"}
    })

    zones = self._get_valid_availability_zones()
    prices = set(cache)
    with Timer("Get pricing from AWS"):
        for instance_type in self.settings.utility.keys():
            for zone in zones:
                if cache:
                    most_recent = most_recents[{
                        "instance_type": instance_type,
                        "availability_zone": zone
                    }].timestamp
                    # never look back more than a week
                    start_at = MAX([Date(most_recent), Date.today() - WEEK])
                else:
                    start_at = Date.today() - WEEK

                if DEBUG_PRICING:
                    Log.note(
                        "get pricing for {{instance_type}} starting at {{start_at}}",
                        instance_type=instance_type,
                        start_at=start_at)

                # page through the spot price history
                next_token = None
                while True:
                    resultset = self.ec2_conn.get_spot_price_history(
                        product_description=coalesce(
                            self.settings.product, "Linux/UNIX (Amazon VPC)"),
                        instance_type=instance_type,
                        availability_zone=zone,
                        start_time=start_at.format(ISO8601),
                        next_token=next_token)
                    next_token = resultset.next_token

                    for p in resultset:
                        prices.add(
                            wrap({
                                "availability_zone": p.availability_zone,
                                "instance_type": p.instance_type,
                                "price": p.price,
                                "product_description": p.product_description,
                                "region": p.region.name,
                                "timestamp": Date(p.timestamp).unix
                            }))

                    if not next_token:
                        break

    with Timer("Save prices to file"):
        # persist only recent prices to keep the file bounded
        new_prices = jx.filter(
            prices, {"gte": {
                "timestamp": {
                    "date": "today-2day"
                }
            }})

        def stream():  # IT'S A LOT OF PRICES, STREAM THEM TO FILE
            prefix = "[\n"
            for p in new_prices:
                yield prefix
                yield convert.value2json(p)
                prefix = ",\n"
            yield "]"

        File(self.settings.price_file).write(stream())

    return ListContainer(name="prices", data=prices)
def follow_paths(position, path, nested_path, done_relations, no_nested_docs):
    """
    Walk the relational schema outward from `position` (a table), recording
    the joins and output columns needed to build nested documents.

    Many-to-one relations (this table's foreign keys) become INNER OBJECTS of
    the current document; one-to-many relations (other tables referencing
    this one) become NESTED OBJECT arrays, unless no_nested_docs is set.
    Work is accumulated into names from the enclosing scope (todo,
    output_columns, all_nested_paths, nested_path_to_join, relations,
    columns, fact_table) — NOTE(review): those are presumably set up by the
    enclosing function; confirm against the surrounding code.

    :param position: table currently being traced
    :param path: dot-separated document path reached so far
    :param nested_path: list of paths, innermost first
    :param done_relations: set of constraint names already traced (cycle guard)
    :param no_nested_docs: suppress one-to-many expansion below this point
    """
    if position.name in self.settings.exclude:
        return
    if DEBUG:
        Log.note("Trace {{path}}", path=path)
    if position.name != "__ids__":
        # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR IF IT FAILS)
        self.db.query("SELECT * FROM " + quote_column(position.name, position.schema) + " LIMIT 1")

    if position.name in reference_all_tables:
        no_nested_docs = True
    if position.name in reference_only_tables:
        return

    curr_join_list = copy(nested_path_to_join[nested_path[0]])

    # INNER OBJECTS
    referenced_tables = list(
        jx.groupby(
            jx.filter(
                relations,
                {
                    "eq": {
                        "table.name": position.name,
                        "table.schema": position.schema
                    }
                }),
            "constraint.name"))
    for g, constraint_columns in referenced_tables:
        g = unwrap(g)
        # deepcopy SO ALIAS ASSIGNMENTS BELOW DO NOT MUTATE THE SHARED relations
        constraint_columns = deepcopy(constraint_columns)
        if g["constraint.name"] in done_relations:
            continue
        if any(cc for cc in constraint_columns
               if cc.referenced.table.name in self.settings.exclude):
            continue

        done_relations.add(g["constraint.name"])

        many_to_one_joins = nested_path_to_join[nested_path[0]]
        index = len(many_to_one_joins)
        alias = "t" + text_type(index)
        for c in constraint_columns:
            c.referenced.table.alias = alias
            c.table = position
        many_to_one_joins.append({
            "join_columns": constraint_columns,
            "path": path,
            "nested_path": nested_path
        })

        # referenced_table_path = join_field(split_field(path) + ["/".join(constraint_columns.referenced.table.name)])
        # HANDLE THE COMMON *id SUFFIX
        name = []
        for a, b in zip(constraint_columns.column.name, constraint_columns.referenced.table.name):
            if a.startswith(b):
                name.append(b)
            elif a.endswith("_id"):
                name.append(a[:-3])
            else:
                name.append(a)
        referenced_column_path = join_field(split_field(path) + ["/".join(name)])
        col_pointer_name = relative_field(referenced_column_path, nested_path[0])
        # insert into nested1 VALUES (100, 10, 'aaa', -1);
        # id.about.time.nested1 .ref=10
        # id.about.time.nested1 .ref.name
        for col in columns:
            # ONLY COLUMNS OF THE TABLE THIS CONSTRAINT POINTS AT
            if col.table.name == constraint_columns[0].referenced.table.name and col.table.schema == constraint_columns[0].referenced.table.schema:
                col_full_name = concat_field(col_pointer_name, literal_field(col.column.name))

                if col.is_id and col.table.name == fact_table.name and col.table.schema == fact_table.schema:
                    # ALWAYS SHOW THE ID OF THE FACT
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": True,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name
                    })
                elif col.column.name == constraint_columns[0].column.name:
                    # THE FOREIGN-KEY COLUMN ITSELF; ONLY EMITTED WHEN REQUESTED
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name if self.settings.show_foreign_keys else None
                    })
                elif col.is_id:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name if self.settings.show_foreign_keys else None
                    })
                elif col.reference:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_pointer_name if not self.settings.show_foreign_keys else col_full_name
                        # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                    })
                elif col.include:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name
                    })

        if position.name in reference_only_tables:
            continue

        # QUEUE THE REFERENCED TABLE FOR FURTHER TRACING
        todo.append(
            Data(position=copy(constraint_columns[0].referenced.table),
                 path=referenced_column_path,
                 nested_path=nested_path,
                 done_relations=copy(done_relations),
                 no_nested_docs=no_nested_docs))

    # NESTED OBJECTS
    if not no_nested_docs:
        for g, constraint_columns in jx.groupby(
                jx.filter(
                    relations,
                    {
                        "eq": {
                            "referenced.table.name": position.name,
                            "referenced.table.schema": position.schema
                        }
                    }),
                "constraint.name"):
            g = unwrap(g)
            constraint_columns = deepcopy(constraint_columns)
            if g["constraint.name"] in done_relations:
                continue
            done_relations.add(g["constraint.name"])

            many_table = set(constraint_columns.table.name)
            if not (many_table - self.settings.exclude):
                continue

            referenced_column_path = join_field(split_field(path) + ["/".join(many_table)])
            new_nested_path = [referenced_column_path] + nested_path
            all_nested_paths.append(new_nested_path)

            # if new_path not in self.settings.include:
            #     Log.note("Exclude nested path {{path}}", path=new_path)
            #     continue
            one_to_many_joins = nested_path_to_join[referenced_column_path] = copy(curr_join_list)
            index = len(one_to_many_joins)
            alias = "t" + text_type(index)
            for c in constraint_columns:
                c.table.alias = alias
                c.referenced.table = position
            one_to_many_joins.append(
                set_default({}, g, {
                    "children": True,
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path
                }))
            # insert into nested1 VALUES (100, 10, 'aaa', -1);
            # id.about.time.nested1 .ref=10
            # id.about.time.nested1 .ref.name
            for col in columns:
                # ONLY COLUMNS OF THE "MANY" SIDE OF THIS CONSTRAINT
                if col.table.name == constraint_columns[0].table.name and col.table.schema == constraint_columns[0].table.schema:
                    col_full_name = join_field(
                        split_field(referenced_column_path)
                        [len(split_field(new_nested_path[0])):] +
                        [literal_field(col.column.name)])

                    if col.column.name == constraint_columns[0].column.name:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    elif col.is_id:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    else:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name if col.include else None
                        })

            # QUEUE THE "MANY" TABLE FOR FURTHER TRACING, ONE LEVEL DEEPER
            todo.append(
                Data(position=constraint_columns[0].table,
                     path=referenced_column_path,
                     nested_path=new_nested_path,
                     done_relations=copy(done_relations),
                     no_nested_docs=no_nested_docs))
def update(self, command): self.dirty = True try: command = wrap(command) DEBUG and Log.note( "Update {{timestamp}}: {{command|json}}", command=command, timestamp=Date(command["set"].last_updated), ) eq = command.where.eq if eq.es_index: if len(eq) == 1: if unwraplist(command.clear) == ".": with self.locker: del self.data[eq.es_index] self.todo.add( ( EXECUTE, "DELETE FROM " + db_table_name + SQL_WHERE + " es_index=" + quote_value(eq.es_index), ) ) return # FASTEST all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [c for cs in all_columns for c in cs] elif eq.es_column and len(eq) == 2: # FASTER all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [ c for cs in all_columns for c in cs if c.es_column == eq.es_column ] else: # SLOWER all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [ c for cs in all_columns for c in cs if all( c[k] == v for k, v in eq.items() ) # THIS LINE IS VERY SLOW ] else: columns = list(self) columns = jx.filter(columns, command.where) with self.locker: for col in columns: DEBUG and Log.note( "update column {{table}}.{{column}}", table=col.es_index, column=col.es_column, ) for k in command["clear"]: if k == ".": self.todo.add((DELETE, col)) lst = self.data[col.es_index] cols = lst[col.name] cols.remove(col) if len(cols) == 0: del lst[col.name] if len(lst) == 0: del self.data[col.es_index] break else: col[k] = None else: # DID NOT DELETE COLUMNM ("."), CONTINUE TO SET PROPERTIES for k, v in command.set.items(): col[k] = v self.todo.add((UPDATE, col)) except Exception as e: Log.error("should not happen", cause=e)
def test_simple_depth_filter(self): data = [Data(**{u'test_build': {u'name': u'Firefox'}})] result = jx.filter(data, {u'term': {u'test_build.name': u'Firefox'}}) assert len(result) == 1
def __init__(self, kwargs=None): self.settings = kwargs self.schema = SnowflakeSchema(self.settings.snowflake) self._extract = extract = kwargs.extract # SOME PREP get_git_revision() # VERIFY WE DO NOT HAVE TOO MANY OTHER PROCESSES WORKING ON STUFF with MySQL(**kwargs.snowflake.database) as db: processes = None try: processes = jx.filter( db.query("show processlist"), { "and": [{ "neq": { "Command": "Sleep" } }, { "neq": { "Info": "show processlist" } }] }) except Exception as e: Log.warning("no database", cause=e) if processes: if DEBUG: Log.warning("Processes are running\n{{list|json}}", list=processes) else: Log.error("Processes are running\n{{list|json}}", list=processes) extract.type = listwrap(extract.type) extract.start = listwrap(extract.start) extract.batch = listwrap(extract.batch) extract.field = listwrap(extract.field) if any( len(extract.type) != len(other) for other in [extract.start, extract.batch, extract.field]): Log.error( "Expecting same number of dimensions for `type`, `start`, `batch`, and `field` in the `extract` inner object" ) for i, t in enumerate(extract.type): if t == "time": extract.start[i] = Date(extract.start[i]) extract.batch[i] = Duration(extract.batch[i]) elif t == "number": pass else: Log.error('Expecting `extract.type` to be "number" or "time"') extract.threads = coalesce(extract.threads, 1) self.done_pulling = Signal() self.queue = Queue("all batches", max=2 * coalesce(extract.threads, 1), silent=True) self.bucket = s3.Bucket(self.settings.destination) self.notify = aws.Queue(self.settings.notify) Thread.run("get records", self.pull_all_remaining)
def test_split_filter(self): data = [{u'testrun': {u'suite': u'tp5o'}, u'result': {u'test_name': u'digg.com'}}] result = jx.filter(data, {u'and': [{u'term': {u'testrun.suite': u'tp5o'}}, {u'term': {u'result.test_name': u'digg.com'}}]}) assert len(result) == 1
def fix(source_key, rownum, line, source, sample_only_filter, sample_size): """ :param rownum: :param line: :param source: :param sample_only_filter: :param sample_size: :return: (row, no_more_data) TUPLE WHERE row IS {"value":<data structure>} OR {"json":<text line>} """ value = json2value(line) if rownum == 0: if len(line) > MAX_RECORD_LENGTH: _shorten(source_key, value, source) value = _fix(value) if sample_only_filter and Random.int(int(1.0/coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter): # INDEX etl.id==0, BUT NO MORE if value.etl.id != 0: Log.error("Expecting etl.id==0") row = {"value": value} return row, True elif len(line) > MAX_RECORD_LENGTH: _shorten(source_key, value, source) value = _fix(value) elif '"resource_usage":' in line: value = _fix(value) row = {"value": value} return row, False