def _get_columns(self, table=None):
    """
    Fetch column metadata for the given table path from elasticsearch and
    upsert each column into self.columns — once under the index name, and
    once under each alias of that index.
    """
    # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
    alias_done = set()  # aliases already credited to a (newer) index
    index = split_field(table)[0]        # first segment of the path is the index
    query_path = split_field(table)[1:]  # remaining segments are the nested query path
    metadata = self.default_es.get_metadata(index=index)
    # newest index first, so an alias is claimed by the most-recent index carrying it
    for index, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
        for _, properties in meta.mappings.items():
            columns = _elasticsearch.parse_properties(index, None, properties.properties)
            columns = columns.filter(lambda r: not r.abs_name.startswith("other.") and not r.abs_name.startswith("previous_values.cf_"))  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            with Timer("upserting {{num}} columns", {"num": len(columns)}, debug=DEBUG):
                with self.columns.locker:
                    for c in columns:
                        # ABSOLUTE
                        c.table = join_field([index] + query_path)
                        self.upsert_column(c)
                        for alias in meta.aliases:
                            # ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS
                            if alias in alias_done:
                                continue
                            alias_done.add(alias)
                            # NOTE(review): alias_done is marked while handling the FIRST
                            # column, so later columns are never copied to the alias —
                            # confirm this is intended and not a nesting bug
                            c = copy(c)
                            c.table = join_field([alias] + query_path)
                            self.upsert_column(c)
def select(self, selectList, fromPath, varName, sourceVar):
    """
    Emit MVEL code that pipes the selected values into varName as a
    "|"-separated string.  Returns Dict(head=<setup code>, body=<assignment>).
    """
    path = split_field(fromPath)
    is_deep = len(path) > 1  # deep paths reach into nested documents
    heads = []
    list = []
    for s in selectList:
        if is_deep:
            if s.value and isKeyword(s.value):
                shortForm = self._translate(s.value)
                list.append("Value2Pipe(" + shortForm + ")\n")
            else:
                Log.error("do not know how to handle yet")
        else:
            if s.value and isKeyword(s.value):
                list.append("Value2Pipe(getDocValue(" + value2MVEL(s.value) + "))\n")
            elif s.value:
                shortForm = self._translate(s.value)
                list.append("Value2Pipe(" + shortForm + ")\n")
            else:
                # no value: the select is a domain of parts
                code, decode = self.Parts2Term(s.domain)
                heads.append(code.head)
                list.append("Value2Pipe(" + code.body + ")\n")

    # NOTE(review): the entries in `list` are already wrapped in Value2Pipe(...),
    # and the join below wraps them a second time — confirm the double wrap is intended
    if len(split_field(fromPath)) > 1:
        output = 'if (' + varName + ' != "") ' + varName + '+="|";\n' + varName + '+=' + '+"|"+'.join(["Value2Pipe(" + v + ")\n" for v in list]) + ';\n'
    else:
        output = varName + ' = ' + '+"|"+'.join(["Value2Pipe(" + v + ")\n" for v in list]) + ';\n'

    return Dict(
        head="".join(heads),
        body=output
    )
def select(self, selectList, fromPath, varName, sourceVar):
    """
    Emit MVEL code that pipes the selected values into varName as a
    "|"-separated string.  Returns Dict(head=<setup code>, body=<assignment>).
    """
    path = split_field(fromPath)
    is_deep = len(path) > 1  # deep paths reach into nested documents
    heads = []
    list = []
    for s in selectList:
        if is_deep:
            if s.value and isKeyword(s.value):
                shortForm = self._translate(s.value)
                list.append("Value2Pipe(" + shortForm + ")\n")
            else:
                Log.error("do not know how to handle yet")
        else:
            if s.value and isKeyword(s.value):
                list.append("Value2Pipe(getDocValue(" + value2MVEL(s.value) + "))\n")
            elif s.value:
                shortForm = self._translate(s.value)
                list.append("Value2Pipe(" + shortForm + ")\n")
            else:
                # no value: the select is a domain of parts
                code, decode = self.Parts2Term(s.domain)
                heads.append(code.head)
                list.append("Value2Pipe(" + code.body + ")\n")

    # NOTE(review): entries in `list` are already Value2Pipe(...)-wrapped and the
    # join below wraps them again — confirm the double wrap is intended
    if len(split_field(fromPath)) > 1:
        output = 'if (' + varName + ' != "") ' + varName + '+="|";\n' + varName + '+=' + '+"|"+'.join(["Value2Pipe(" + v + ")\n" for v in list]) + ';\n'
    else:
        output = varName + ' = ' + '+"|"+'.join(["Value2Pipe(" + v + ")\n" for v in list]) + ';\n'

    return Dict(head="".join(heads), body=output)
def tuple(data, field_name):
    """
    RETURN LIST OF TUPLES

    :param data: iterable of records (Cube and FlatList are not supported)
    :param field_name: dotted string, {"value": ...} mapping, or list of fields
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")
    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if isinstance(field_name, Mapping) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if isinstance(field_name, basestring):
        if len(split_field(field_name)) == 1:
            # shallow field: one 1-tuple per record
            return [(d[field_name],) for d in data]
        else:
            # deep field: delegate the descent to the helper
            path = split_field(field_name)
            output = []
            flat_list._tuple1(data, path, 0, output)
            return output
    elif isinstance(field_name, list):
        paths = [_select_a_field(f) for f in field_name]
        output = DictList()
        _tuple((), unwrap(data), paths, 0, output)
        return output
    else:
        paths = [_select_a_field(field_name)]
        output = DictList()
        _tuple((), data, paths, 0, output)
        return output
def tuple(data, field_name):
    """
    RETURN LIST OF TUPLES
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")
    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if isinstance(field_name, Mapping) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if isinstance(field_name, basestring):
        steps = split_field(field_name)
        if len(steps) == 1:
            # shallow field: one 1-tuple per record
            return [(row[field_name],) for row in data]
        accumulator = []
        flat_list._tuple1(data, steps, 0, accumulator)
        return accumulator

    if isinstance(field_name, list):
        selected = [_select_a_field(f) for f in field_name]
        accumulator = DictList()
        _tuple((), unwrap(data), selected, 0, accumulator)
        return accumulator

    selected = [_select_a_field(field_name)]
    accumulator = DictList()
    _tuple((), data, selected, 0, accumulator)
    return accumulator
def _select_a_field(field):
    # Normalize a select clause to {"name": ..., "value": <path list>} form.
    if isinstance(field, basestring):
        return wrap({"name": field, "value": split_field(field)})
    wrapped = wrap(field)
    if isinstance(wrapped.value, basestring):
        return wrap({"name": wrapped.name, "value": split_field(wrapped.value)})
    return wrap({"name": field.name, "value": field.value})
def select(self, fields):
    """
    Select one or more fields out of this FlatList.
    A string field returns a flat list of values; a list of fields returns a
    DictList of records.  Multiselect via non-callable deep paths is not supported.
    """
    if isinstance(fields, Mapping):
        fields = fields.value

    if isinstance(fields, basestring):
        # RETURN LIST OF VALUES
        if len(split_field(fields)) == 1:
            if self.path[0] == fields:
                return [d[1] for d in self.data]
            else:
                return [d[0][fields] for d in self.data]
        else:
            keys = split_field(fields)
            depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
            short_key = keys[depth:]
            output = DictList()
            _select1((wrap(d[depth]) for d in self.data), short_key, 0, output)
            return output

    if isinstance(fields, list):
        output = DictList()
        meta = []
        for f in fields:
            if hasattr(f.value, "__call__"):
                # value is already an accessor function
                meta.append((f.name, f.value))
            else:
                # partial() binds f.value now, avoiding the late-binding closure trap
                meta.append((f.name, functools.partial(lambda v, d: d[v], f.value)))

        for row in self._values():
            agg = Dict()
            for name, f in meta:
                agg[name] = f(row)
            output.append(agg)
        return output

    # earlier depth-based implementation, kept for reference:
    # meta = []
    # for f in fields:
    #     keys = split_field(f.value)
    #     depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
    #     short_key = join_field(keys[depth:])
    #
    #     meta.append((f.name, depth, short_key))
    #
    # for row in self._data:
    #     agg = Dict()
    #     for name, depth, short_key in meta:
    #         if short_key:
    #             agg[name] = row[depth][short_key]
    #         else:
    #             agg[name] = row[depth]
    #     output.append(agg)
    # return output

    Log.error("multiselect over FlatList not supported")
def getFrameVariables(self, body):
    """
    Scan an MVEL body for references to known columns and emit the MVEL
    declarations that bring those columns into scope.  Returns the
    declarations as one string.
    """
    contextVariables = []
    columns = self.fromData.columns

    parentVarNames = set()  # ALL PARENTS OF VARIABLES WITH "." IN NAME
    body = body.replace(".?", ".")

    for i, c in enumerate(columns):
        j = body.find(c.name, 0)
        while j >= 0:
            s = j
            j = body.find(c.name, s + 1)

            # peek at surrounding text to reject quoted or _source-qualified hits
            test0 = body[s - 1:s + len(c.name) + 1:]
            test3 = body[s - 8:s + len(c.name):]

            if test0[:-1] == "\"" + c.name:
                continue  # quoted literal, not a variable reference
            if test3 == "_source." + c.name:
                continue  # already qualified

            def defParent(name):
                # DO NOT MAKE THE SAME PARENT TWICE
                if name in parentVarNames:
                    return
                parentVarNames.add(name)
                if len(split_field(name)) == 1:
                    contextVariables.append("Map " + name + " = new HashMap();\n")
                else:
                    defParent(join_field(split_field(name)[0:-1]))
                    contextVariables.append(name + " = new HashMap();\n")

            # blank out the match so later columns do not re-match inside it
            body = body.replace(c.name, "-" * len(c.name))
            if self.isLean or c.useSource:
                if len(split_field(c.name)) > 1:
                    defParent(join_field(split_field(c.name)[0:-1]))
                    contextVariables.append(c.name + " = getSourceValue(\"" + c.name + "\");\n")
                else:
                    contextVariables.append(c.name + " = _source[\"" + c.name + "\"];\n")
            else:
                # NOTE(review): both arms of this branch are identical — confirm intended
                if len(split_field(c.name)) > 1:
                    defParent(join_field(split_field(c.name)[0:-1]))
                    contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n")
                else:
                    contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n")
            break  # first usable occurrence is enough; move to next column

    return "".join(contextVariables)
def add_column(c, query_path):
    # Stamp the column, register it under its index name, then once per alias.
    tail = split_field(query_path[0])
    c.last_updated = Date.now()
    c.table = join_field([c.es_index] + tail)
    with self.meta.columns.locker:
        self._upsert_column(c)
        for alias in meta.aliases:
            c = copy(c)
            c.table = join_field([alias] + tail)
            self._upsert_column(c)
def add_column(c, query_path):
    # Stamp the column with an update time, register it under the index
    # name, then register a copy under each alias of the index.
    c.last_updated = Date.now()
    c.table = join_field([c.es_index] + split_field(query_path[0]))
    with self.meta.columns.locker:
        self._upsert_column(c)
        for alias in meta.aliases:
            c = copy(c)
            c.table = join_field([alias] + split_field(query_path[0]))
            self._upsert_column(c)
def defParent(name):
    # DO NOT MAKE THE SAME PARENT TWICE
    if name in parentVarNames:
        return
    parentVarNames.add(name)

    steps = split_field(name)
    if len(steps) == 1:
        # top-level parent: declare a fresh map
        contextVariables.append("Map " + name + " = new HashMap();\n")
    else:
        # ensure the grandparent exists first, then attach this level
        defParent(join_field(steps[0:-1]))
        contextVariables.append(name + " = new HashMap();\n")
def _get_nested_path(field, schema):
    # Find the deepest prefix of the absolute field name that is a known
    # index path; return that path with the index segment stripped.
    if not INDEX_CACHE:
        _late_import()

    if is_keyword(field):
        field = join_field([schema.es.alias] + split_field(field))
        steps = split_field(field)
        for i, f in reverse(enumerate(steps)):
            prefix = join_field(steps[0:i + 1])
            if prefix in INDEX_CACHE:
                return join_field(split_field(prefix)[1:])
    return None
def getFrameVariables(self, body):
    """
    Scan an MVEL body for references to known columns and emit the MVEL
    declarations that bring those columns into scope.  Returns the
    declarations as one string.
    """
    contextVariables = []
    columns = self.fromData.columns

    parentVarNames = set()  # ALL PARENTS OF VARIABLES WITH "." IN NAME
    body = body.replace(".?", ".")

    for i, c in enumerate(columns):
        j = body.find(c.name, 0)
        while j >= 0:
            s = j
            j = body.find(c.name, s + 1)

            # peek at surrounding text to reject quoted or _source-qualified hits
            test0 = body[s - 1: s + len(c.name) + 1:]
            test3 = body[s - 8: s + len(c.name):]

            if test0[:-1] == "\"" + c.name:
                continue  # quoted literal, not a variable reference
            if test3 == "_source." + c.name:
                continue  # already qualified

            def defParent(name):
                # DO NOT MAKE THE SAME PARENT TWICE
                if name in parentVarNames:
                    return
                parentVarNames.add(name)
                if len(split_field(name)) == 1:
                    contextVariables.append("Map " + name + " = new HashMap();\n")
                else:
                    defParent(join_field(split_field(name)[0:-1]))
                    contextVariables.append(name + " = new HashMap();\n")

            # blank out the match so later columns do not re-match inside it
            body = body.replace(c.name, "-"*len(c.name))
            if self.isLean or c.useSource:
                if len(split_field(c.name)) > 1:
                    defParent(join_field(split_field(c.name)[0:-1]))
                    contextVariables.append(c.name + " = getSourceValue(\"" + c.name + "\");\n")
                else:
                    contextVariables.append(c.name + " = _source[\"" + c.name + "\"];\n")
            else:
                # NOTE(review): both arms of this branch are identical — confirm intended
                if len(split_field(c.name)) > 1:
                    defParent(join_field(split_field(c.name)[0:-1]))
                    contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n")
                else:
                    contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n")
            break  # first usable occurrence is enough; move to next column

    return "".join(contextVariables)
def parse_field(fieldname, data, depth):
    """
    RETURN (first, rest) OF fieldname

    Walks the dotted fieldname into data, recording the nesting decisions in
    the module-level primary_nested / primary_column / primary_branch lists.
    """
    col = split_field(fieldname)
    d = data
    for i, c in enumerate(col):
        try:
            d = d[c]
        except Exception, e:
            Log.error("{{name}} does not exist", name=fieldname)
        if isinstance(d, list) and len(col) > 1:
            if len(primary_column) <= depth + i:
                # first time at this depth: record a nested branch
                primary_nested.append(True)
                primary_column.append(c)
                primary_branch.append(d)
            elif primary_nested[depth] and primary_column[depth + i] != c:
                # NOTE(review): the guard indexes primary_nested[depth], not
                # [depth + i] — confirm this asymmetry is intended
                Log.error("only one branch of tree allowed")
            else:
                primary_nested[depth + i] = True
                primary_column[depth + i] = c
                primary_branch[depth + i] = d

            return c, join_field(col[i + 1:])
        else:
            if len(primary_column) <= depth + i:
                primary_nested.append(False)
                primary_column.append(c)
                primary_branch.append([d])
    # NOTE(review): falls through with implicit None when no list is met — confirm callers expect that
def es_query_template(path):
    """
    RETURN TEMPLATE AND PATH-TO-FILTER AS A 2-TUPLE
    :param path:
    :return:
    """
    sub_path = split_field(path)[1:]

    if not sub_path:
        # shallow query: one filter slot only
        plain_filter = {}
        template = wrap({"query": {"filtered": {"filter": plain_filter}}, "from": 0, "size": 0, "sort": []})
        return template, wrap([plain_filter])

    # deep query: outer filter plus a nested filter on the sub path
    outer_filter = {}
    inner_filter = {}
    template = wrap({
        "filter": {"and": [
            outer_filter,
            {"nested": {
                "path": join_field(sub_path),
                "filter": inner_filter,
                "inner_hits": {"size": 100000}
            }}
        ]},
        "from": 0,
        "size": 0,
        "sort": []
    })
    return template, wrap([outer_filter, inner_filter])
def getDomain(self, **kwargs):
    """
    Build the list of partitions for this domain at the requested depth.
    NOTE(review): no return statement is visible in this view — the tail of
    this method (presumably returning/using `partitions`) appears truncated.
    """
    # kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS
    kwargs = wrap(kwargs)
    kwargs.depth = coalesce(kwargs.depth, len(self.fields) - 1 if isinstance(self.fields, list) else None)

    if not self.partitions and self.edges:
        # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
        partitions = [
            {
                "name": v.name,
                "value": v.name,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.edges)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.where
        ]
        self.isFacet = True
    elif kwargs.depth == None:  # ASSUME self.fields IS A dict
        partitions = DictList()
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                break
            partitions.append({
                "name": part.name,
                "value": part.value,
                "where": part.where,
                "style": coalesce(part.style, part.parent.style),
                "weight": part.weight  # YO! WHAT DO WE *NOT* COPY?
            })
    elif kwargs.depth == 0:
        partitions = [
            {
                "name": v.name,
                "value": v.value,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.partitions)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT)]
    elif kwargs.depth == 1:
        partitions = DictList()
        rownum = 0
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                continue
            rownum += 1
            try:
                for j, subpart in enumerate(part.partitions):
                    partitions.append({
                        # child name is parent path + child name
                        "name": join_field(split_field(subpart.parent.name) + [subpart.name]),
                        "value": subpart.value,
                        "where": subpart.where,
                        "style": coalesce(subpart.style, subpart.parent.style),
                        "weight": subpart.weight  # YO! WHAT DO WE *NOT* COPY?
                    })
            except Exception, e:
                Log.error("", e)
def __setitem__(self, key, value):
    """
    Assign value at a (possibly dotted) key inside the wrapped dict.
    "." replaces the whole inner dict; a None value removes the leaf.
    """
    if key == "":
        from pyLibrary.debugs.logs import Log
        Log.error("key is empty string. Probably a bad idea")
    if key == ".":
        # SOMETHING TERRIBLE HAPPENS WHEN value IS NOT A Mapping;
        # HOPEFULLY THE ONLY OTHER METHOD RUN ON self IS unwrap()
        v = unwrap(value)
        _set(self, "_dict", v)
        return v
    if isinstance(key, str):
        key = key.decode("utf8")

    try:
        d = _get(self, "_dict")
        value = unwrap(value)
        if key.find(".") == -1:
            # simple key: direct write / delete
            if value is None:
                d.pop(key, None)
            else:
                d[key] = value
            return self

        # dotted key: build intermediate dicts as needed, then write the leaf
        seq = split_field(key)
        for k in seq[:-1]:
            d = _getdefault(d, k)
        if value == None:
            d.pop(seq[-1], None)
        else:
            d[seq[-1]] = value
        return self
    except Exception, e:
        raise e
def __setitem__(self, key, value): if key == "": from pyLibrary.debugs.logs import Log Log.error("key is empty string. Probably a bad idea") if isinstance(key, str): key = key.decode("utf8") d=self try: value = unwrap(value) if key.find(".") == -1: if value is None: dict.pop(d, key, None) else: dict.__setitem__(d, key, value) return self seq = split_field(key) for k in seq[:-1]: d = _getdefault(d, k) if value == None: dict.pop(d, seq[-1], None) else: dict.__setitem__(d, seq[-1], value) return self except Exception, e: raise e
def is_setop(es, query):
    # Only ES 1.4 through 1.7 is handled by this code path.
    if not any(map(es.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."])):
        return False

    select = listwrap(query.select)

    if query.edges:
        # SMOOTH EDGES (ALGEBRAIC DOMAIN, NO INTERVAL) STILL QUALIFY AS A SET OPERATION
        isSmooth = AND((e.domain.type in ALGEBRAIC and e.domain.interval == "none") for e in query.edges)
        if isSmooth:
            return True
        return False

    # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isDeep = len(split_field(query.frum.name)) > 1
    # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT
    simpleAgg = AND([s.aggregate in ("count", "none") for s in select])

    # NO EDGES IMPLIES SIMPLER QUERIES: EITHER A SET OPERATION, OR RETURN SINGLE AGGREGATE
    if simpleAgg or isDeep:
        return True
    return False
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS

    :param table_name: full table path; only the first segment names the index
    :param column_name: optional filter to one column
    :param force: True to refresh metadata regardless of age
    """
    try:
        # LAST TIME WE GOT INFO FOR THIS TABLE
        short_name = join_field(split_field(table_name)[0:1])
        table = self.get_table(short_name)[0]

        if not table:
            # never seen this table: register it and fetch its columns
            table = Table(
                name=short_name,
                url=None,
                query_path=None,
                timestamp=Date.now()
            )
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._get_columns(table=short_name)
        elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
            # metadata is stale (or refresh demanded): fetch again
            table.timestamp = Date.now()
            self._get_columns(table=short_name)

        with self.meta.columns.locker:
            columns = self.meta.columns.find(table_name, column_name)
        if columns:
            columns = jx.sort(columns, "name")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                Log.note("waiting for columns to update {{columns|json}}", columns=[c.table + "." + c.es_column for c in columns if not c.last_updated])
                Thread.sleep(seconds=1)
            return columns
        # NOTE(review): returns implicit None when no columns are found — confirm callers handle this
    except Exception, e:
        Log.error("Not expected", cause=e)
def wrap_from(frum, schema=None):
    """
    Normalize a "from" clause into a queryable container.
    :param frum: index-name string, or Mapping describing the source
    :param schema:
    :return: container, Query, or frum unchanged
    """
    if not type2container:
        _delayed_imports()

    frum = wrap(frum)

    if isinstance(frum, basestring):
        if not config.default.settings:
            Log.error(
                "expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info"
            )
        settings = set_default({
            "index": split_field(frum)[0],
            "name": frum,
        }, config.default.settings)
        settings.type = None  # WE DO NOT WANT TO INFLUENCE THE TYPE BECAUSE NONE IS IN THE frum STRING ANYWAY
        return type2container["elasticsearch"](settings)
    elif isinstance(frum, Mapping) and frum.type and type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return type2container[frum.type](frum.settings)
    elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
        from pyLibrary.queries.query import Query
        return Query(frum, schema=schema)
    else:
        return frum
def is_fieldop(es, query):
    # Only ES 1.4 / 1.5 support this code path.
    if not (es.cluster.version.startswith("1.4.") or es.cluster.version.startswith("1.5.")):
        return False

    # THESE SMOOTH EDGES REQUIRE ALL DATA (SETOP)
    select = listwrap(query.select)

    if query.edges:
        isSmooth = AND((e.domain.type in domains.ALGEBRAIC and e.domain.interval == "none") for e in query.edges)
        if isSmooth:
            return True
        return False

    # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isDeep = len(split_field(query.frum.name)) > 1
    isSimple = AND(s.value != None and (s.value in ["*", "."] or is_keyword(s.value)) for s in select)
    noAgg = AND(s.aggregate == "none" for s in select)

    if not isDeep and isSimple and noAgg:
        return True
    return False
def __setitem__(self, key, value): if key == "": from pyLibrary.debugs.logs import Log Log.error("key is empty string. Probably a bad idea") if isinstance(key, str): key = key.decode("utf8") try: d = _get(self, "_dict") value = unwrap(value) if key.find(".") == -1: if value is None: d.pop(key, None) else: d[key] = value return self seq = split_field(key) for k in seq[:-1]: d = _getdefault(d, k) if value == None: d.pop(seq[-1], None) else: d[seq[-1]] = value return self except Exception, e: raise e
def parse_field(fieldname, data, depth):
    """
    RETURN (first, rest) OF fieldname

    Walks the dotted fieldname into data, recording the nesting decisions in
    the module-level primary_nested / primary_column / primary_branch lists.
    """
    col = split_field(fieldname)
    d = data
    for i, c in enumerate(col):
        try:
            d = d[c]
        except Exception, e:
            Log.error("{{name}} does not exist", name=fieldname)
        if isinstance(d, list) and len(col) > 1:
            if len(primary_column) <= depth + i:
                # first time at this depth: record a nested branch
                primary_nested.append(True)
                primary_column.append(c)
                primary_branch.append(d)
            elif primary_nested[depth] and primary_column[depth + i] != c:
                # NOTE(review): guard indexes primary_nested[depth], not
                # [depth + i] — confirm this asymmetry is intended
                Log.error("only one branch of tree allowed")
            else:
                primary_nested[depth + i] = True
                primary_column[depth + i] = c
                primary_branch[depth + i] = d

            return c, join_field(col[i + 1:])
        else:
            if len(primary_column) <= depth + i:
                primary_nested.append(False)
                primary_column.append(c)
                primary_branch.append([d])
    # NOTE(review): falls through with implicit None when no list is met — confirm callers expect that
def __getitem__(self, key):
    """
    Look up a (possibly dotted) key in the wrapped dict; missing keys yield
    a NullType placeholder so chained access never raises.
    """
    if key == None:
        return Null
    if key == ".":
        output = _get(self, "_dict")
        if isinstance(output, Mapping):
            return self
        else:
            return output

    if isinstance(key, str):
        key = key.decode("utf8")
    elif not isinstance(key, unicode):
        from pyLibrary.debugs.logs import Log
        Log.error("only string keys are supported")

    d = _get(self, "_dict")

    if key.find(".") >= 0:
        # dotted key: descend one step at a time
        seq = split_field(key)
        for n in seq:
            if isinstance(d, NullType):
                d = NullType(d, n)  # OH DEAR, Null TREATS n AS PATH, NOT LITERAL
            else:
                d = _getdefault(d, n)  # EVERYTHING ELSE TREATS n AS LITERAL
        return wrap(d)
    else:
        o = d.get(key)

    if o == None:
        return NullType(d, key)
    return wrap(o)
def wrap_from(frum, schema=None):
    """
    Normalize a "from" clause into a queryable container.
    :param frum: index-name string, or Mapping describing the source
    :param schema:
    :return: container, Query, or frum unchanged
    """
    if not type2container:
        _delayed_imports()

    frum = wrap(frum)

    if isinstance(frum, basestring):
        if not config.default.settings:
            Log.error("expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info")
        settings = set_default(
            {"index": split_field(frum)[0], "name": frum,},
            config.default.settings
        )
        settings.type = None  # WE DO NOT WANT TO INFLUENCE THE TYPE BECAUSE NONE IS IN THE frum STRING ANYWAY
        return type2container["elasticsearch"](settings)

    if isinstance(frum, Mapping):
        if frum.type and type2container[frum.type]:
            # TODO: Ensure the frum.name is set, so we capture the deep queries
            if not frum.type:
                Log.error("Expecting from clause to have a 'type' property")
            return type2container[frum.type](frum.settings)
        if frum["from"] or isinstance(frum["from"], (list, set)):
            from pyLibrary.queries.query import Query
            return Query(frum, schema=schema)

    return frum
def setValues(expression, constants):
    """
    Replace whole-word references to constants in an MVEL expression with
    their literal MVEL values.

    :param expression: MVEL source text
    :param constants: list of {"name", "value"} records (copied, then expanded)
    :return: expression with constant names substituted
    """
    if not constants:
        return expression

    constants = constants.copy()

    # EXPAND ALL CONSTANTS TO PRIMITIVE VALUES (MVEL CAN ONLY ACCEPT PRIMITIVE VALUES)
    # (appending while iterating is deliberate: each expanded mapping is itself
    # re-examined, bounded by the depth guard below)
    for c in constants:
        value = c.value
        n = c.name
        if len(split_field(n)) >= 3:
            continue  # DO NOT GO TOO DEEP
        if isinstance(value, list):
            continue  # DO NOT MESS WITH ARRAYS
        if isinstance(value, Mapping):
            for k, v in value.items():
                constants.append({"name": n + "." + k, "value": v})

    for c in reverse(constants):  # REVERSE ORDER, SO LONGER NAMES ARE TESTED FIRST
        s = 0
        while True:
            s = expression.find(c.name, s)
            if s == -1:
                break
            # skip occurrences embedded in a larger identifier (word char on either side);
            # guard s == 0 and end-of-string to avoid wrap-around/IndexError
            if s > 0 and re.match(r"\w", expression[s - 1]):
                s += 1
                continue
            tail = s + len(c.name)
            if tail < len(expression) and re.match(r"\w", expression[tail]):
                s += 1
                continue

            v = value2MVEL(c.value)
            # BUG FIX: the suffix slice was expression[:s + len(c.name)] (the
            # prefix again), which duplicated text on every substitution
            expression = expression[:s] + v + expression[tail:]
            s += len(v)  # resume scanning after the substituted value
    return expression
def __setitem__(self, key, value):
    # A NullType placeholder remembers its parent object and key; extend
    # that path with the new key and delegate the actual assignment.
    assert not isinstance(key, str)
    state = _get(self, "__dict__")
    parent = state["_obj"]
    parent_key = state["__key__"]
    _assign_to_null(parent, [parent_key] + split_field(key), value)
def add_column(self, column):
    """
    ADD COLUMN, IF IT DOES NOT EXIST ALREADY
    """
    # track every (name, type) variant seen for this column name
    if column.name not in self.columns:
        self.columns[column.name] = {column}
    else:
        known_types = [c.type for c in self.columns[column.name]]
        if column.type not in known_types:
            self.columns[column.name].add(column)

    if column.type == "nested":
        # MAKE THE TABLE
        nested_table_name = join_field(split_field(self.name) + split_field(column.name))
        table = Table_usingSQLite(
            nested_table_name,
            self.db,
            self.uid + [UID_PREFIX + "id" + unicode(len(self.uid))],
            exists=False
        )
        self.nested_tables[nested_table_name] = table
    else:
        self.db.execute(
            "ALTER TABLE " + quote_table(self.name) + " ADD COLUMN " + _quote_column(column) + " " + column.type
        )
def _get(v, k, d):
    # Walk the dotted path k into v, returning default d if any step is None.
    for step in split_field(k):
        try:
            v = v.get(step)
        except Exception:
            # v has no .get (a list of dicts): gather the property from each element
            v = [item.get(step) for item in v]
            continue
        if v is None:
            return d
    return v
class Json2Redshift(object):
    """
    Push JSON documents into a Redshift table: create the table from the ES
    mapping, and stage a jsonpaths file on S3 for the COPY command.
    """

    @use_settings
    def __init__(
        self,
        host,
        user,
        password,
        table,
        meta,  # REDSHIFT COPY COMMAND REQUIRES A BUCKET TO HOLD PARAMETERS
        database=None,
        port=5439,
        settings=None
    ):
        self.settings = settings
        self.db = Redshift(settings)
        INDEX_CACHE[settings.table] = wrap({"name": settings.table})  # HACK TO GET parse_columns TO WORK
        columns = parse_columns(settings.table, settings.mapping.test_result.properties)
        nested = [c.name for c in columns if c.type == "nested"]
        # drop object columns and anything living under a nested column
        self.columns = wrap([c for c in columns if c.type not in ["object"] and not any(c.name.startswith(n + ".") for n in nested)])

        try:
            # NOTE(review): exact whitespace of this SQL template could not be
            # recovered from the flattened source — layout below is reconstructed
            self.db.execute("""
                CREATE TABLE {{table_name}} (
                    "_id" character varying UNIQUE,
                    {{columns}}
                )""", {
                "table_name": self.db.quote_column(settings.table),
                "columns": SQL(",\n".join(self.db.quote_column(c.name) + " " + self.db.es_type2pg_type(c.type) for c in self.columns))
            }, retry=False)
        except Exception, e:
            if "already exists" in e:
                Log.alert("Table {{table}} exists in Redshift", table=settings.table)
            else:
                Log.error("Could not make table", e)

        # MAKE jsonpaths FOR COPY COMMAND
        jsonpaths = {"jsonpaths": [
            "$" + "".join("[" + convert.string2quote(p) + "]" for p in split_field(c.name))
            for c in self.columns
        ]}
        content = convert.value2json(jsonpaths)
        # NOTE(review): swaps escaped double-quotes for single-quotes — presumably
        # a Redshift jsonpaths quoting quirk; confirm before changing
        content = content.replace("\\\"", "'")
        # PUSH TO S3
        s3.Bucket(meta).write(meta.jsonspath, content)
def get_document_value(document, column):
    """
    RETURN DOCUMENT VALUE IF MATCHES THE column (name, type)

    :param document: THE DOCUMENT
    :param column: A (name, type) PAIR
    :return: VALUE, IF IT IS THE SAME NAME AND TYPE
    """
    first_step = split_field(column.name)[0]
    candidate = document.get(first_step, None)
    return get_if_type(candidate, column.type)
def _get_schema_from_list(frum, columns, prefix, nested_path, name_to_column):
    """
    SCAN THE LIST FOR COLUMN TYPES

    Recursively inspects each record, merging observed value types into the
    shared `columns` / `name_to_column` maps; nested lists recurse with an
    extended nested_path.
    """
    for d in frum:
        row_type = _type_to_name[d.__class__]
        if row_type != "object":
            # a primitive row: the column is the prefix itself
            full_name = join_field(prefix)
            column = name_to_column.get(full_name)
            if not column:
                column = Column(
                    name=full_name,
                    table=".",
                    es_column=full_name,
                    es_index=".",
                    type="undefined",
                    nested_path=nested_path
                )
                columns[full_name] = column
            column.type = _merge_type[column.type][row_type]
        else:
            for name, value in d.items():
                full_name = join_field(prefix + [name])
                column = name_to_column.get(full_name)
                if not column:
                    column = Column(
                        name=full_name,
                        table=".",
                        es_column=full_name,
                        es_index=".",
                        type="undefined",
                        nested_path=nested_path
                    )
                    columns[full_name] = column
                if isinstance(value, list):
                    if len(value) == 0:
                        this_type = "undefined"
                    elif len(value) == 1:
                        this_type = _type_to_name[value[0].__class__]
                    else:
                        # multi-element list of objects is a nested column
                        this_type = _type_to_name[value[0].__class__]
                        if this_type == "object":
                            this_type = "nested"
                else:
                    this_type = _type_to_name[value.__class__]
                new_type = _merge_type[column.type][this_type]
                column.type = new_type

                if this_type == "object":
                    _get_schema_from_list([value], columns, prefix + [name], nested_path, name_to_column)
                elif this_type == "nested":
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(value, columns, prefix + [name], newpath, name_to_column)
def __getitem__(self, key):
    # Descend one NullType placeholder per path step; ints index a single level.
    if isinstance(key, str):
        key = key.decode("utf8")
    elif isinstance(key, int):
        return NullType(self, key)

    result = self
    for step in split_field(key):
        result = NullType(result, step)
    return result
def _test_mode_wait(query): """ WAIT FOR METADATA TO ARRIVE ON INDEX :param query: dict() OF REQUEST BODY :return: nothing """ try: m = meta.singlton now = Date.now() end_time = now + MINUTE # MARK COLUMNS DIRTY m.meta.columns.update({ "clear": ["partitions", "count", "cardinality", "last_updated"], "where": { "eq": { "table": join_field(split_field(query["from"])[0:1]) } } }) # BE SURE THEY ARE ON THE todo QUEUE FOR RE-EVALUATION cols = [ c for c in m.get_columns(table_name=query["from"], force=True) if c.type not in STRUCT ] for c in cols: Log.note("Mark {{column}} dirty at {{time}}", column=c.name, time=now) c.last_updated = now - TOO_OLD m.todo.push(c) while end_time > now: # GET FRESH VERSIONS cols = [ c for c in m.get_columns(table_name=query["from"]) if c.type not in STRUCT ] for c in cols: if not c.last_updated or c.cardinality == None: Log.note( "wait for column (table={{col.table}}, name={{col.name}}) metadata to arrive", col=c) break else: break Thread.sleep(seconds=1) for c in cols: Log.note( "fresh column name={{column.name}} updated={{column.last_updated|date}} parts={{column.partitions}}", column=c) except Exception, e: Log.warning("could not pickup columns", cause=e)
def __setitem__(self, key, value): try: d = _get(self, "__dict__") o = d["_obj"] path = d["__key__"] if path is None: return # NO NEED TO DO ANYTHING seq = [path] + split_field(key) _assign(o, seq, value) except Exception, e: raise e
def _get_schema_from_list(frum, columns, prefix, nested_path, name_to_column):
    """
    SCAN THE LIST FOR COLUMN TYPES

    Recursively inspects each record, merging observed value types into the
    shared `columns` / `name_to_column` maps; nested lists recurse with an
    extended nested_path.
    """
    for d in frum:
        row_type = _type_to_name[d.__class__]
        if row_type != "object":
            # a primitive row: the column is the prefix itself
            full_name = join_field(prefix)
            column = name_to_column.get(full_name)
            if not column:
                column = Column(name=full_name,
                                table=".",
                                es_column=full_name,
                                es_index=".",
                                type="undefined",
                                nested_path=nested_path)
                columns[full_name] = column
            column.type = _merge_type[column.type][row_type]
        else:
            for name, value in d.items():
                full_name = join_field(prefix + [name])
                column = name_to_column.get(full_name)
                if not column:
                    column = Column(name=full_name,
                                    table=".",
                                    es_column=full_name,
                                    es_index=".",
                                    type="undefined",
                                    nested_path=nested_path)
                    columns[full_name] = column
                if isinstance(value, list):
                    if len(value) == 0:
                        this_type = "undefined"
                    elif len(value) == 1:
                        this_type = _type_to_name[value[0].__class__]
                    else:
                        # multi-element list of objects is a nested column
                        this_type = _type_to_name[value[0].__class__]
                        if this_type == "object":
                            this_type = "nested"
                else:
                    this_type = _type_to_name[value.__class__]
                new_type = _merge_type[column.type][this_type]
                column.type = new_type

                if this_type == "object":
                    _get_schema_from_list([value], columns, prefix + [name], nested_path, name_to_column)
                elif this_type == "nested":
                    np = listwrap(nested_path)
                    newpath = unwraplist(
                        [join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(value, columns, prefix + [name], newpath, name_to_column)
def _decode_object(index, parent_path, path, name2index, destination=None, expected_vars=NO_VARS):
    """
    Generator that parses a JSON object starting at `index`, yielding
    (record, index) pairs.  When the nested property named by `path` is met,
    it iterates that property, yielding one record per nested element.
    """
    if destination is None:
        destination = {}

    nested_done = False
    while True:
        c, index = skip_whitespace(index)
        if c == b',':
            continue
        elif c == b'"':
            name, index = simple_token(index, c)
            c, index = skip_whitespace(index)
            if c != b':':
                Log.error("Expecting colon")
            c, index = skip_whitespace(index)

            child_expected = needed(name, expected_vars)
            if child_expected and nested_done:
                # properties seen after the nested array cannot be attached
                # to records already yielded
                Log.error("Expected property found after nested json. Iteration failed.")

            full_path = join_field(split_field(parent_path) + [name])
            if path and (path[0] == full_path or path[0].startswith(full_path + ".")):
                # THE NESTED PROPERTY WE ARE LOOKING FOR
                if path[0] == full_path:
                    new_path = path[1:]
                else:
                    new_path = path
                nested_done = True
                # index - 1 rewinds to re-read the value's first character
                for j, i in _decode(index - 1, full_path, new_path, name2index, expected_vars=child_expected):
                    index = i
                    j = {name: j}
                    # merge in the sibling properties collected so far
                    for k, v in destination.items():
                        j.setdefault(k, v)
                    yield j, index
                continue

            if child_expected:
                # SOME OTHER PROPERTY
                value, index = _decode_token(index, c, full_path, path, name2index, None, expected_vars=child_expected)
                destination[name] = value
            else:
                # WE DO NOT NEED THIS VALUE
                index = jump_to_end(index, c)
                continue
        elif c == "}":
            break

    if not nested_done:
        # no nested property encountered: the whole object is one record
        yield destination, index
def insert(self, docs):
    """
    Flatten each document into per-nested-path row collections, then INSERT
    each collection into its corresponding (possibly nested) table.
    """
    doc_collection = {}
    for d in docs:
        # ASSIGN A NON-NULL PRIMARY KEY
        if any(v == None for v in self.uid_accessor(d)):
            for u in self.uid:
                d[u] = coalesce(d[u], unique_name())
        uid = wrap({u: d[u] for u in self.uid})
        self.flatten(d, uid, doc_collection)

    for nested_path, insertion in doc_collection.items():
        active_columns = list(insertion.active_columns)
        # values are quoted literals, so rows can be emitted as SELECT ... UNION ALL
        vals = [[quote_value(get_document_value(d, c)) for c in active_columns] for d in insertion.rows]
        command = "INSERT INTO " + quote_table(join_field(split_field(self.name) + split_field(nested_path[0]))) + "(" + \
                  ",".join(_quote_column(c) for c in active_columns) + \
                  ")\n" + \
                  " UNION ALL\n".join("SELECT " + ",".join(vv) for vv in vals)
        self.db.execute(command)
def wrap_from(frum, schema=None):
    """
    Normalize a "from" clause into a queryable container.
    :param frum: index-name string ("meta.*" names internal tables),
                 Mapping describing a source, or a list/set of records
    :param schema:
    :return: container, QueryOp, ListContainer, or frum unchanged
    """
    if not _containers:
        _delayed_imports()

    frum = wrap(frum)

    if isinstance(frum, basestring):
        if not _containers.config.default.settings:
            Log.error(
                "expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info"
            )
        type_ = None
        index = frum
        if frum.startswith("meta."):
            # internal metadata tables are served directly
            if frum == "meta.columns":
                return _meta.singlton.meta.columns
            elif frum == "meta.tables":
                return _meta.singlton.meta.tables
            else:
                Log.error("{{name}} not a recognized table", name=frum)
        else:
            type_ = _containers.config.default.type
            index = join_field(split_field(frum)[:1:])  # first path segment names the index

        settings = set_default({
            "index": index,
            "name": frum
        }, _containers.config.default.settings)
        settings.type = None
        return _containers.type2container[type_](settings)
    elif isinstance(
            frum,
            Mapping) and frum.type and _containers.type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return _containers.type2container[frum.type](frum.settings)
    elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
        from pyLibrary.queries.query import QueryOp
        return QueryOp.wrap(frum, schema=schema)
    elif isinstance(frum, (list, set)):
        return _ListContainer("test_list", frum)
    else:
        return frum
def __delitem__(self, key):
    """
    Remove key (possibly a dotted path) from this dict-like object.
    Missing keys are silently ignored.
    """
    if isinstance(key, str):
        key = key.decode("utf8")  # py2: normalize bytes to unicode

    if "." not in key:
        # SIMPLE KEY: remove directly from the underlying dict
        dict.pop(self, key, None)
        return

    # DOTTED PATH: walk down to the parent container, then drop the leaf
    path = split_field(key)
    parent = self
    for step in path[:-1]:
        parent = parent[step]
    parent.pop(path[-1], None)
def __getitem__(self, key):
    """
    Item access on a null value: slices yield Null, ints yield an indexed
    NullType, and (dotted) string keys yield a chain of NullType wrappers,
    one per path step.
    """
    if isinstance(key, slice):
        return Null
    if isinstance(key, int):
        return NullType(self, key)
    if isinstance(key, str):
        key = key.decode("utf8")  # py2: bytes -> unicode

    # BUILD ONE NullType WRAPPER PER STEP OF THE DOTTED PATH
    result = self
    for step in split_field(key):
        result = NullType(result, step)
    return result
def is_deepop(es, query):
    """
    Decide whether the query must run as a "deep" (nested-document) query.
    """
    # EDGE/GROUPBY QUERIES ARE NEVER DEEP
    if query.edges or query.groupby:
        return False
    # FULLY-AGGREGATED SELECTS ARE NOT DEEP
    if all(s.aggregate not in (None, "none") for s in listwrap(query.select)):
        return False

    # NOTE: both calls happen before the early returns below, preserving any
    # side effects of get_columns()
    used_vars = query_get_all_vars(query)
    frum_columns = query.frum.get_columns()

    # A DOTTED TABLE NAME MEANS WE ARE ALREADY INSIDE A NESTED PATH
    if len(split_field(query.frum.name)) > 1:
        return True
    # ANY REFERENCED COLUMN LIVING ON A NESTED PATH FORCES A DEEP QUERY
    if any(c for c in frum_columns if c.nested_path and c.name in used_vars):
        return True
    return False
def wrap_from(frum, schema=None):
    """
    CONVERT A from CLAUSE INTO A CONTAINER
    :param frum: table name (string), container Mapping, query Mapping, or raw list of data
    :param schema: OPTIONAL SCHEMA, PASSED ALONG WHEN frum IS A QUERY
    :return: A CONTAINER (OR frum UNCHANGED IF ALREADY ONE)
    """
    if not type2container:
        _delayed_imports()

    frum = wrap(frum)

    if isinstance(frum, basestring):
        if not containers.config.default.settings:
            Log.error("expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info")

        type_ = None
        index = frum
        if frum.startswith("meta."):
            # NOTE(review): import appears unused here — possibly kept for its
            # side effects on `meta` — confirm before removing
            from pyLibrary.queries.meta import FromESMetadata

            if frum == "meta.columns":
                return meta.singlton.columns
            elif frum == "meta.table":
                # NOTE(review): a sibling version of this function matches
                # "meta.tables" (plural) — confirm which name callers use
                return meta.singlton.tables
            else:
                Log.error("{{name}} not a recognized table", name=frum)
        else:
            type_ = containers.config.default.type
            # FIRST SEGMENT OF THE DOTTED NAME IS THE INDEX
            index = split_field(frum)[0]

        settings = set_default(
            {
                "index": index,
                "name": frum
            },
            containers.config.default.settings
        )
        settings.type = None
        return type2container[type_](settings)
    elif isinstance(frum, Mapping) and frum.type and type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        # NOTE(review): this branch already requires frum.type truthy, so the
        # check below is unreachable
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return type2container[frum.type](frum.settings)
    elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
        # A QUERY Mapping: WRAP IT (LOCAL IMPORT AVOIDS A CYCLE)
        from pyLibrary.queries.query import Query
        return Query(frum, schema=schema)
    elif isinstance(frum, (list, set)):
        # RAW DATA: EXPOSE AS A LIST CONTAINER
        return _ListContainer("test_list", frum)
    else:
        return frum
def add_column(self, column):
    """
    ADD COLUMN, IF IT DOES NOT EXIST ALREADY.  A "nested" column spawns a
    child table; any other type becomes an ALTER TABLE ... ADD COLUMN.
    """
    known = self.columns
    if column.name not in known:
        known[column.name] = {column}
    else:
        # ONLY RECORD THE COLUMN WHEN THIS type IS NEW FOR THE NAME
        seen_types = [c.type for c in known[column.name]]
        if column.type not in seen_types:
            known[column.name].add(column)

    if column.type != "nested":
        # PLAIN COLUMN: EXTEND THE SQLITE TABLE IN PLACE
        self.db.execute(
            "ALTER TABLE " + quote_table(self.name) +
            " ADD COLUMN " + _quote_column(column) + " " + column.type
        )
    else:
        # NESTED COLUMN: CREATE A CHILD TABLE KEYED BY AN EXTENDED UID
        child_name = join_field(split_field(self.name) + split_field(column.name))
        child_uid = self.uid + [UID_PREFIX + "id" + unicode(len(self.uid))]
        child = Table_usingSQLite(child_name, self.db, child_uid, exists=False)
        self.nested_tables[child_name] = child
def is_deepop(es, query):
    """
    True when the query must run as a deep (nested-document) query:
    no edges/groupby, at least one un-aggregated select, and a dotted
    table name (which marks a nested path).
    """
    if query.edges or query.groupby:
        return False
    selects = listwrap(query.select)
    if all(s.aggregate not in (None, "none") for s in selects):
        return False
    # A MULTI-SEGMENT TABLE NAME MEANS WE ARE INSIDE A NESTED PATH.
    # NOTE: inspecting referenced columns for nested paths was deliberately
    # disabled in this variant (see source history).
    return len(split_field(query.frum.name)) > 1
def insert(self, docs):
    """
    Insert documents: fill any null primary-key parts, flatten each document
    into per-nested-path collections, then run one INSERT per collection.
    """
    collected = {}
    for record in docs:
        # PRIMARY KEY MUST BE FULLY POPULATED
        if any(v == None for v in self.uid_accessor(record)):
            for u in self.uid:
                record[u] = coalesce(record[u], unique_name())
        self.flatten(record, wrap({u: record[u] for u in self.uid}), collected)

    for nested_path, insertion in collected.items():
        cols = list(insertion.active_columns)
        full_name = join_field(split_field(self.name) + split_field(nested_path[0]))
        # EACH ROW BECOMES A SELECT CLAUSE; ROWS ARE UNION'D TOGETHER
        rows_sql = " UNION ALL\n".join(
            "SELECT " + ",".join(quote_value(get_document_value(r, c)) for c in cols)
            for r in insertion.rows
        )
        self.db.execute(
            "INSERT INTO " + quote_table(full_name) +
            "(" + ",".join(_quote_column(c) for c in cols) + ")\n" +
            rows_sql
        )
def _inner(schema, parent_name, indent):
    """
    Recursively render schema as markdown bullet lines, one per property,
    adding one extra space of indent per nesting level.
    """
    lines = []
    for name, prop in schema.items():
        full_name = join_field(split_field(parent_name) + [name])
        line = indent + "* " + _md_code(full_name)
        if prop.type:
            line += " - " + _md_italic(prop.type)
        else:
            Log.error("{{full_name}} is missing type", full_name=full_name)
        if prop.description:
            line += " " + prop.description
        lines.append(line)

        # CONTAINER TYPES RECURSE INTO THEIR CHILD PROPERTIES
        if prop.type in ["object", "array", "nested"]:
            lines.extend(_inner(prop.properties, full_name, indent + " "))
    return lines
def is_deep(query):
    """
    A query is "deep" only when it has a single select whose aggregate maps
    to none/count, more than one edge, and a dotted (nested) table name.
    """
    selects = listwrap(query.select)
    if len(selects) > 1:
        return False
    if aggregates[selects[0].aggregate] not in ("none", "count"):
        return False
    if len(query.edges) <= 1:
        return False
    # LOOKING INTO NESTED WILL REQUIRE A SCRIPT; A SINGLE-SEGMENT NAME IS
    # NOT NESTED, AND A TERM QUERY SERVES BETTER IN THAT CASE
    return len(split_field(query["from"].name)) > 1
def frum(self, fromPath, sourceVar, loopVariablePrefix):
    """
    BUILD NESTED-LOOP MVEL CODE OVER THE GIVEN PATH, AND POPULATE
    self.prefixMap SO LATER TRANSLATIONS CAN MAP COLUMN PATHS TO VARIABLES.

    fromPath            DOTTED PATH; FIRST SEGMENT NAMES THE HIGH LEVEL DOCUMENT
    sourceVar           MVEL VARIABLE HOLDING THE SOURCE DOCUMENT
    loopVariablePrefix  PREFIX FOR GENERATED LOOP VARIABLES
    RETURNS MVEL CODE WITH A REMAINING <CODE> PLACEHOLDER FOR THE LOOP BODY
    """
    # TEMPLATE FOR ONE null-GUARDED for LOOP; <CODE> NESTS THE NEXT LEVEL
    loopCode = "if (<PATH> != null){ for(<VAR> : <PATH>){\n<CODE>\n}}\n"
    self.prefixMap = []
    code = "<CODE>"
    path = split_field(fromPath)

    # ADD LOCAL VARIABLES
    from pyLibrary.queries.es09.util import INDEX_CACHE
    columns = INDEX_CACHE[path[0]].columns
    for i, c in enumerate(columns):
        if c.name == "attachments":
            # NOTE(review): debug hook with empty message — presumably a
            # leftover breakpoint; confirm before removing
            Log.debug("")
        if c.name.find("\\.") >= 0:
            # NAME CONTAINS A LITERAL (ESCAPED) DOT: MUST USE get() ACCESS
            self.prefixMap.insert(
                0, {
                    "path": c.name,
                    "variable": "get(" + sourceVar + ", \"" + c.name.replace("\\.", ".") + "\")"
                })
        else:
            # PLAIN NAME: NULL-SAFE PROPERTY ACCESS
            self.prefixMap.insert(0, {
                "path": c.name,
                "variable": sourceVar + ".?" + c.name
            })

    # ADD LOOP VARIABLES
    # NOTE: insert(0, ...) keeps longer (deeper) paths first, so the most
    # specific prefix is matched first during translation
    currPath = []
    # self.prefixMap.insert(0, {"path": path[0], "variable": path[0]})
    for i, step in enumerate(path[1::]):
        loopVariable = loopVariablePrefix + str(i)
        currPath.append(step)
        pathi = ".".join(currPath)
        shortPath = self._translate(pathi)
        self.prefixMap.insert(0, {"path": pathi, "variable": loopVariable})

        # NEST THIS LEVEL'S LOOP INSIDE THE CODE BUILT SO FAR
        loop = loopCode.replace("<VAR>", loopVariable).replace("<PATH>", shortPath)
        code = code.replace("<CODE>", loop)
    return code
def set(constants): """ REACH INTO THE MODULES AND OBJECTS TO SET CONSTANTS. THINK OF THIS AS PRIMITIVE DEPENDENCY INJECTION FOR MODULES. USEFUL FOR SETTING DEBUG FLAGS. """ if not constants: return constants = wrap(constants) for k, new_value in constants.leaves(): errors = [] try: old_value = dot.set_attr(sys.modules, k, new_value) continue except Exception, e: errors.append(e) # ONE MODULE IS MISSING, THE CALLING MODULE try: caller_globals = sys._getframe(1).f_globals caller_file = caller_globals["__file__"] if not caller_file.endswith(".py"): raise Exception("do not know how to handle non-python caller") caller_module = caller_file[:-3].replace("/", ".") path = split_field(k) for i, p in enumerate(path): if i == 0: continue prefix = join_field(path[:1]) name = join_field(path[i:]) if caller_module.endswith(prefix): old_value = dot.set_attr(caller_globals, name, new_value) if DEBUG: from pyLibrary.debugs.logs import Log Log.note("Changed {{module}}[{{attribute}}] from {{old_value}} to {{new_value}}", module= prefix, attribute= name, old_value= old_value, new_value= new_value) break except Exception, e: errors.append[e]
def __getitem__(self, key):
    """
    Lookup with dotted-path support: "a.b" descends through nested mappings.
    Returns Null for a None key, and a NullType placeholder (which supports
    later assignment) for missing keys.
    """
    if key == None:
        return Null
    if isinstance(key, str):
        key = key.decode("utf8")  # py2: normalize bytes to unicode

    d = self
    if key.find(".") >= 0:
        seq = split_field(key)
        for n in seq:
            # BUG FIX: descend from d, not self, so each step walks one
            # level deeper (was _getdefault(self, n), which re-read the root)
            d = _getdefault(d, n)
        return wrap(d)
    else:
        # BUG FIX: look up the requested key (was dict.get(d, None), which
        # looked up the key None and ignored `key` entirely)
        o = dict.get(d, key)
        if o == None:
            return NullType(d, key)
        return wrap(o)
def es_query_template(path):
    """
    RETURN TEMPLATE AND PATH-TO-FILTER AS A 2-TUPLE
    :param path: dotted table path; segments after the first form the nested path
    :return: (query template, wrapped list of empty filter placeholders to fill in)
    """
    sub_path = split_field(path)[1:]

    if not sub_path:
        # SHALLOW QUERY: SINGLE FILTER SLOT INSIDE A filtered QUERY
        shallow_filter = {}
        template = wrap({
            "query": {"filtered": {"filter": shallow_filter}},
            "from": 0,
            "size": 0,
            "sort": []
        })
        return template, wrap([shallow_filter])

    # NESTED QUERY: ONE OUTER SLOT, PLUS ONE SLOT INSIDE THE nested CLAUSE
    outer_filter = {}
    inner_filter = {}
    template = wrap({
        "filter": {"and": [
            outer_filter,
            {"nested": {
                "path": join_field(sub_path),
                "filter": inner_filter,
                "inner_hits": {"size": 100000}
            }}
        ]},
        "from": 0,
        "size": 0,
        "sort": []
    })
    return template, wrap([outer_filter, inner_filter])