def __exit__(self, a, b, c):
    if self.waiting:
        if DEBUG:
            _Log.note("signaling {{num}} waiters", num=len(self.waiting))
        waiter = self.waiting.pop()
        waiter.go()
    self.lock.release()
def insert_list(self, table_name, records):
    if not records:
        return

    columns = set()
    for r in records:
        columns |= set(r.keys())
    columns = jx.sort(columns)

    try:
        self.execute(
            "DELETE FROM " + self.quote_column(table_name) + " WHERE _id IN {{ids}}",
            {"ids": self.quote_column([r["_id"] for r in records])}
        )

        command = (
            "INSERT INTO " + self.quote_column(table_name) +
            "(" + ",".join([self.quote_column(k) for k in columns]) + ") VALUES " +
            ",\n".join([
                sql_iso(",".join([self.quote_value(r.get(k, None)) for k in columns]))
                for r in records
            ])
        )
        self.execute(command)
    except Exception as e:
        Log.error("problem with insert", e)
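For orientation, here is the shape of the SQL this builds; the backtick quoting and the NULL rendering are assumptions about what quote_column/quote_value produce:

# records = [{"_id": 1, "a": "x"}, {"_id": 2, "b": "y"}] yields roughly:
#
#   DELETE FROM `t` WHERE _id IN (1, 2)
#   INSERT INTO `t`(`_id`,`a`,`b`) VALUES
#   (1,'x',NULL),
#   (2,NULL,'y')
#
# columns is the union of keys over all records, so missing properties
# are filled with quote_value(None)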
def jx_sort_to_es_sort(sort, schema):
    if not sort:
        return []

    output = []
    for s in sort:
        if isinstance(s.value, Variable):
            cols = schema.leaves(s.value.var)
            if s.sort == -1:
                types = OBJECT, STRING, NUMBER, BOOLEAN
            else:
                types = BOOLEAN, NUMBER, STRING, OBJECT

            for type in types:
                for c in cols:
                    if c.jx_type == type:
                        if s.sort == -1:
                            output.append({c.es_column: "desc"})
                        else:
                            output.append(c.es_column)
        else:
            from mo_logs import Log

            Log.error("do not know how to handle")
    return output
def _normalize_groupby(groupby, limit, schema=None):
    if groupby == None:
        return None
    output = wrap([
        n
        for ie, e in enumerate(listwrap(groupby))
        for n in _normalize_group(e, ie, limit, schema=schema)
    ])
    if any(o == None for o in output):
        Log.error("not expected")
    return output
def execute(
    self,
    command,
    param=None,
    retry=True     # IF command FAILS, JUST THROW ERROR
):
    if param:
        command = expand_template(command, self.quote_param(param))

    output = None
    done = False
    while not done:
        try:
            with self.locker:
                if not self.connection:
                    self._connect()
                with Closer(self.connection.cursor()) as curs:
                    curs.execute(command)
                    if curs.rowcount >= 0:
                        output = curs.fetchall()
                self.connection.commit()
            done = True
        except Exception as e:
            with suppress_exception:
                self.connection.rollback()
            # TODO: FIGURE OUT WHY rollback() DOES NOT HELP
            self.connection.close()
            self.connection = None
            self._connect()
            if not retry:
                Log.error("Problem with command:\n{{command|indent}}", command=command, cause=e)
    return output
def Stats2ZeroMoment(stats):
    # MODIFIED FROM http://statsmodels.sourceforge.net/devel/_modules/statsmodels/stats/moment_helpers.html
    # ADDED count
    mc0, mc1, mc2, skew, kurt = (
        stats.count,
        coalesce(stats.mean, 0),
        coalesce(stats.variance, 0),
        coalesce(stats.skew, 0),
        coalesce(stats.kurtosis, 0)
    )

    mz0 = mc0
    mz1 = mc1 * mc0
    mz2 = (mc2 + mc1 * mc1) * mc0
    mc3 = coalesce(skew, 0) * (mc2 ** 1.5)  # 3rd central moment
    mz3 = (mc3 + 3 * mc1 * mc2 + mc1 ** 3) * mc0  # 3rd non-central moment
    mc4 = (coalesce(kurt, 0) + 3.0) * (mc2 ** 2.0)  # 4th central moment
    mz4 = (mc4 + 4 * mc1 * mc3 + 6 * mc1 * mc1 * mc2 + mc1 ** 4) * mc0

    m = ZeroMoment(mz0, mz1, mz2, mz3, mz4)
    if DEBUG:
        from mo_testing.fuzzytestcase import assertAlmostEqualValue

        globals()["DEBUG"] = False
        try:
            v = ZeroMoment2Stats(m)
            assertAlmostEqualValue(v.count, stats.count, places=10)
            assertAlmostEqualValue(v.mean, stats.mean, places=10)
            assertAlmostEqualValue(v.variance, stats.variance, places=10)
            assertAlmostEqualValue(v.skew, stats.skew, places=10)
            assertAlmostEqualValue(v.kurtosis, stats.kurtosis, places=10)
        except Exception as e:
            v = ZeroMoment2Stats(m)
            Log.error("programmer error")
        globals()["DEBUG"] = True
    return m
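A quick numeric check of the central-to-raw conversion above (assuming `stats` carries population moments and `ZeroMoment` stores raw power sums S_k = sum(x**k)):

# For x = [1, 2, 3]: count=3, mean=2, variance=2/3, skew=0, so
#   mz1 = 2 * 3                       = 6   == 1 + 2 + 3
#   mz2 = (2/3 + 2*2) * 3             = 14  == 1**2 + 2**2 + 3**2
#   mz3 = (0 + 3*2*(2/3) + 2**3) * 3  = 36  == 1**3 + 2**3 + 3**3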
def problem_serializing(value, e=None):
    """
    THROW ERROR ABOUT SERIALIZING
    """
    from mo_logs import Log

    try:
        typename = type(value).__name__
    except Exception:
        typename = "<error getting name>"

    try:
        rep = text_type(repr(value))
    except Exception as _:
        rep = None

    if rep == None:
        Log.error(
            "Problem turning value of type {{type}} to json",
            type=typename,
            cause=e
        )
    else:
        Log.error(
            "Problem turning value ({{value}}) of type {{type}} to json",
            value=rep,
            type=typename,
            cause=e
        )
def output(*args, **kwargs):
    if len(args):
        if len(kwargs.keys()):
            Log.error("Not allowed to use both args and kwargs")
        return self._execute({item: args})
    else:
        return self._execute({item: kwargs})
def _dict2json(value, sub_schema, path, net_new_properties, buffer):
    prefix = '{'
    for k, v in sort_using_key(value.items(), lambda r: r[0]):
        if v == None or v == '':
            continue
        append(buffer, prefix)
        prefix = COMMA
        if is_binary(k):
            k = utf82unicode(k)
        if not is_text(k):
            Log.error("Expecting property name to be a string")
        if k not in sub_schema:
            sub_schema[k] = {}
            net_new_properties.append(path + [k])
        append(buffer, encode_basestring(encode_property(k)))
        append(buffer, COLON)
        typed_encode(v, sub_schema[k], path + [k], net_new_properties, buffer)
    if prefix is COMMA:
        append(buffer, COMMA)
        append(buffer, QUOTED_EXISTS_TYPE)
        append(buffer, '1}')
    else:
        append(buffer, '{')
        append(buffer, QUOTED_EXISTS_TYPE)
        append(buffer, '1}')
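For reference, the output shape; the ~n~/~e~ marker strings standing in for the typed keys and QUOTED_EXISTS_TYPE are an assumption:

# _dict2json({"a": 1}, {}, [], [], buffer) appends something like
#   {"a":{"~n~":1},"~e~":1}
# i.e. each property is wrapped with its type marker, and the trailing
# EXISTS marker records that the enclosing object itself was present.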
def __init__(self, name, params, cwd=None, env=None, debug=False, shell=False, bufsize=-1):
    self.name = name
    self.service_stopped = Signal("stopped signal for " + strings.quote(name))
    self.stdin = Queue("stdin for process " + strings.quote(name), silent=True)
    self.stdout = Queue("stdout for process " + strings.quote(name), silent=True)
    self.stderr = Queue("stderr for process " + strings.quote(name), silent=True)

    try:
        self.debug = debug or DEBUG
        self.service = service = subprocess.Popen(
            params,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=bufsize,
            cwd=cwd if isinstance(cwd, (basestring, NullType, NoneType)) else cwd.abspath,
            env=unwrap(set_default(env, os.environ)),
            shell=shell
        )

        self.please_stop = Signal()
        self.please_stop.on_go(self._kill)
        self.thread_locker = Lock()
        self.children = [
            Thread.run(self.name + " stdin", self._writer, service.stdin, self.stdin, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " stdout", self._reader, "stdout", service.stdout, self.stdout, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " stderr", self._reader, "stderr", service.stderr, self.stderr, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " waiter", self._monitor, parent_thread=self),
        ]
    except Exception as e:
        Log.error("Can not call", e)

    if self.debug:
        Log.note("{{process}} START: {{command}}", process=self.name, command=" ".join(map(strings.quote, params)))
def _worker(start):
    output = SchemaTree()
    root = parquet_schema_list[off.set]

    output.element = root
    max = start + coalesce(root.num_children, 0)

    if off.set == 0:
        if root.name not in ['.', 'schema', 'spark_schema', 'm', 'hive_schema', 'root']:  # some known root names
            Log.warning("first SchemaElement is given name {{name|quote}}, name is ignored", name=root.name)
        root.name = '.'
        root.repetition_type = REQUIRED

    while off.set < max:
        off.set += 1
        child = _worker(off.set)
        parent = output
        path = relative_field(child.element.name, root.name)

        # path = split_field(relative_field(child.element.name, root.name))
        # for i, p in enumerate(path[:-1]):
        #     new_parent = parent.more[p] = SchemaTree()
        #     new_parent.element = SchemaElement(
        #         name=concat_field(root.name, join_field(path[:i+1])),
        #         repetition_type=REQUIRED
        #     )
        #     parent = new_parent
        # parent.more[path[-1]] = child
        parent.more[path] = child
    return output
def parse_field(fieldname, data, depth):
    """
    RETURN (first, rest) OF fieldname
    """
    col = split_field(fieldname)
    d = data
    for i, c in enumerate(col):
        try:
            d = d[c]
        except Exception as e:
            Log.error("{{name}} does not exist", name=fieldname)
        if is_list(d) and len(col) > 1:
            if len(primary_column) <= depth + i:
                primary_nested.append(True)
                primary_column.append(c)
                primary_branch.append(d)
            elif primary_nested[depth] and primary_column[depth + i] != c:
                Log.error("only one branch of tree allowed")
            else:
                primary_nested[depth + i] = True
                primary_column[depth + i] = c
                primary_branch[depth + i] = d

            return c, join_field(col[i + 1:])
        else:
            if len(primary_column) <= depth + i:
                primary_nested.append(False)
                primary_column.append(c)
                primary_branch.append([d])

    return fieldname, None
def _reader(self, name, pipe, receive, please_stop):
    try:
        line = "dummy"
        while not please_stop and self.service.returncode is None and line:
            line = pipe.readline().rstrip()
            if line:
                receive.add(line)
                if self.debug:
                    Log.note("{{process}} ({{name}}): {{line}}", name=name, process=self.name, line=line)

        # GRAB A FEW MORE LINES
        max = 100
        while max:
            try:
                line = pipe.readline().rstrip()
                if line:
                    max = 100
                    receive.add(line)
                    if self.debug:
                        Log.note("{{process}} ({{name}}): {{line}}", name=name, process=self.name, line=line)
                else:
                    max -= 1
            except Exception:
                break
    finally:
        pipe.close()
        receive.add(THREAD_STOP)
def _select_deep(v, field, depth, record):
    """
    field = {"name":name, "value":["attribute", "path"]}
    r[field.name]=v[field.value], BUT WE MUST DEAL WITH POSSIBLE LIST IN field.value PATH
    """
    if hasattr(field.value, "__call__"):
        try:
            record[field.name] = field.value(wrap(v))
        except Exception as e:
            record[field.name] = None
        return 0, None

    for i, f in enumerate(field.value[depth:len(field.value) - 1:]):
        v = v.get(f)
        if v is None:
            return 0, None
        if is_list(v):
            return depth + i + 1, v

    f = field.value.last()
    try:
        if not f:  # NO NAME FIELD INDICATES SELECT VALUE
            record[field.name] = v
        else:
            record[field.name] = v.get(f)
    except Exception as e:
        Log.error(
            "{{value}} does not have {{field}} property",
            value=v,
            field=f,
            cause=e
        )
    return 0, None
def filter(data, where):
    """
    where  - a function that accepts (record, rownum, rows) and returns boolean
    """
    if len(data) == 0 or where == None or where == TRUE:
        return data

    if isinstance(data, Container):
        return data.filter(where)

    if is_container(data):
        temp = jx_expression_to_function(where)
        dd = wrap(data)
        return wrap([unwrap(d) for i, d in enumerate(data) if temp(wrap(d), i, dd)])
    else:
        Log.error(
            "Do not know how to handle type {{type}}",
            type=data.__class__.__name__
        )

    try:
        return drill_filter(where, data)
    except Exception as _:
        # WOW! THIS IS INEFFICIENT!
        return wrap(
            [unwrap(d) for d in drill_filter(where, [DataObject(d) for d in data])]
        )
def tuple(data, field_name):
    """
    RETURN LIST OF TUPLES
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")

    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if is_data(field_name) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        if len(split_field(field_name)) == 1:
            return [(d[field_name],) for d in data]
        else:
            path = split_field(field_name)
            output = []
            flat_list._tuple1(data, path, 0, output)
            return output
    elif is_list(field_name):
        paths = [_select_a_field(f) for f in field_name]
        output = FlatList()
        _tuple((), unwrap(data), paths, 0, output)
        return output
    else:
        paths = [_select_a_field(field_name)]
        output = FlatList()
        _tuple((), data, paths, 0, output)
        return output
def _select(template, data, fields, depth):
    output = FlatList()
    deep_path = []
    deep_fields = UniqueIndex(["name"])
    for d in data:
        if d.__class__ is Data:
            Log.error("programmer error, _select can not handle Data, only dict")

        record = template.copy()
        children = None
        for f in fields:
            index, c = _select_deep(d, f, depth, record)
            children = c if children is None else children
            if index:
                path = f.value[0:index:]
                if not deep_fields[f]:
                    deep_fields.add(f)  # KEEP TRACK OF WHICH FIELDS NEED DEEPER SELECT
                short = MIN([len(deep_path), len(path)])
                if path[:short:] != deep_path[:short:]:
                    Log.error("Dangerous to select into more than one branch at time")
                if len(deep_path) < len(path):
                    deep_path = path

        if not children:
            output.append(record)
        else:
            output.extend(_select(record, children, deep_fields, depth + 1))

    return output
def __exit__(self, exc_type, exc_val, exc_tb):
    if self.debug:
        try:
            gc.collect()
            end_memory = self.process.memory_info().rss
            net_memory = end_memory - self.start_memory
            if net_memory > 100 * 1000 * 1000:
                Log.warning(
                    "MEMORY WARNING (additional {{net_memory|comma}}bytes): " + self.description,
                    default_params=self.params,
                    net_memory=net_memory
                )

                from pympler import summary
                from pympler import muppy

                sum1 = sorted(summary.summarize(muppy.get_objects()), key=lambda r: -r[2])[:30]
                Log.warning("{{data}}", data=sum1)
            elif end_memory > 1000 * 1000 * 1000:
                Log.warning(
                    "MEMORY WARNING (over {{end_memory|comma}}bytes): " + self.description,
                    default_params=self.params,
                    end_memory=end_memory
                )

                from pympler import summary
                from pympler import muppy

                sum1 = sorted(summary.summarize(muppy.get_objects()), key=lambda r: -r[2])[:30]
                Log.warning("{{data}}", data=sum1)
        except Exception as e:
            Log.warning("problem in memory measure", cause=e)
def write_profiles(main_thread_profile):
    if cprofiler_stats is None:
        return

    from pyLibrary import convert
    from mo_files import File

    cprofiler_stats.add(pstats.Stats(main_thread_profile.cprofiler))
    stats = cprofiler_stats.pop_all()

    Log.note("aggregating {{num}} profile stats", num=len(stats))
    acc = stats[0]
    for s in stats[1:]:
        acc.add(s)

    stats = [
        {
            "num_calls": d[1],
            "self_time": d[2],
            "total_time": d[3],
            "self_time_per_call": d[2] / d[1],
            "total_time_per_call": d[3] / d[1],
            "file": (f[0] if f[0] != "~" else "").replace("\\", "/"),
            "line": f[1],
            "method": f[2].lstrip("<").rstrip(">")
        }
        for f, d in iteritems(acc.stats)
    ]
    stats_file = File(FILENAME, suffix=convert.datetime2string(datetime.now(), "_%Y%m%d_%H%M%S"))
    stats_file.write(convert.list2tab(stats))
    Log.note("profile written to {{filename}}", filename=stats_file.abspath)
def _decode(index, parent_path, path, name2index, expected_vars=NO_VARS):
    c, index = skip_whitespace(index)

    if not path:
        if c != b"[":
            # TREAT VALUE AS SINGLE-VALUE ARRAY
            yield _decode_token(index, c, parent_path, path, name2index, None, expected_vars)
        else:
            c, index = skip_whitespace(index)
            if c == b']':
                return  # EMPTY ARRAY

            while True:
                value, index = _decode_token(index, c, parent_path, path, name2index, None, expected_vars)
                c, index = skip_whitespace(index)
                if c == b']':
                    yield value, index
                    return
                elif c == b',':
                    c, index = skip_whitespace(index)
                    yield value, index
    else:
        if c != b'{':
            Log.error("Expecting all objects to at least have {{path}}", path=path[0])

        for j, i in _decode_object(index, parent_path, path, name2index, expected_vars=expected_vars):
            yield j, i
def _decode_token(index, c, full_path, path, name2index, destination, expected_vars):
    if c == b'{':
        if not expected_vars:
            index = jump_to_end(index, c)
            value = None
        elif expected_vars[0] == ".":
            json.mark(index - 1)
            index = jump_to_end(index, c)
            value = json_decoder(json.release(index).decode("utf8"))
        else:
            count = 0
            for v, i in _decode_object(index, full_path, path, name2index, destination, expected_vars=expected_vars):
                index = i
                value = v
                count += 1
            if count != 1:
                Log.error("Expecting object, nothing nested")
    elif c == b'[':
        if not expected_vars:
            index = jump_to_end(index, c)
            value = None
        else:
            json.mark(index - 1)
            index = jump_to_end(index, c)
            value = json_decoder(json.release(index).decode("utf8"))
    else:
        if expected_vars and expected_vars[0] == ".":
            value, index = simple_token(index, c)
        else:
            index = jump_to_end(index, c)
            value = None

    return value, index
def __init__(self, logger):
    if not isinstance(logger, StructuredLogger):
        Log.error("Expecting a StructuredLogger")

    self.queue = Queue("Queue for " + self.__class__.__name__, max=10000, silent=True, allow_add_after_close=True)
    self.logger = logger

    def worker(logger, please_stop):
        try:
            while not please_stop:
                logs = self.queue.pop_all()
                if not logs:
                    (Till(seconds=1) | please_stop).wait()
                    continue
                for log in logs:
                    if log is THREAD_STOP:
                        please_stop.go()
                    else:
                        logger.write(**log)
        except Exception as e:
            print("problem in " + StructuredLogger_usingThread.__name__ + ": " + str(e))
        finally:
            Log.note("stop the child")
            logger.stop()

    self.thread = Thread("Thread for " + self.__class__.__name__, worker, logger)
    self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
    self.thread.start()
def select(self, selectList, fromPath, varName, sourceVar):
    path = split_field(fromPath)
    is_deep = len(path) > 1
    heads = []
    list = []
    for s in selectList:
        if is_deep:
            if s.value and is_variable_name(s.value):
                shortForm = self._translate(s.value)
                list.append("Value2Pipe(" + shortForm + ")\n")
            else:
                Log.error("do not know how to handle yet")
        else:
            if s.value and is_variable_name(s.value):
                list.append("Value2Pipe(getDocValue(" + value2MVEL(s.value) + "))\n")
            elif s.value:
                shortForm = self._translate(s.value)
                list.append("Value2Pipe(" + shortForm + ")\n")
            else:
                code, decode = self.Parts2Term(s.domain)
                heads.append(code.head)
                list.append("Value2Pipe(" + code.body + ")\n")

    if len(split_field(fromPath)) > 1:
        output = 'if (' + varName + ' != "") ' + varName + '+="|";\n' + varName + '+=' + '+"|"+'.join(["Value2Pipe(" + v + ")\n" for v in list]) + ';\n'
    else:
        output = varName + ' = ' + '+"|"+'.join(["Value2Pipe(" + v + ")\n" for v in list]) + ';\n'

    return Data(
        head="".join(heads),
        body=output
    )
def _expand(template, seq):
    """
    seq IS TUPLE OF OBJECTS IN PATH ORDER INTO THE DATA TREE
    """
    if is_text(template):
        return _simple_expand(template, seq)
    elif is_data(template):
        # EXPAND LISTS OF ITEMS USING THIS FORM
        # {"from":from, "template":template, "separator":separator}
        template = wrap(template)
        assert template["from"], "Expecting template to have 'from' attribute"
        assert template.template, "Expecting template to have 'template' attribute"

        data = seq[-1][template["from"]]
        output = []
        for d in data:
            s = seq + (d,)
            output.append(_expand(template.template, s))
        return coalesce(template.separator, "").join(output)
    elif is_list(template):
        return "".join(_expand(t, seq) for t in template)
    else:
        if not _Log:
            _late_import()

        _Log.error("can not handle")
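A sketch of the dict-form template handled above (assuming _simple_expand performs {{name}}-style substitution against the last element of seq):

# _expand(
#     {"from": "items", "template": "{{name}}", "separator": ", "},
#     (wrap({"items": [{"name": "a"}, {"name": "b"}]}),)
# )
# -> "a, b": each element of seq[-1]["items"] is pushed onto seq, the inner
#    template is expanded against it, and the results are joined.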
def utf82unicode(value):
    """
    WITH EXPLANATION FOR FAILURE
    """
    try:
        return value.decode("utf8")
    except Exception as e:
        if not _Log:
            _late_import()

        if not is_binary(value):
            _Log.error("Can not convert {{type}} to unicode because it's not bytes", type=type(value).__name__)

        e = _Except.wrap(e)
        for i, c in enumerate(value):
            try:
                c.decode("utf8")
            except Exception as f:
                _Log.error(
                    "Can not convert charcode {{c}} in string index {{i}}",
                    i=i,
                    c=ord(c),
                    cause=[e, _Except.wrap(f)]
                )

        try:
            latin1 = text_type(value.decode("latin1"))
            _Log.error("Can not explain conversion failure, but seems to be latin1", e)
        except Exception:
            pass

        _Log.error("Can not explain conversion failure of " + type(value).__name__ + "!", e)
def simple_token(index, c):
    if c == b'"':
        json.mark(index - 1)
        while True:
            c = json[index]
            index += 1
            if c == b"\\":
                index += 1
            elif c == b'"':
                break
        return json_decoder(json.release(index).decode("utf8")), index
    elif c in b"{[":
        json.mark(index - 1)
        index = jump_to_end(index, c)
        value = wrap(json_decoder(json.release(index).decode("utf8")))
        return value, index
    elif c == b"t" and json.slice(index, index + 3) == b"rue":
        return True, index + 3
    elif c == b"n" and json.slice(index, index + 3) == b"ull":
        return None, index + 3
    elif c == b"f" and json.slice(index, index + 4) == b"alse":
        return False, index + 4
    else:
        json.mark(index - 1)
        while True:
            c = json[index]
            if c in b',]}':
                break
            index += 1
        text = json.release(index)
        try:
            return float(text), index
        except Exception:
            Log.error("Not a known JSON primitive: {{text|quote}}", text=text)
def __getitem__(self, index):
    offset = index - self.start
    if offset < len(self.buffer):
        return self.buffer[offset:offset + 1]
    if offset < 0:
        Log.error("Can not go in reverse on stream index=={{index}} (offset={{offset}})", index=index, offset=offset)

    if self._mark == -1:
        self.start += self.buffer_length
        offset = index - self.start
        self.buffer = self.get_more()
        self.buffer_length = len(self.buffer)
        while self.buffer_length <= offset:
            more = self.get_more()
            self.buffer += more
            self.buffer_length = len(self.buffer)
        return self.buffer[offset:offset + 1]

    needless_bytes = self._mark - self.start
    if needless_bytes:
        self.start = self._mark
        offset = index - self.start
        self.buffer = self.buffer[needless_bytes:]
        self.buffer_length = len(self.buffer)

    while self.buffer_length <= offset:
        more = self.get_more()
        self.buffer += more
        self.buffer_length = len(self.buffer)

    try:
        return self.buffer[offset:offset + 1]
    except Exception as e:
        Log.error("error", cause=e)
def replacePrefix(value, prefix, new_prefix):
    try:
        if value.startswith(prefix):
            return new_prefix + value[len(prefix)::]
        return value
    except Exception as e:
        Log.error("can not replace prefix", e)
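Example behavior:

# replacePrefix("mozilla-central", "mozilla-", "mc-")  -> "mc-central"
# replacePrefix("autoland", "mozilla-", "mc-")         -> "autoland" (unchanged)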
def _decode_object_items(index, c, parent_path, query_path, expected_vars):
    """
    ITERATE THROUGH THE PROPERTIES OF AN OBJECT
    """
    c, index = skip_whitespace(index)
    num_items = 0
    while True:
        if c == b',':
            c, index = skip_whitespace(index)
        elif c == b'"':
            name, index = simple_token(index, c)
            if "name" in expected_vars:
                for i, e in enumerate(expected_vars):
                    if e == "name":
                        destination[i] = name

            c, index = skip_whitespace(index)
            if c != b':':
                Log.error("Expecting colon")
            c, index = skip_whitespace(index)

            child_expected = needed("value", expected_vars)
            index = _assign_token(index, c, child_expected)
            c, index = skip_whitespace(index)
            DEBUG and not num_items % 1000 and Log.note("{{num}} items iterated", num=num_items)
            yield index
            num_items += 1
        elif c == b"}":
            break
def simple_token(index, c):
    if c == b'"':
        json.mark(index - 1)
        while True:
            c = json[index]
            index += 1
            if c == b"\\":
                index += 1
            elif c == b'"':
                break
        return json_decoder(json.release(index).decode("utf8")), index
    elif c in b"{[":
        Log.error("Expecting a primitive value")
    elif c == b"t" and json.slice(index, index + 3) == b"rue":
        return True, index + 3
    elif c == b"n" and json.slice(index, index + 3) == b"ull":
        return None, index + 3
    elif c == b"f" and json.slice(index, index + 4) == b"alse":
        return False, index + 4
    else:
        json.mark(index - 1)
        while True:
            c = json[index]
            if c in b',]}':
                break
            index += 1
        return float(json.release(index)), index
def _value2json(value, _buffer):
    try:
        _class = value.__class__
        if value is None:
            append(_buffer, u"null")
            return
        elif value is True:
            append(_buffer, u"true")
            return
        elif value is False:
            append(_buffer, u"false")
            return

        type = value.__class__
        if type is binary_type:
            append(_buffer, QUOTE)
            try:
                v = utf82unicode(value)
            except Exception as e:
                problem_serializing(value, e)
            for c in v:
                append(_buffer, ESCAPE_DCT.get(c, c))
            append(_buffer, QUOTE)
        elif type is text_type:
            append(_buffer, QUOTE)
            for c in value:
                append(_buffer, ESCAPE_DCT.get(c, c))
            append(_buffer, QUOTE)
        elif type is dict:
            if not value:
                append(_buffer, u"{}")
            else:
                _dict2json(value, _buffer)
            return
        elif type is Data:
            d = _get(value, SLOT)  # MIGHT BE A VALUE NOT A DICT
            _value2json(d, _buffer)
            return
        elif type in (int, long, Decimal):
            append(_buffer, text_type(value))
        elif type is float:
            if math.isnan(value) or math.isinf(value):
                append(_buffer, u'null')
            else:
                append(_buffer, float2json(value))
        elif type in (set, list, tuple, FlatList):
            _list2json(value, _buffer)
        elif type is date:
            append(_buffer, float2json(time.mktime(value.timetuple())))
        elif type is datetime:
            append(_buffer, float2json(time.mktime(value.timetuple())))
        elif type is Date:
            append(_buffer, float2json(value.unix))
        elif type is timedelta:
            append(_buffer, float2json(value.total_seconds()))
        elif type is Duration:
            append(_buffer, float2json(value.seconds))
        elif type is NullType:
            append(_buffer, u"null")
        elif is_data(value):
            if not value:
                append(_buffer, u"{}")
            else:
                _dict2json(value, _buffer)
            return
        elif hasattr(value, '__data__'):
            d = value.__data__()
            _value2json(d, _buffer)
        elif hasattr(value, '__json__'):
            j = value.__json__()
            append(_buffer, j)
        elif hasattr(value, '__iter__'):
            _iter2json(value, _buffer)
        else:
            from mo_logs import Log

            Log.error(text_type(repr(value)) + " is not JSON serializable")
    except Exception as e:
        from mo_logs import Log

        Log.error(text_type(repr(value)) + " is not JSON serializable", cause=e)
def get_table(self, table_name):
    if table_name != META_COLUMNS_NAME:
        Log.error("this container has only the " + META_COLUMNS_NAME)
    return self
def update(self, command):
    self.dirty = True
    try:
        command = wrap(command)
        DEBUG and Log.note(
            "Update {{timestamp}}: {{command|json}}",
            command=command,
            timestamp=Date(command["set"].last_updated),
        )
        eq = command.where.eq
        if eq.es_index:
            if len(eq) == 1:
                if unwraplist(command.clear) == ".":
                    d = self.data
                    i = eq.es_index
                    with self.locker:
                        cols = d[i]
                        del d[i]

                    for c in cols:
                        mark_as_deleted(c)
                        self.todo.add(c)
                    return

                # FASTEST
                all_columns = self.data.get(eq.es_index, {}).values()
                with self.locker:
                    columns = [c for cs in all_columns for c in cs]
            elif eq.es_column and len(eq) == 2:
                # FASTER
                all_columns = self.data.get(eq.es_index, {}).values()
                with self.locker:
                    columns = [
                        c
                        for cs in all_columns
                        for c in cs
                        if c.es_column == eq.es_column
                    ]
            else:
                # SLOWER
                all_columns = self.data.get(eq.es_index, {}).values()
                with self.locker:
                    columns = [
                        c
                        for cs in all_columns
                        for c in cs
                        if all(c[k] == v for k, v in eq.items())  # THIS LINE IS VERY SLOW
                    ]
        else:
            columns = list(self)
            columns = jx.filter(columns, command.where)

        with self.locker:
            for col in columns:
                DEBUG and Log.note(
                    "update column {{table}}.{{column}}",
                    table=col.es_index,
                    column=col.es_column,
                )
                for k in command["clear"]:
                    if k == ".":
                        mark_as_deleted(col)
                        self.todo.add(col)
                        lst = self.data[col.es_index]
                        cols = lst[col.name]
                        cols.remove(col)
                        if len(cols) == 0:
                            del lst[col.name]
                            if len(lst) == 0:
                                del self.data[col.es_index]
                        break
                    else:
                        col[k] = None
                else:
                    # DID NOT DELETE COLUMN ("."), CONTINUE TO SET PROPERTIES
                    for k, v in command.set.items():
                        col[k] = v
                    self.todo.add(col)
    except Exception as e:
        Log.error("should not happen", cause=e)
def __truediv__(self, other):
    if not isinstance(other, text_type):
        Log.error(u"Expecting text path")
    output = self.__copy__()
    output.path = output.path.rstrip('/') + "/" + other.lstrip('/')
    return output
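Usage sketch, assuming a File-like class carrying a `path` attribute:

# File("results/raw") / "out.json"  -> copy whose path is "results/raw/out.json"
# The trailing "/" of the left side and the leading "/" of the right side are
# stripped before joining, so doubled slashes can not appear at the seam.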
def _get_single_branch_from_hg(settings, description, dir):
    if dir == "users":
        return []
    response = http.get(settings.url + "/" + dir)
    doc = BeautifulSoup(response.all_content, "html.parser")

    output = []
    try:
        all_branches = doc("table")[0]
    except Exception:
        return []

    for i, b in enumerate(all_branches("tr")):
        if i == 0:
            continue  # IGNORE HEADER
        columns = b("td")

        try:
            path = columns[0].a.get('href')
            if path == "/":
                continue

            name, desc, last_used = [c.text.strip() for c in columns][0:3]

            if last_used.startswith('at'):
                last_used = last_used[2:]

            detail = Data(
                name=name.lower(),
                locale=DEFAULT_LOCALE,
                parent_name=description,
                url=settings.url + path,
                description=desc,
                last_used=Date(last_used),
                etl={"timestamp": Date.now()}
            )
            if detail.description == "unknown":
                detail.description = None

            # SOME BRANCHES HAVE NAME COLLISIONS, IGNORE LEAST POPULAR
            if path in [
                "/projects/dxr/",                   # moved to webtools
                "/build/compare-locales/",          # ?build team likes to clone?
                "/build/puppet/",                   # ?build team likes to clone?
                "/SeaMonkey/puppet/",               # loses the popularity contest
                "/releases/gaia-l10n/v1_2/en-US/",  # use default branch
                "/releases/gaia-l10n/v1_3/en-US/",  # use default branch
                "/releases/gaia-l10n/v1_4/en-US/",  # use default branch
                "/releases/gaia-l10n/v2_0/en-US/",  # use default branch
                "/releases/gaia-l10n/v2_1/en-US/",  # use default branch
                "/build/autoland/"
            ]:
                continue

            # MARKUP BRANCH IF LOCALE SPECIFIC
            if path.startswith("/l10n-central"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "mozilla-central"
            elif path.startswith("/releases/l10n/"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = _path[-2].lower()
            elif path.startswith("/releases/gaia-l10n/"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "gaia-" + _path[-2][1::]
            elif path.startswith("/weave-l10n"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "weave"

            if BRANCH_WHITELIST is not None:
                found = False
                for br in BRANCH_WHITELIST:
                    if br in str(detail.name):
                        found = True
                        break
                if not found:
                    continue

            Log.note("Branch {{name}} {{locale}}", name=detail.name, locale=detail.locale)
            output.append(detail)
        except Exception as e:
            Log.warning("branch digestion problem", cause=e)

    return output
def value(self):
    if self.num:
        Log.error("can not get value of cube with dimensions")
    return self.cube
def pretty_json(value):
    try:
        if value is False:
            return "false"
        elif value is True:
            return "true"
        elif is_data(value):
            try:
                items = sort_using_key(value.items(), lambda r: r[0])
                values = [encode_basestring(k) + PRETTY_COLON + pretty_json(v) for k, v in items if v != None]
                if not values:
                    return "{}"
                elif len(values) == 1:
                    return "{" + values[0] + "}"
                else:
                    return "{\n" + ",\n".join(indent(v) for v in values) + "\n}"
            except Exception as e:
                from mo_logs import Log
                from mo_math import OR

                if OR(not is_text(k) for k in value.keys()):
                    Log.error("JSON must have string keys: {{keys}}:", keys=[k for k in value.keys()], cause=e)

                Log.error("problem making dict pretty: keys={{keys}}:", keys=[k for k in value.keys()], cause=e)
        elif value in (None, Null):
            return "null"
        elif value.__class__ in (binary_type, text_type):
            if is_binary(value):
                value = utf82unicode(value)
            try:
                if "\n" in value and value.strip():
                    return pretty_json({"$concat": value.split("\n"), "separator": "\n"})
                else:
                    return quote(value)
            except Exception as e:
                from mo_logs import Log

                try:
                    Log.note("try explicit convert of string with length {{length}}", length=len(value))
                    acc = [QUOTE]
                    for c in value:
                        try:
                            try:
                                c2 = ESCAPE_DCT[c]
                            except Exception:
                                c2 = c
                            c3 = text_type(c2)
                            acc.append(c3)
                        except BaseException:
                            pass
                            # Log.warning("odd character {{ord}} found in string.  Ignored.", ord=ord(c), cause=g)
                    acc.append(QUOTE)
                    output = u"".join(acc)
                    Log.note("return value of length {{length}}", length=len(output))
                    return output
                except BaseException as f:
                    Log.warning("can not convert {{type}} to json", type=f.__class__.__name__, cause=f)
                    return "null"
        elif is_list(value):
            if not value:
                return "[]"

            if ARRAY_MAX_COLUMNS == 1:
                return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

            if len(value) == 1:
                j = pretty_json(value[0])
                if j.find("\n") >= 0:
                    return "[\n" + indent(j) + "\n]"
                else:
                    return "[" + j + "]"

            js = [pretty_json(v) for v in value]
            max_len = max(*[len(j) for j in js])
            if max_len <= ARRAY_ITEM_MAX_LENGTH and max(*[j.find("\n") for j in js]) == -1:
                # ALL TINY VALUES
                num_columns = max(
                    1,
                    min(
                        ARRAY_MAX_COLUMNS,
                        int(floor((ARRAY_ROW_LENGTH + 2.0) / float(max_len + 2)))  # +2 TO COMPENSATE FOR COMMAS
                    )
                )
                if len(js) <= num_columns:  # DO NOT ADD \n IF ONLY ONE ROW
                    return "[" + PRETTY_COMMA.join(js) + "]"
                if num_columns == 1:  # DO NOT rjust IF THERE IS ONLY ONE COLUMN
                    return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

                content = ",\n".join(
                    PRETTY_COMMA.join(j.rjust(max_len) for j in js[r:r + num_columns])
                    for r in xrange(0, len(js), num_columns)
                )
                return "[\n" + indent(content) + "\n]"

            pretty_list = js

            output = ["[\n"]
            for i, p in enumerate(pretty_list):
                try:
                    if i > 0:
                        output.append(",\n")
                    output.append(indent(p))
                except Exception:
                    from mo_logs import Log

                    Log.warning(
                        "problem concatenating string of length {{len1}} and {{len2}}",
                        len1=len("".join(output)),
                        len2=len(p)
                    )
            output.append("\n]")

            try:
                return "".join(output)
            except Exception as e:
                from mo_logs import Log

                Log.error("not expected", cause=e)
        elif hasattr(value, '__data__'):
            d = value.__data__()
            return pretty_json(d)
        elif hasattr(value, '__json__'):
            j = value.__json__()
            if j == None:
                return " null "  # TODO: FIND OUT WHAT CAUSES THIS
            return pretty_json(json_decoder(j))
        elif scrub(value) is None:
            return "null"
        elif hasattr(value, '__iter__'):
            return pretty_json(list(value))
        elif hasattr(value, '__call__'):
            return "null"
        else:
            try:
                if int(value) == value:
                    return text_type(int(value))
            except Exception:
                pass

            try:
                if float(value) == value:
                    return text_type(float(value))
            except Exception:
                pass

            return pypy_json_encode(value)
    except Exception as e:
        problem_serializing(value, e)
def typed_encode(value, sub_schema, path, net_new_properties, buffer):
    """
    :param value: THE DATA STRUCTURE TO ENCODE
    :param sub_schema: dict FROM PATH TO Column DESCRIBING THE TYPE
    :param path: list OF CURRENT PATH
    :param net_new_properties: list FOR ADDING NEW PROPERTIES NOT FOUND IN sub_schema
    :param buffer: UnicodeBuilder OBJECT
    :return:
    """
    try:
        # from jx_base import Column
        if sub_schema.__class__.__name__ == 'Column':
            value_json_type = python_type_to_json_type[value.__class__]
            column_json_type = es_type_to_json_type[sub_schema.es_type]

            if value_json_type == column_json_type:
                pass  # ok
            elif value_json_type == NESTED and all(python_type_to_json_type[v.__class__] == column_json_type for v in value if v != None):
                pass  # empty arrays can be anything
            else:
                from mo_logs import Log

                Log.error("Can not store {{value}} in {{column|quote}}", value=value, column=sub_schema.name)

            sub_schema = {json_type_to_inserter_type[value_json_type]: sub_schema}

        if value == None:
            from mo_logs import Log

            Log.error("can not encode null (missing) values")
        elif value is True:
            if BOOLEAN_TYPE not in sub_schema:
                sub_schema[BOOLEAN_TYPE] = {}
                net_new_properties.append(path + [BOOLEAN_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_BOOLEAN_TYPE)
            append(buffer, 'true}')
            return
        elif value is False:
            if BOOLEAN_TYPE not in sub_schema:
                sub_schema[BOOLEAN_TYPE] = {}
                net_new_properties.append(path + [BOOLEAN_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_BOOLEAN_TYPE)
            append(buffer, 'false}')
            return

        _type = value.__class__
        if _type in (dict, Data):
            if sub_schema.__class__.__name__ == 'Column':
                from mo_logs import Log

                Log.error("Can not handle {{column|json}}", column=sub_schema)

            if NESTED_TYPE in sub_schema:
                # PREFER NESTED, WHEN SEEN BEFORE
                if value:
                    append(buffer, '{')
                    append(buffer, QUOTED_NESTED_TYPE)
                    append(buffer, '[')
                    _dict2json(value, sub_schema[NESTED_TYPE], path + [NESTED_TYPE], net_new_properties, buffer)
                    append(buffer, ']' + COMMA)
                    append(buffer, QUOTED_EXISTS_TYPE)
                    append(buffer, text_type(len(value)))
                    append(buffer, '}')
                else:
                    # SINGLETON LIST
                    append(buffer, '{')
                    append(buffer, QUOTED_NESTED_TYPE)
                    append(buffer, '[{')
                    append(buffer, QUOTED_EXISTS_TYPE)
                    append(buffer, '1}]')
                    append(buffer, COMMA)
                    append(buffer, QUOTED_EXISTS_TYPE)
                    append(buffer, '1}')
            else:
                if EXISTS_TYPE not in sub_schema:
                    sub_schema[EXISTS_TYPE] = {}
                    net_new_properties.append(path + [EXISTS_TYPE])

                if value:
                    _dict2json(value, sub_schema, path, net_new_properties, buffer)
                else:
                    append(buffer, '{')
                    append(buffer, QUOTED_EXISTS_TYPE)
                    append(buffer, '1}')
        elif _type is binary_type:
            if STRING_TYPE not in sub_schema:
                sub_schema[STRING_TYPE] = True
                net_new_properties.append(path + [STRING_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_STRING_TYPE)
            append(buffer, '"')
            try:
                v = utf82unicode(value)
            except Exception as e:
                raise problem_serializing(value, e)

            for c in v:
                append(buffer, ESCAPE_DCT.get(c, c))
            append(buffer, '"}')
        elif _type is text_type:
            if STRING_TYPE not in sub_schema:
                sub_schema[STRING_TYPE] = True
                net_new_properties.append(path + [STRING_TYPE])

            append(buffer, '{')
            append(buffer, QUOTED_STRING_TYPE)
            append(buffer, '"')
            for c in value:
                append(buffer, ESCAPE_DCT.get(c, c))
            append(buffer, '"}')
        elif _type in integer_types:
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])

            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, text_type(value))
            append(buffer, '}')
        elif _type in (float, Decimal):
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, float2json(value))
            append(buffer, '}')
        elif _type in (set, list, tuple, FlatList):
            if len(value) == 0:
                append(buffer, '{')
                append(buffer, QUOTED_EXISTS_TYPE)
                append(buffer, '0}')
            elif any(v.__class__ in (Data, dict, set, list, tuple, FlatList) for v in value):
                # THIS IS NOT DONE BECAUSE
                if len(value) == 1:
                    if NESTED_TYPE in sub_schema:
                        append(buffer, '{')
                        append(buffer, QUOTED_NESTED_TYPE)
                        _list2json(value, sub_schema[NESTED_TYPE], path + [NESTED_TYPE], net_new_properties, buffer)
                        append(buffer, '}')
                    else:
                        # NO NEED TO NEST, SO DO NOT DO IT
                        typed_encode(value[0], sub_schema, path, net_new_properties, buffer)
                else:
                    if NESTED_TYPE not in sub_schema:
                        sub_schema[NESTED_TYPE] = {}
                        net_new_properties.append(path + [NESTED_TYPE])
                    append(buffer, '{')
                    append(buffer, QUOTED_NESTED_TYPE)
                    _list2json(value, sub_schema[NESTED_TYPE], path + [NESTED_TYPE], net_new_properties, buffer)
                    append(buffer, '}')
            else:
                # ALLOW PRIMITIVE MULTIVALUES
                value = [v for v in value if v != None]
                types = list(set(json_type_to_inserter_type[python_type_to_json_type[v.__class__]] for v in value))
                if len(types) == 0:  # HANDLE LISTS WITH Nones IN THEM
                    append(buffer, '{')
                    append(buffer, QUOTED_NESTED_TYPE)
                    append(buffer, '[]}')
                elif len(types) > 1:
                    _list2json(value, sub_schema, path + [NESTED_TYPE], net_new_properties, buffer)
                else:
                    element_type = types[0]
                    if element_type not in sub_schema:
                        sub_schema[element_type] = True
                        net_new_properties.append(path + [element_type])
                    append(buffer, '{')
                    append(buffer, quote(element_type))
                    append(buffer, COLON)
                    _multivalue2json(value, sub_schema[element_type], path + [element_type], net_new_properties, buffer)
                    append(buffer, '}')
        elif _type is date:
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, float2json(time.mktime(value.timetuple())))
            append(buffer, '}')
        elif _type is datetime:
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, float2json(time.mktime(value.timetuple())))
            append(buffer, '}')
        elif _type is Date:
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, float2json(value.unix))
            append(buffer, '}')
        elif _type is timedelta:
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, float2json(value.total_seconds()))
            append(buffer, '}')
        elif _type is Duration:
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, float2json(value.seconds))
            append(buffer, '}')
        elif _type is NullType:
            append(buffer, 'null')
        elif hasattr(value, '__data__'):
            typed_encode(value.__data__(), sub_schema, path, net_new_properties, buffer)
        elif hasattr(value, '__iter__'):
            if NESTED_TYPE not in sub_schema:
                sub_schema[NESTED_TYPE] = {}
                net_new_properties.append(path + [NESTED_TYPE])

            append(buffer, '{')
            append(buffer, QUOTED_NESTED_TYPE)
            _iter2json(value, sub_schema[NESTED_TYPE], path + [NESTED_TYPE], net_new_properties, buffer)
            append(buffer, '}')
        else:
            from mo_logs import Log

            Log.error(text_type(repr(value)) + " is not JSON serializable")
    except Exception as e:
        from mo_logs import Log

        Log.error(text_type(repr(value)) + " is not JSON serializable", cause=e)
def __new__(cls, e=None, query=None, *args, **kwargs):
    e.allowNulls = coalesce(e.allowNulls, True)

    if e.value and e.domain.type == "default":
        # if query.groupby:
        #     return object.__new__(DefaultDecoder, e)

        if is_text(e.value):
            Log.error("Expecting Variable or Expression, not plain string")

        if is_op(e.value, LeavesOp):
            return object.__new__(ObjectDecoder)
        elif is_op(e.value, TupleOp):
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            if not all(is_op(t, Variable) for t in e.value.terms):
                Log.error("Can only handle variables in tuples")

            e.domain = Data(dimension={"fields": e.value.terms})
            return object.__new__(DimFieldListDecoder)
        elif is_op(e.value, Variable):
            schema = query.frum.schema
            cols = schema.leaves(e.value.var)
            if not cols:
                return object.__new__(DefaultDecoder)
            if len(cols) != 1:
                return object.__new__(ObjectDecoder)
            col = first(cols)
            limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

            if col.cardinality == None:
                DEBUG and Log.warning(
                    "metadata for column {{name|quote}} (id={{id}}) is not ready",
                    name=concat_field(col.es_index, col.es_column),
                    id=id(col)
                )
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                return object.__new__(DefaultDecoder)
            elif col.multi <= 1 and col.partitions == None:
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                return object.__new__(DefaultDecoder)
            else:
                DEBUG and Log.note("id={{id}} has parts!!!", id=id(col))
                if col.multi > 1:
                    return object.__new__(MultivalueDecoder)

                partitions = col.partitions[:limit:]
                if e.domain.sort == -1:
                    partitions = list(reversed(sorted(partitions)))
                else:
                    partitions = sorted(partitions)
                e.domain = SimpleSetDomain(partitions=partitions, limit=limit)
        else:
            return object.__new__(DefaultDecoder)

    if e.value and e.domain.type in PARTITION:
        return object.__new__(SetDecoder)
    if isinstance(e.domain.dimension, Dimension):
        e.domain = e.domain.dimension.getDomain()
        return object.__new__(SetDecoder)
    if e.value and e.domain.type == "time":
        return object.__new__(TimeDecoder)
    if e.range:
        return object.__new__(GeneralRangeDecoder)
    if e.value and e.domain.type == "duration":
        return object.__new__(DurationDecoder)
    elif e.value and e.domain.type == "range":
        return object.__new__(RangeDecoder)
    elif not e.value and e.domain.dimension.fields:
        # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
        # JUST PULL THE FIELDS
        fields = e.domain.dimension.fields
        if is_data(fields):
            Log.error("No longer allowed: All objects are expressions")
        else:
            return object.__new__(DimFieldListDecoder)
    elif not e.value and all(e.domain.partitions.where):
        return object.__new__(GeneralSetDecoder)
    else:
        Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)
def __init__(self, term):
    Log.error("Should never happen!")
def append_query(self, query_path, es_query):
    Log.error("Not supported")
def __enter__(self):
    if _Log.cprofiler:
        _Log.note("starting cprofile")
        self.cprofiler = cProfile.Profile()
        self.cprofiler.enable()
def format(self, format):
    if format == None or format == "cube":
        return self
    else:
        Log.error("Do not know how to handle")
def get_index(self, row, es_query=None, index=None):
    try:
        key = row[0].get('key')
        return self.domain.getIndexByKey(key)
    except Exception as e:
        Log.error("problem", cause=e)
def pypi(self):
    if Date.today() <= self.last_deploy():
        Log.note("Can not upload to pypi")
        return False

    lib_name = self.directory.name
    source_readme = File.new_instance(self.directory, 'README.md').abspath
    dest_readme = File.new_instance(self.directory, 'README.txt').abspath
    pypandoc.convert(source_readme, to=b'rst', outputfile=dest_readme)
    setup_file = File.new_instance(self.directory, 'setup.py')
    req_file = File.new_instance(self.directory, 'requirements.txt')

    if not setup_file.exists:
        Log.warning("Not a PyPi project! No setup.py file.")

    setup = setup_file.read()

    # UPDATE THE VERSION NUMBER
    curr = (datetime.datetime.utcnow() + datetime.timedelta(days=1)).strftime("%y%j")
    setup = re.sub(r'(version\s*=\s*\"\d*\.\d*\.)\d*(\")', r'\g<1>%s\2' % curr, setup)

    # UPDATE THE REQUIREMENTS
    if not req_file.exists:
        Log.error("Expecting a requirements.txt file")
    req = req_file.read()
    setup_req = re.findall(r'install_requires\s*=\s*\[.*\]\s*,', setup)
    # str.replace() RETURNS A NEW STRING; ASSIGN IT BACK SO THE EDIT STICKS
    setup = setup.replace(
        setup_req[0],
        'install_requires=' + value2json(d for d in sorted(map(strings.trim, req.split("\n"))) if d)
    )
    setup_file.write(setup)

    File.new_instance(self.directory, "build").delete()
    File.new_instance(self.directory, "dist").delete()
    File.new_instance(self.directory, lib_name.replace("-", "_") + ".egg-info").delete()

    process, stdout, stderr = self.local("pypi", ["C:/Python27/python.exe", "setup.py", "bdist_egg", "upload"], raise_on_error=False)
    if "Upload failed (400): File already exists." in stderr:
        Log.warning("Not uploaded")
    elif process.returncode == 0:
        pass
    else:
        Log.error("not expected")

    process, stdout, stderr = self.local("pypi", ["C:/Python27/python.exe", "setup.py", "sdist", "upload"], raise_on_error=False)
    if "Upload failed (400): File already exists." in stderr:
        Log.warning("Not uploaded")
    elif process.returncode == 0:
        pass
    else:
        Log.error("not expected")

    File.new_instance(self.directory, "README.txt").delete()
    File.new_instance(self.directory, "build").delete()
    File.new_instance(self.directory, "dist").delete()
    File.new_instance(self.directory, lib_name.replace("-", "_") + ".egg-info").delete()
    return True
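The version bump relies only on the standard re module, so it can be demonstrated in isolation; the setup.py content below is hypothetical:

import re

curr = "18032"  # strftime("%y%j"): two-digit year + day-of-year (hypothetical date)
setup = 'setup(name="mo-logs", version="2.15.17327")'
print(re.sub(r'(version\s*=\s*\"\d*\.\d*\.)\d*(\")', r'\g<1>%s\2' % curr, setup))
# setup(name="mo-logs", version="2.15.18032")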
elif not query_edge.value and any(query_edge.domain.partitions.where):
    case = SQL_CASE
    for pp, p in enumerate(query_edge.domain.partitions):
        w = SQLang[p.where].to_sql(schema)[0].sql.b
        t = quote_value(pp)
        case += SQL_WHEN + w + SQL_THEN + t
    case += SQL_ELSE + SQL_NULL + SQL_END  # quote value with length of partitions
    edge_values = [("n", case)]
elif query_edge.range:
    edge_values = (
        SQLang[query_edge.range.min].to_sql(schema)[0].sql.items() +
        SQLang[query_edge.range.max].to_sql(schema)[0].sql.items()
    )
else:
    Log.error("Do not know how to handle")

edge_names = []
for column_index, (sql_type, sql) in enumerate(edge_values):
    sql_name = "e" + text(edge_index) + "c" + text(column_index)
    edge_names.append(sql_name)

num_sql_columns = len(index_to_column)
if not query_edge.value and any(query_edge.domain.partitions.where):
    def __(parts, num_sql_columns):
        def _get(row):
            return parts[row[num_sql_columns]].name
        return _get

    pull = __(query_edge.domain.partitions, num_sql_columns)
def test_generator(self):
    test = {"value": (x for x in [])}
    output = value2json(test)
    if output != u'{"value":[]}':
        Log.error("expecting correct value")
def es_aggsop(es, frum, query):
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    select = listwrap(query.select)

    es_query = Data()
    new_select = Data()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".":
            if schema.query_path == ".":
                s.pull = jx_expression_to_function("doc_count")
            else:
                s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]})
        elif isinstance(s.value, Variable):
            if s.aggregate == "count":
                new_select["count_" + literal_field(s.value.var)] += [s]
            else:
                new_select[literal_field(s.value.var)] += [s]
        else:
            formula.append(s)

    for canonical_name, many in new_select.items():
        for s in many:
            es_cols = frum.schema.values(s.value.var)

            if s.aggregate == "count":
                canonical_names = []
                for es_col in es_cols:
                    cn = literal_field(es_col.es_column + "_count")
                    if es_col.type == EXISTS:
                        canonical_names.append(cn + ".doc_count")
                        es_query.aggs[cn].filter.range = {es_col.es_column: {"gt": 0}}
                    else:
                        canonical_names.append(cn + ".value")
                        es_query.aggs[cn].value_count.field = es_col.es_column
                if len(es_cols) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0])
                else:
                    s.pull = jx_expression_to_function({"add": canonical_names})
            elif s.aggregate == "median":
                if len(es_cols) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")

                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                es_query.aggs[key].percentiles.field = es_cols[0].es_column
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = jx_expression_to_function(key + ".values.50\.0")
            elif s.aggregate == "percentile":
                if len(es_cols) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")

                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                if isinstance(s.percentile, text_type) or s.percentile < 0 or 1 < s.percentile:
                    Log.error("Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)

                es_query.aggs[key].percentiles.field = es_cols[0].es_column
                es_query.aggs[key].percentiles.percents += [percent]
                s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
            elif s.aggregate == "cardinality":
                canonical_names = []
                for es_col in es_cols:
                    cn = literal_field(es_col.es_column + "_cardinality")
                    canonical_names.append(cn)
                    es_query.aggs[cn].cardinality.field = es_col.es_column
                if len(es_cols) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0] + ".value")
                else:
                    s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0})
            elif s.aggregate == "stats":
                if len(es_cols) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")

                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].extended_stats.field = es_cols[0].es_column

                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + "_percentile")
                es_query.aggs[median_name].percentiles.field = es_cols[0].es_column
                es_query.aggs[median_name].percentiles.percents += [50]

                s.pull = get_pull_stats(stats_name, median_name)
            elif s.aggregate == "union":
                pulls = []
                for es_col in es_cols:
                    script = {"scripted_metric": {
                        'init_script': 'params._agg.terms = new HashSet()',
                        'map_script': 'for (v in doc[' + quote(es_col.es_column) + '].values) params._agg.terms.add(v)',
                        'combine_script': 'return params._agg.terms.toArray()',
                        'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()',
                    }}
                    stats_name = encode_property(es_col.es_column)
                    if es_col.nested_path[0] == ".":
                        es_query.aggs[stats_name] = script
                        pulls.append(jx_expression_to_function(stats_name + ".value"))
                    else:
                        es_query.aggs[stats_name] = {
                            "nested": {"path": es_col.nested_path[0]},
                            "aggs": {"_nested": script}
                        }
                        pulls.append(jx_expression_to_function(stats_name + "._nested.value"))

                if len(pulls) == 0:
                    s.pull = NULL
                elif len(pulls) == 1:
                    s.pull = pulls[0]
                else:
                    s.pull = lambda row: UNION(p(row) for p in pulls)
            else:
                if len(es_cols) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")

                # PULL VALUE OUT OF THE stats AGGREGATE
                es_query.aggs[literal_field(canonical_name)].extended_stats.field = es_cols[0].es_column
                s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]})

    for i, s in enumerate(formula):
        canonical_name = literal_field(s.name)

        if isinstance(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = "doc_count"
            else:
                Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate)
        elif s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_painless(schema).script(schema)
            s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            es_query.aggs[key].percentiles.script = s.value.to_painless(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = jx_expression_to_function(key + ".values.50\.0")
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)

            es_query.aggs[key].percentiles.script = s.value.to_painless(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"
            es_query.aggs[key].cardinality.script = s.value.to_painless(schema).script(schema)
            s.pull = jx_expression_to_function(key + ".value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].extended_stats.script = s.value.to_painless(schema).script(schema)

            # GET MEDIAN TOO!
            median_name = literal_field(canonical_name + " percentile")
            es_query.aggs[median_name].percentiles.script = s.value.to_painless(schema).script(schema)
            es_query.aggs[median_name].percentiles.percents += [50]

            s.pull = get_pull_stats(stats_name, median_name)
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].terms.script_field = s.value.to_painless(schema).script(schema)
            s.pull = jx_expression_to_function(stats_name + ".buckets.key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate])
            es_query.aggs[canonical_name].extended_stats.script = s.value.to_painless(schema).script(schema)

    decoders = get_decoders_by_depth(query)
    start = 0

    # <TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(query.where, schema=frum.schema)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2::]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            # TODO: INCLUDE FILTERS ON EDGES
            filter_ = AndOp("and", split_where[1]).to_esfilter(schema)
            es_query = Data(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )

        es_query = wrap({
            "aggs": {"_nested": set_default(
                {"nested": {"path": schema.query_path}},
                es_query
            )}
        })
    else:
        if any(split_where[1::]):
            Log.error("Where clause is too deep")

        if decoders:
            for d in jx.reverse(decoders[0]):
                es_query = d.append_query(es_query, start)
                start += d.num_columns

        if split_where[0]:
            # TODO: INCLUDE FILTERS ON EDGES
            filter = AndOp("and", split_where[0]).to_esfilter(schema)
            es_query = Data(
                aggs={"_filter": set_default({"filter": filter}, es_query)}
            )
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es_post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting")
        with format_time:
            decoders = [d for ds in decoders for d in ds]
            result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
            if query.edges:
                output = formatter(decoders, result.aggregations, start, query, select)
            elif query.groupby:
                output = groupby_formatter(decoders, result.aggregations, start, query, select)
            else:
                output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", cause=e)
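For orientation, a sketch of the aggregation body built by the simplest path above (a single select on a Variable with no edges, assuming `aggregates` maps "max" to the extended_stats field of the same name):

# {"select": {"value": "a", "aggregate": "max"}} produces roughly
#   {"aggs": {"a": {"extended_stats": {"field": "a"}}}, "size": 0}
# and s.pull reads {"coalesce": ["a.max", <default>]} from the response.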
def test_double4(self):
    v = 1.99999999999
    test = {"value": v}
    output = value2json(test)
    if output != u'{"value":2}':
        Log.error("expecting correct value")
        defs=[
            {
                "name": ["--all", "-a"],
                "action": 'store_true',
                "help": 'process all mo-* subdirectories',
                "dest": "all",
                "required": False
            },
            {
                "name": ["--dir", "--directory", "-d"],
                "help": 'directory to deploy',
                "type": str,
                "dest": "directory",
                "required": True,
                "default": "."
            }
        ])
        constants.set(settings.constants)
        Log.start(settings.debug)

        if settings.args.all:
            deploy_all(File(settings.args.directory), settings.prefix, settings)
        else:
            Deploy(File(settings.args.directory), kwargs=settings).deploy()
    except Exception as e:
        Log.warning("Problem with etl", cause=e)
    finally:
        Log.stop()


if __name__ == "__main__":
    main()
def test_double1(self):
    test = {"value": 5.2025595183536973e-07}
    output = value2json(test)
    if output != u'{"value":5.202559518353697e-7}':
        Log.error("expecting correct value")
def test_double5(self):
    v = 1.00000000001
    test = {"value": v}
    output = value2json(test)
    if output != u'{"value":1}':
        Log.error("expecting correct value")
def test_unicode2(self):
    output = value2json({"comment": "testing accented char àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"})
    assert output == u'{"comment":"testing accented char àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"}'
    if not isinstance(output, text):
        Log.error("expecting text json")
def test_double3(self):
    test = {"value": .52}
    output = value2json(test)
    if output != u'{"value":0.52}':
        Log.error("expecting correct value")
def test_date(self):
    output = value2json({"test": datetime.date(2013, 11, 13)})
    Log.note("JSON = {{json}}", json=output)
def test_unicode3(self):
    output = value2json({"comment": u"testing accented char ŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙"})
    assert output == u'{"comment":"testing accented char ŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙"}'
    if not isinstance(output, text):
        Log.error("expecting unicode json")
def to_es_script(self, schema, not_null=False, boolean=False, many=True):
    Log.error("not supported")
def test_unicode1(self):
    output = value2json({"comment": u"Open all links in the current tab, except the pages opened from external apps — open these ones in new windows"})
    assert output == u'{"comment":"Open all links in the current tab, except the pages opened from external apps — open these ones in new windows"}'
    if not isinstance(output, text):
        Log.error("expecting unicode json")
def _groupby_op(self, query, frum):
    schema = self.sf.tables[join_field(split_field(frum)[1:])].schema
    index_to_column = {}
    nest_to_alias = {
        nested_path: "__" + unichr(ord('a') + i) + "__"
        for i, (nested_path, sub_table) in enumerate(self.sf.tables.items())
    }
    frum_path = split_field(frum)
    base_table = join_field(frum_path[0:1])
    path = join_field(frum_path[1:])
    tables = []
    for n, a in nest_to_alias.items():
        if startswith_field(path, n):
            tables.append({"nest": n, "alias": a})
    tables = jx.sort(tables, {"value": {"length": "nest"}})

    from_sql = join_field([base_table] + split_field(tables[0].nest)) + " " + tables[0].alias
    previous = tables[0]
    for t in tables[1::]:
        from_sql += (
            SQL_LEFT_JOIN + quote_column(concat_field(base_table, t.nest)) + " " + t.alias +
            SQL_ON + join_column(t.alias, quoted_PARENT) + " = " + join_column(previous.alias, quoted_UID)
        )

    selects = []
    groupby = []
    for i, e in enumerate(query.groupby):
        for edge_sql in e.value.to_sql(schema):
            column_number = len(selects)
            sql_type, sql = edge_sql.sql.items()[0]
            if sql is SQL_NULL and not e.value.var in schema.keys():
                Log.error("No such column {{var}}", var=e.value.var)

            column_alias = _make_column_name(column_number)
            groupby.append(sql)
            selects.append(sql_alias(sql, column_alias))
            if edge_sql.nested_path == ".":
                select_name = edge_sql.name
            else:
                select_name = "."
            index_to_column[column_number] = ColumnMapping(
                is_edge=True,
                push_name=e.name,
                push_column_name=e.name.replace("\\.", "."),
                push_column=i,
                push_child=select_name,
                pull=get_column(column_number),
                sql=sql,
                column_alias=column_alias,
                type=sql_type_to_json_type[sql_type]
            )

    for i, select in enumerate(listwrap(query.select)):
        column_number = len(selects)
        sql_type, sql = select.value.to_sql(schema)[0].sql.items()[0]
        if sql == 'NULL' and not select.value.var in schema.keys():
            Log.error("No such column {{var}}", var=select.value.var)

        if select.value == "." and select.aggregate == "count":
            selects.append(sql_alias(sql_count(SQL_ONE), quote_column(select.name)))
        else:
            selects.append(sql_alias(sql_aggs[select.aggregate] + sql_iso(sql), quote_column(select.name)))

        index_to_column[column_number] = ColumnMapping(
            push_name=select.name,
            push_column_name=select.name,
            push_column=i + len(query.groupby),
            push_child=".",
            pull=get_column(column_number),
            sql=sql,
            column_alias=quote_column(select.name),
            type=sql_type_to_json_type[sql_type]
        )

    for w in query.window:
        selects.append(self._window_op(self, query, w))

    where = query.where.to_sql(schema)[0].sql.b

    command = (
        SQL_SELECT + (sql_list(selects)) +
        SQL_FROM + from_sql +
        SQL_WHERE + where +
        SQL_GROUPBY + sql_list(groupby)
    )

    if query.sort:
        command += SQL_ORDERBY + sql_list(
            sql_iso(sql[t]) + SQL_IS_NULL + "," + sql[t] + (" DESC" if s.sort == -1 else "")
            for s, sql in [(s, s.value.to_sql(schema)[0].sql) for s in query.sort]
            for t in "bns"
            if sql[t]
        )

    return command, index_to_column
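A sketch of the statement shape this assembles for a simple query; the __a__ alias follows the nest_to_alias convention above, while the __c0__ column alias assumes what _make_column_name produces:

# groupby=["a"] with select={"name": "n", "value": ".", "aggregate": "count"}:
#   SELECT <sql for a> AS __c0__, COUNT(1) AS "n"
#   FROM base_table __a__
#   WHERE <where sql>
#   GROUP BY <sql for a>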
def test_inf(self):
    test = float("+inf")
    output = value2json(test)
    expecting = cPythonJSONEncoder().encode(mo_json.scrub(test))
    self.assertEqual(output, expecting, "expecting " + expecting)

def test_minus_inf(self):
    test = float("-inf")
    output = value2json(test)
    expecting = cPythonJSONEncoder().encode(mo_json.scrub(test))
    self.assertEqual(output, expecting, "expecting " + expecting)

def test_string_stripper(self):
    test = {"hello": " world"}
    mo_json.FIND_LOOPS = True
    self.assertEqual(value2json(test), '{"hello":" world"}')

def test_json_is_unicode(self):
    self.assertIsInstance(value2json({}), text)

def test_json_encode_slash(self):
    self.assertEqual(value2json("/"), '"/"')


if __name__ == '__main__':
    try:
        Log.start()
        unittest.main()
    finally:
        Log.stop()