def _convert_edge(self, edge):
    if isinstance(edge, basestring):
        return Dict(
            name=edge,
            value=edge,
            domain=self._convert_domain()
        )
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        if isinstance(edge.value, (Mapping, list)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = self._convert_domain()
            domain.dimension = Dict(fields=edge.value)

            return Dict(
                name=edge.name,
                allowNulls=False if edge.allowNulls is False else True,
                domain=domain
            )

        domain = self._convert_domain(edge.domain)
        return Dict(
            name=coalesce(edge.name, edge.value),
            value=edge.value,
            range=edge.range,
            allowNulls=False if edge.allowNulls is False else True,
            domain=domain
        )

def safe_size(source):
    """
    READ THE source UP TO SOME LIMIT, THEN COPY TO A FILE IF TOO BIG
    RETURN A str() OR A FileString()
    """
    if source is None:
        return None

    total_bytes = 0
    bytes = []
    b = source.read(MIN_READ_SIZE)
    while b:
        total_bytes += len(b)
        bytes.append(b)
        if total_bytes > MAX_STRING_SIZE:
            try:
                data = FileString(TemporaryFile())
                for bb in bytes:
                    data.write(bb)
                del bytes
                del bb
                b = source.read(MIN_READ_SIZE)
                while b:
                    total_bytes += len(b)
                    data.write(b)
                    b = source.read(MIN_READ_SIZE)
                data.seek(0)
                Log.note("Using file of size {{length}} instead of str()", length=total_bytes)
                return data
            except Exception, e:
                Log.error("Could not write file > {{num}} bytes", num=total_bytes, cause=e)
        b = source.read(MIN_READ_SIZE)

    # SMALL ENOUGH TO KEEP IN MEMORY, RETURN AS str()
    return b"".join(bytes)

def groupby_size(data, size):
    if hasattr(data, "next"):
        iterator = data
    elif hasattr(data, "__iter__"):
        iterator = data.__iter__()
    else:
        Log.error("do not know how to handle this type")

    done = DictList()

    def more():
        output = DictList()
        for i in range(size):
            try:
                output.append(iterator.next())
            except StopIteration:
                done.append(True)
                break
        return output

    # THIS IS LAZY
    i = 0
    while True:
        output = more()
        yield (i, output)
        if len(done) > 0:
            break
        i += 1

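# Hedged usage sketch (not part of the original source): groupby_size chunks any
# iterator into (index, DictList) pairs of at most `size` items, with each chunk
# materialized eagerly by more().
#
#     for i, chunk in groupby_size(iter(range(10)), 3):
#         print i, list(chunk)    # expected: 0 [0, 1, 2] / 1 [3, 4, 5] / 2 [6, 7, 8] / 3 [9]
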
def groupby_Multiset(data, min_size, max_size):
    # GROUP multiset BASED ON POPULATION OF EACH KEY, TRYING TO STAY IN min/max LIMITS
    if min_size == None:
        min_size = 0

    total = 0
    i = 0
    g = list()
    for k, c in data.items():
        if total < min_size or total + c < max_size:
            total += c
            g.append(k)
        elif total < max_size:
            yield (i, g)
            i += 1
            total = c
            g = [k]

        if total >= max_size:
            Log.error(
                "({{min}}, {{max}}) range is too strict given step of {{increment}}",
                min=min_size,
                max=max_size,
                increment=c
            )

    if g:
        yield (i, g)

def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    RETURN LIST OF (keys, values) PAIRS WHERE
        group BY THE SET OF keys
        values IS LIST OF ALL data THAT HAS THOSE keys
    contiguous - MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    """
    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    if isinstance(data, Container):
        return data.groupby(keys)

    try:
        keys = listwrap(keys)
        get_key = jx_expression_to_function(keys)
        if not contiguous:
            data = sorted(data, key=get_key)

        def _output():
            for g, v in itertools.groupby(data, get_key):
                group = Dict()
                for k, gg in zip(keys, g):
                    group[k] = gg
                yield (group, wrap(v))

        return _output()
    except Exception, e:
        Log.error("Problem grouping", e)

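# Hedged usage sketch (assumes plain dicts as rows and this module's groupby):
#
#     data = [{"a": 1, "b": "x"}, {"a": 1, "b": "y"}, {"a": 2, "b": "z"}]
#     for group, rows in groupby(data, keys="a"):
#         print group.a, [r["b"] for r in rows]   # expected: 1 ['x', 'y'], then 2 ['z']
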
def execute(self, requests):
    """
    RETURN A GENERATOR THAT HAS len(requests) RESULTS (ANY ORDER)
    EXPECTING requests TO BE A list OF dicts, EACH dict IS USED AS kwargs TO GIVEN functions
    """
    if not isinstance(requests, (list, tuple, GeneratorType, Iterable)):
        Log.error("Expecting requests to be a list or generator", stack_depth=1)
    else:
        requests = list(requests)

    # FILL QUEUE WITH WORK
    self.inbound.extend(requests)

    num = len(requests)

    def output():
        for i in xrange(num):
            result = self.outbound.pop()
            if "exception" in result:
                raise result["exception"]
            else:
                yield result["response"]

    if self.outbound is not None:
        return output()
    else:
        return

def write(self, data):
    if not self.parent.exists:
        self.parent.create()
    with open(self._filename, "wb") as f:
        if isinstance(data, list) and self.key:
            from pyLibrary.debugs.logs import Log
            Log.error("list of data and keys are not supported, encrypt before sending to file")

        if isinstance(data, list):
            pass
        elif isinstance(data, basestring):
            data = [data]
        elif hasattr(data, "__iter__"):
            pass

        for d in data:
            if not isinstance(d, unicode):
                from pyLibrary.debugs.logs import Log
                Log.error("Expecting unicode data only")
            if self.key:
                f.write(crypto.encrypt(d, self.key).encode("utf8"))
            else:
                f.write(d.encode("utf8"))

def insert_list(self, table_name, records):
    if not records:
        return

    columns = set()
    for r in records:
        columns |= set(r.keys())
    columns = jx.sort(columns)

    try:
        self.execute(
            "DELETE FROM " + self.quote_column(table_name) + " WHERE _id IN {{ids}}",
            {"ids": self.quote_column([r["_id"] for r in records])}
        )

        command = \
            "INSERT INTO " + self.quote_column(table_name) + "(" + \
            ",".join([self.quote_column(k) for k in columns]) + \
            ") VALUES " + ",\n".join([
                "(" + ",".join([self.quote_value(r.get(k, None)) for k in columns]) + ")"
                for r in records
            ])
        self.execute(command)
    except Exception, e:
        Log.error("problem with insert", e)

def _convert_in(op, term):
    if not term:
        Log.error("Expecting a term")
    if not isinstance(term, Mapping):
        Log.error("Expecting {{op}} to have dict value", op=op)
    var, val = term.items()[0]

    if isinstance(val, list):
        v2 = [vv for vv in val if vv != None]

        if len(v2) == 0:
            if len(val) == 0:
                return False
            else:
                return {"missing": {"field": var}}

        if len(v2) == 1:
            output = {"term": {var: v2[0]}}
        else:
            output = {"terms": {var: v2}}

        if len(v2) != len(val):
            output = {"or": [
                {"missing": {"field": var}},
                output
            ]}
        return output
    else:
        return {"term": term}

def __init__(
    self,
    exchange,  # name of the Pulse exchange
    topic,  # message name pattern to subscribe to ('#' is wildcard)
    target=None,  # WILL BE CALLED WITH PULSE PAYLOADS AND ack() IF COMPLETED WITHOUT EXCEPTION
    target_queue=None,  # (aka self.queue) WILL BE FILLED WITH PULSE PAYLOADS
    host='pulse.mozilla.org',  # url to connect
    port=5671,  # tcp port
    user=None,
    password=None,
    vhost="/",
    start=0,  # USED AS STARTING POINT FOR ASSIGNING THE _meta.count ATTRIBUTE
    ssl=True,
    applabel=None,
    heartbeat=False,  # True to also get the Pulse heartbeat message
    durable=False,  # True to keep queue after shutdown
    serializer='json',
    broker_timezone='GMT',
    settings=None
):
    self.target_queue = target_queue
    self.pulse_target = target
    if (target_queue == None and target == None) or (target_queue != None and target != None):
        Log.error("Expecting a queue (for fast digesters) or a target (for slow digesters)")
    Thread.__init__(self, name="Pulse consumer for " + settings.exchange, target=self._worker)
    self.settings = settings
    settings.callback = self._got_result
    settings.user = coalesce(settings.user, settings.username)
    settings.applabel = coalesce(settings.applable, settings.queue, settings.queue_name)
    settings.topic = topic

    self.pulse = ModifiedGenericConsumer(settings, connect=True, **settings)
    self.count = coalesce(start, 0)
    self.start()

def process_test_result(source_key, source, destination, please_stop=None):
    path = key2path(source_key)

    destination.delete({"and": [
        {"term": {"etl.source.id": path[1]}},
        {"term": {"etl.source.source.id": path[0]}}
    ]})

    lines = source.read_lines()

    keys = []
    data = []
    for l in lines:
        record = convert.json2value(l)
        if record._id == None:
            continue
        record.result.crash_result = None  # TODO: Remove me after May 2015
        keys.append(record._id)
        data.append({
            "id": record._id,
            "value": record
        })
        record._id = None

    if data:
        try:
            destination.extend(data)
        except Exception, e:
            if "Can not decide on index by build.date" in e:
                if source.bucket.name == "ekyle-test-result":
                    # KNOWN CORRUPTION
                    # TODO: REMOVE LATER (today = Mar2015)
                    delete_list = source.bucket.keys(prefix=key_prefix(source_key))
                    for d in delete_list:
                        source.bucket.delete_key(d)
            Log.error("Can not add to sink", e)

def datetime2string(value, format="%Y-%m-%d %H:%M:%S"):
    try:
        return value.strftime(format)
    except Exception, e:
        from pyLibrary.debugs.logs import Log
        Log.error("Can not format {{value}} with {{format}}", value=value, format=format, cause=e)

def forall(self, sql, param=None, _execute=None):
    assert _execute
    num = 0

    self._execute_backlog()
    try:
        old_cursor = self.cursor
        if not old_cursor:  # ALLOW NON-TRANSACTIONAL READS
            self.cursor = self.db.cursor()

        if param:
            sql = expand_template(sql, self.quote_param(param))
        sql = self.preamble + outdent(sql)
        if self.debug:
            Log.note("Execute SQL:\n{{sql}}", sql=indent(sql))
        self.cursor.execute(sql)

        columns = tuple([utf8_to_unicode(d[0]) for d in self.cursor.description])
        for r in self.cursor:
            num += 1
            _execute(wrap(dict(zip(columns, [utf8_to_unicode(c) for c in r]))))

        if not old_cursor:  # CLEANUP AFTER NON-TRANSACTIONAL READS
            self.cursor.close()
            self.cursor = None
    except Exception, e:
        Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)

def get_file(ref, url):
    from pyLibrary.env.files import File

    if ref.path.startswith("~"):
        home_path = os.path.expanduser("~")
        if os.sep == "\\":
            home_path = "/" + home_path.replace(os.sep, "/")
        if home_path.endswith("/"):
            home_path = home_path[:-1]

        ref.path = home_path + ref.path[1::]
    elif not ref.path.startswith("/"):
        # CONVERT RELATIVE TO ABSOLUTE
        if ref.path[0] == ".":
            num_dot = 1
            while ref.path[num_dot] == ".":
                num_dot += 1

            parent = url.path.rstrip("/").split("/")[:-num_dot]
            ref.path = "/".join(parent) + ref.path[num_dot:]
        else:
            parent = url.path.rstrip("/").split("/")[:-1]
            ref.path = "/".join(parent) + "/" + ref.path

    path = ref.path if os.sep != "\\" else ref.path[1::].replace("/", "\\")

    try:
        if DEBUG:
            _Log.note("reading file {{path}}", path=path)
        content = File(path).read()
    except Exception, e:
        content = None
        _Log.error("Could not read file {{filename}}", filename=path, cause=e)

def __init__(self, filename, buffering=2 ** 14, suffix=None):
    """
    YOU MAY SET filename TO {"path":p, "key":k} FOR CRYPTO FILES
    """
    if filename == None:
        from pyLibrary.debugs.logs import Log
        Log.error("File must be given a filename")
    elif isinstance(filename, basestring):
        self.key = None
        if filename.startswith("~"):
            home_path = os.path.expanduser("~")
            if os.sep == "\\":
                home_path = home_path.replace(os.sep, "/")
            if home_path.endswith("/"):
                home_path = home_path[:-1]
            filename = home_path + filename[1::]
        self._filename = filename.replace(os.sep, "/")  # USE UNIX STANDARD
    else:
        self.key = convert.base642bytearray(filename.key)
        self._filename = "/".join(filename.path.split(os.sep))  # USE UNIX STANDARD

    while self._filename.find(".../") >= 0:
        # LET ... REFER TO GRANDPARENT, .... REFER TO GREAT-GRAND-PARENT, etc...
        self._filename = self._filename.replace(".../", "../../")
    self.buffering = buffering

    if suffix:
        self._filename = File.add_suffix(self._filename, suffix)

def execute(
    self,
    command,
    param=None,
    retry=True  # IF command FAILS, JUST THROW ERROR
):
    if param:
        command = expand_template(command, self.quote_param(param))

    output = None
    done = False
    while not done:
        try:
            with self.locker:
                if not self.connection:
                    self._connect()

            with Closer(self.connection.cursor()) as curs:
                curs.execute(command)
                if curs.rowcount >= 0:
                    output = curs.fetchall()
            self.connection.commit()
            done = True
        except Exception, e:
            try:
                self.connection.rollback()
                # TODO: FIGURE OUT WHY rollback() DOES NOT HELP
                self.connection.close()
            except Exception, f:
                pass
            self.connection = None
            self._connect()
            if not retry:
                Log.error("Problem with command:\n{{command|indent}}", command=command, cause=e)

def simple_token(index, c):
    if c == b'"':
        json.mark(index - 1)
        while True:
            c = json[index]
            index += 1
            if c == b"\\":
                index += 1
            elif c == b'"':
                break
        return json_decoder(json.release(index).decode("utf8")), index
    elif c in b"{[":
        Log.error("Expecting a primitive value")
    elif c == b"t" and json.slice(index, index + 3) == "rue":
        return True, index + 3
    elif c == b"n" and json.slice(index, index + 3) == "ull":
        return None, index + 3
    elif c == b"f" and json.slice(index, index + 4) == "alse":
        return False, index + 4
    else:
        json.mark(index - 1)
        while True:
            c = json[index]
            if c in b',]}':
                break
            index += 1
        return float(json.release(index)), index

def create(self):
    try:
        os.makedirs(self._filename)
    except Exception, e:
        from pyLibrary.debugs.logs import Log
        Log.error("Could not make directory {{dir_name}}", dir_name=self._filename, cause=e)

def _decode(index, parent_path, path, name2index, expected_vars=NO_VARS):
    c, index = skip_whitespace(index)

    if not path:
        if c != b"[":
            # TREAT VALUE AS SINGLE-VALUE ARRAY
            yield _decode_token(index, c, parent_path, path, name2index, None, expected_vars)
        else:
            c, index = skip_whitespace(index)
            if c == b']':
                return  # EMPTY ARRAY

            while True:
                value, index = _decode_token(index, c, parent_path, path, name2index, None, expected_vars)
                c, index = skip_whitespace(index)
                if c == b']':
                    yield value, index
                    return
                elif c == b',':
                    c, index = skip_whitespace(index)
                    yield value, index
    else:
        if c != b'{':
            Log.error("Expecting all objects to at least have {{path}}", path=path[0])

        for j, i in _decode_object(index, parent_path, path, name2index, expected_vars=expected_vars):
            yield j, i

def _decode_token(index, c, full_path, path, name2index, destination, expected_vars):
    if c == b'{':
        if not expected_vars:
            index = jump_to_end(index, c)
            value = None
        elif expected_vars[0] == ".":
            json.mark(index - 1)
            index = jump_to_end(index, c)
            value = json_decoder(json.release(index).decode("utf8"))
        else:
            count = 0
            for v, i in _decode_object(index, full_path, path, name2index, destination, expected_vars=expected_vars):
                index = i
                value = v
                count += 1
            if count != 1:
                Log.error("Expecting object, nothing nested")
    elif c == b'[':
        if not expected_vars:
            index = jump_to_end(index, c)
            value = None
        else:
            json.mark(index - 1)
            index = jump_to_end(index, c)
            value = json_decoder(json.release(index).decode("utf8"))
    else:
        if expected_vars and expected_vars[0] == ".":
            value, index = simple_token(index, c)
        else:
            index = jump_to_end(index, c)
            value = None

    return value, index

def get_index(self, row):
    if self.computed_domain:
        try:
            part = row[self.start]
            return self.domain.getIndexByKey(part["key"])
        except Exception, e:
            Log.error("problem", cause=e)

def send(self, topic, message):
    """Publishes a pulse message to the proper exchange."""
    if not message:
        Log.error("Expecting a message")

    message._prepare()

    if not self.connection:
        self.connect()

    producer = Producer(
        channel=self.connection,
        exchange=Exchange(self.settings.exchange, type='topic'),
        routing_key=topic
    )

    # The message is actually a simple envelope format with a payload and
    # some metadata.
    final_data = Dict(
        payload=message.data,
        _meta=set_default({
            'exchange': self.settings.exchange,
            'routing_key': message.routing_key,
            'serializer': self.settings.serializer,
            'sent': time_to_string(datetime.datetime.now(timezone(self.settings.broker_timezone))),
            'count': self.count
        }, message.metadata)
    )

    producer.publish(jsons.scrub(final_data), serializer=self.settings.serializer)
    self.count += 1

def read_settings(filename=None, defs=None):
    # READ SETTINGS
    if filename:
        settings_file = File(filename)
        if not settings_file.exists:
            Log.error("Can not find settings file {{filename}}", {
                "filename": settings_file.abspath
            })
        settings = ref.get("file:///" + settings_file.abspath)
        if defs:
            settings.args = argparse(defs)
        return settings
    else:
        defs = listwrap(defs)
        defs.append({
            "name": ["--settings", "--settings-file", "--settings_file"],
            "help": "path to JSON file with settings",
            "type": str,
            "dest": "filename",
            "default": "./settings.json",
            "required": False
        })
        args = argparse(defs)
        settings = ref.get("file://" + args.filename.replace(os.sep, "/"))
        settings.args = args
        return settings

def __init__(self, **desc):
    Domain.__init__(self, **desc)
    self.type = "range"
    self.NULL = Null

    if self.partitions:
        # IGNORE THE min, max, interval
        if not self.key:
            Log.error("Must have a key value")

        parts = listwrap(self.partitions)
        for i, p in enumerate(parts):
            self.min = Math.min(self.min, p.min)
            self.max = Math.max(self.max, p.max)
            if p.dataIndex != None and p.dataIndex != i:
                Log.error("Expecting `dataIndex` to agree with the order of the parts")
            if p[self.key] == None:
                Log.error("Expecting all parts to have {{key}} as a property", key=self.key)
            p.dataIndex = i

        # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
        for p, q in itertools.product(parts, parts):
            if p is not q and p.min <= q.min and q.min < p.max:
                Log.error("partitions overlap!")

        self.partitions = parts
        return
    elif any([self.min == None, self.max == None, self.interval == None]):
        Log.error("Can not handle missing parameter")

    self.key = "min"
    self.partitions = wrap([
        {"min": v, "max": v + self.interval, "dataIndex": i}
        for i, v in enumerate(frange(self.min, self.max, self.interval))
    ])

def json2value(json_string, params={}, flexible=False, leaves=False):
    """
    :param json_string: THE JSON
    :param params: STANDARD JSON PARAMS
    :param flexible: REMOVE COMMENTS
    :param leaves: ASSUME JSON KEYS ARE DOT-DELIMITED
    :return: Python value
    """
    if isinstance(json_string, str):
        Log.error("only unicode json accepted")

    try:
        if flexible:
            # REMOVE """COMMENTS""", # COMMENTS, //COMMENTS, AND \n \r
            # DERIVED FROM https://github.com/jeads/datasource/blob/master/datasource/bases/BaseHub.py#L58
            json_string = re.sub(r"\"\"\".*?\"\"\"", r"\n", json_string, flags=re.MULTILINE)
            json_string = "\n".join(remove_line_comment(l) for l in json_string.split("\n"))
            # ALLOW DICTIONARY'S NAME:VALUE LIST TO END WITH COMMA
            json_string = re.sub(r",\s*\}", r"}", json_string)
            # ALLOW LISTS TO END WITH COMMA
            json_string = re.sub(r",\s*\]", r"]", json_string)

        if params:
            # LOOKUP REFERENCES
            json_string = expand_template(json_string, params)

        try:
            value = wrap(json_decoder(unicode(json_string)))
        except Exception, e:
            Log.error("can not decode\n{{content}}", content=json_string, cause=e)

        if leaves:
            value = wrap_leaves(value)

        return value
    except Exception, e:
        Log.error("can not decode\n{{content}}", content=json_string, cause=e)

def column_query(self, sql, param=None):
    """
    RETURN RESULTS IN [column][row_num] GRID
    """
    self._execute_backlog()
    try:
        old_cursor = self.cursor
        if not old_cursor:  # ALLOW NON-TRANSACTIONAL READS
            self.cursor = self.db.cursor()
            self.cursor.execute("SET TIME_ZONE='+00:00'")
            self.cursor.close()
            self.cursor = self.db.cursor()

        if param:
            sql = expand_template(sql, self.quote_param(param))
        sql = self.preamble + outdent(sql)
        if self.debug:
            Log.note("Execute SQL:\n{{sql}}", sql=indent(sql))

        self.cursor.execute(sql)
        grid = [[utf8_to_unicode(c) for c in row] for row in self.cursor]
        # columns = [utf8_to_unicode(d[0]) for d in coalesce(self.cursor.description, [])]
        result = zip(*grid)

        if not old_cursor:  # CLEANUP AFTER NON-TRANSACTIONAL READS
            self.cursor.close()
            self.cursor = None

        return result
    except Exception, e:
        if isinstance(e, InterfaceError) or e.message.find("InterfaceError") >= 0:
            Log.error("Did you close the db connection?", e)
        Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)

def latin12unicode(value):
    if isinstance(value, unicode):
        Log.error("can not convert unicode from latin1")
    try:
        return unicode(value.decode('iso-8859-1'))
    except Exception, e:
        Log.error("Can not convert {{value|quote}} to unicode", value=value)

def quote_value(self, value):
    """
    convert values to mysql code for the same
    mostly delegate directly to the mysql lib, but some exceptions exist
    """
    try:
        if value == None:
            return "NULL"
        elif isinstance(value, SQL):
            if not value.param:
                # value.template CAN BE MORE THAN A TEMPLATE STRING
                return self.quote_sql(value.template)
            param = {k: self.quote_sql(v) for k, v in value.param.items()}
            return expand_template(value.template, param)
        elif isinstance(value, basestring):
            return self.db.literal(value)
        elif isinstance(value, datetime):
            return "str_to_date('" + value.strftime("%Y%m%d%H%M%S") + "', '%Y%m%d%H%i%s')"
        elif hasattr(value, '__iter__'):
            return self.db.literal(json_encode(value))
        elif isinstance(value, Mapping):
            return self.db.literal(json_encode(value))
        elif Math.is_number(value):
            return unicode(value)
        else:
            return self.db.literal(value)
    except Exception, e:
        Log.error("problem quoting SQL", e)

def string2url(value):
    if isinstance(value, unicode):
        return "".join([_map2url[c] for c in unicode2latin1(value)])
    elif isinstance(value, str):
        return "".join([_map2url[c] for c in value])
    else:
        Log.error("Expecting a string")

def _convert_from(self, frum):
    if isinstance(frum, basestring):
        return Dict(name=frum)
    elif isinstance(frum, (Container, Query)):
        return frum
    else:
        Log.error("Expecting from clause to be a name, or a container")

def _get_attr(obj, path):
    if not path:
        return obj

    attr_name = path[0]

    if isinstance(obj, ModuleType):
        if attr_name in obj.__dict__:
            return _get_attr(obj.__dict__[attr_name], path[1:])
        elif attr_name in dir(obj):
            return _get_attr(obj[attr_name], path[1:])

        # TRY FILESYSTEM
        from pyLibrary.env.files import File
        possible_error = None
        if File.new_instance(File(obj.__file__).parent, attr_name).set_extension("py").exists:
            try:
                # THIS CASE IS WHEN THE __init__.py DOES NOT IMPORT THE SUBDIR FILE
                # WE CAN STILL PUT THE PATH TO THE FILE IN THE from CLAUSE
                if len(path) == 1:
                    # GET MODULE OBJECT
                    output = __import__(obj.__name__ + "." + attr_name, globals(), locals(), [path[0]], 0)
                    return output
                else:
                    # GET VARIABLE IN MODULE
                    output = __import__(obj.__name__ + "." + attr_name, globals(), locals(), [path[1]], 0)
                    return _get_attr(output, path[1:])
            except Exception, e:
                from pyLibrary.debugs.exceptions import Except
                possible_error = Except.wrap(e)

        # TRY A CASE-INSENSITIVE MATCH
        attr_name = lower_match(attr_name, dir(obj))
        if not attr_name:
            from pyLibrary.debugs.logs import Log
            Log.warning(PATH_NOT_FOUND + ". Returning None.", cause=possible_error)
        elif len(attr_name) > 1:
            from pyLibrary.debugs.logs import Log
            Log.error(AMBIGUOUS_PATH_FOUND + " {{paths}}", paths=attr_name)
        else:
            return _get_attr(obj[attr_name[0]], path[1:])

def __init__(
    self,
    alias,  # NAME OF THE ALIAS
    type=None,  # SCHEMA NAME, WILL HUNT FOR ONE IF None
    explore_metadata=True,  # IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
    debug=False,
    timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
    settings=None
):
    self.debug = debug
    if self.debug:
        Log.alert("Elasticsearch debugging on {{index|quote}} is on", index=settings.index)

    self.settings = settings
    self.cluster = Cluster(settings)

    if type == None:
        if not explore_metadata:
            Log.error("Alias() was given no `type` (aka schema) and not allowed to explore metadata. Do not know what to do now.")

        indices = self.cluster.get_metadata().indices
        if not self.settings.alias or self.settings.alias == self.settings.index:
            candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases]
            index = qb.sort(candidates, 0).last()[1]
        else:
            index = indices[self.settings.index]

        # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE)
        max_prop = -1
        for _type, mapping in index.mappings.items():
            num_prop = len(mapping.properties.keys())
            if max_prop < num_prop:
                max_prop = num_prop
                self.settings.type = _type
                type = _type

        if type == None:
            Log.error("Can not find schema type for index {{index}}", index=coalesce(self.settings.alias, self.settings.index))

    self.path = "/" + alias + "/" + type

def monitor(self, please_stop):
    please_stop.on_go(lambda: self.todo.add(Thread.STOP))
    while not please_stop:
        try:
            if not self.todo:
                with self.meta.columns.locker:
                    old_columns = filter(
                        lambda c: (c.last_updated == None or c.last_updated < Date.now() - TOO_OLD) and c.type not in ["object", "nested"],
                        self.meta.columns
                    )
                    if old_columns:
                        Log.note("Old columns with dates {{dates|json}}", dates=wrap(old_columns).last_updated)
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.table == d.table and c != d:
                                Log.error("")
                    else:
                        Log.note("no more metadata to update")

            column = self.todo.pop(timeout=10 * MINUTE)
            if column:
                Log.note("update {{table}}.{{column}}", table=column.table, column=column.es_column)
                if column.type in ["object", "nested"]:
                    with self.meta.columns.locker:
                        column.last_updated = Date.now()
                    continue
                elif column.last_updated >= Date.now() - TOO_OLD:
                    continue
                try:
                    self._update_cardinality(column)
                    if DEBUG and not column.table.startswith(TEST_TABLE_PREFIX):
                        Log.note("updated {{column.name}}", column=column)
                except Exception, e:
                    Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
        except Exception, e:
            Log.warning("problem in cardinality monitor", cause=e)

def datetime2unix(value):
    try:
        if value == None:
            return None
        elif isinstance(value, datetime):
            epoch = datetime(1970, 1, 1)
            diff = value - epoch
            return diff.total_seconds()
        elif isinstance(value, date):
            epoch = date(1970, 1, 1)
            diff = value - epoch
            return diff.total_seconds()
        else:
            from pyLibrary.debugs.logs import Log
            Log.error("Can not convert {{value}} of type {{type}}", value=value, type=value.__class__)
    except Exception, e:
        from pyLibrary.debugs.logs import Log
        Log.error("Can not convert {{value}}", value=value, cause=e)

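# Hedged usage sketch (not from the original source): seconds since the Unix epoch
#
#     from datetime import datetime
#     datetime2unix(datetime(1970, 1, 1, 0, 0, 1))   # expected 1.0
#     datetime2unix(None)                            # expected None
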
def typed_encode(value):
    """
    pypy DOES NOT OPTIMIZE GENERATOR CODE WELL
    """
    try:
        _buffer = UnicodeBuilder(1024)
        _typed_encode(value, _buffer)
        output = _buffer.build()
        return output
    except Exception, e:
        # THE PRETTY JSON WILL PROVIDE MORE DETAIL ABOUT THE SERIALIZATION CONCERNS
        from pyLibrary.debugs.logs import Log
        Log.warning("Serialization of JSON problems", e)
        try:
            return pretty_json(value)
        except Exception, f:
            Log.error("problem serializing object", f)

def _tuple(template, data, fields, depth, output):
    deep_path = None
    deep_fields = DictList()
    for d in data:
        record = template
        for f in fields:
            index, children, record = _tuple_deep(d, f, depth, record)
            if index:
                path = f.value[0:index:]
                deep_fields.append(f)
                if deep_path and path != deep_path:
                    Log.error("Dangerous to select into more than one branch at time")
        if not children:
            output.append(record)
        else:
            _tuple(record, children, deep_fields, depth + 1, output)

    return output

def __getitem__(self, index):
    offset = index - self.start
    if offset < 0:
        Log.error("Can not go in reverse on stream index=={{index}}", index=index)

    if self._mark == -1:
        while self.buffer_length <= offset:
            self.start += len(self.buffer)
            offset = index - self.start
            self.buffer = self.get_more()
            self.buffer_length = len(self.buffer)
    else:
        while self.buffer_length <= offset:
            self.buffer += self.get_more()
            self.buffer_length = len(self.buffer)

    return self.buffer[offset]

def process_unittest(source_key, etl_header, buildbot_summary, unittest_log, destination, please_stop=None):
    timer = Timer("Process log {{file}} for {{key}}", {
        "file": etl_header.name,
        "key": source_key
    })
    try:
        with timer:
            summary = accumulate_logs(source_key, etl_header.name, unittest_log, please_stop)
    except Exception, e:
        Log.error("Problem processing {{key}}", key=source_key, cause=e)
        summary = None

def parse_partition(part):
    for p in part.partitions:
        if part.index:
            p.index = part.index  # COPY INDEX DOWN
        parse_partition(p)
        p.value = coalesce(p.value, p.name)
        p.parent = part

    if not part.esfilter:
        if len(part.partitions) > 100:
            Log.error(
                "Must define an esfilter on {{name}} there are too many partitions ({{num_parts}})",
                name=part.name,
                num_parts=len(part.partitions)
            )

        # DEFAULT esfilter IS THE UNION OF ALL CHILD FILTERS
        if part.partitions:
            part.esfilter = {"or": part.partitions.esfilter}

def assertRaises(self, problem, function, *args, **kwargs):
    try:
        function(*args, **kwargs)
    except Exception, e:
        e = Except.wrap(e)
        if isinstance(problem, basestring):
            if problem in e:
                return
            Log.error(
                "expecting an exception returning {{problem|quote}} got something else instead",
                problem=problem,
                cause=e
            )
        elif not isinstance(e, problem):
            Log.error("expecting an exception of type {{type}} to be raised", type=problem)
        else:
            return

def almost_equal(first, second, digits=None, places=None, delta=None):
    try:
        if first == second:
            return True

        if delta is not None:
            if abs(first - second) <= delta:
                return True
        else:
            places = coalesce(places, digits, 18)
            diff = math.log10(abs(first - second))
            if diff < Math.ceiling(math.log10(first)) - places:
                return True

        return False
    except Exception, e:
        from pyLibrary.debugs.logs import Log
        Log.error("problem comparing", cause=e)

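# Hedged usage sketch (not from the original source): compare to a number of
# significant places, or within an absolute delta.
#
#     almost_equal(3.14159, 3.14160, places=4)   # expected True (agree to ~4 places)
#     almost_equal(100.0, 100.4, delta=0.5)      # expected True (|diff| <= delta)
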
def floor(self, interval=None):
    if not isinstance(interval, Duration):
        from pyLibrary.debugs.logs import Log
        Log.error("Expecting an interval as a Duration object")

    output = Duration(0)
    if interval.month:
        if self.month:
            output.month = int(Math.floor(self.month / interval.month) * interval.month)
            output.milli = output.month * MILLI_VALUES.month
            return output

        # A MONTH OF DURATION IS BIGGER THAN A CANONICAL MONTH
        output.month = int(Math.floor(self.milli * 12 / MILLI_VALUES["year"] / interval.month) * interval.month)
        output.milli = output.month * MILLI_VALUES.month
    else:
        output.milli = Math.floor(self.milli / (interval.milli)) * (interval.milli)
    return output

def value2json(obj, pretty=False, sort_keys=False):
    try:
        json = json_encoder(obj, pretty=pretty)
        if json == None:
            Log.note(str(type(obj)) + " is not valid{{type}}JSON", type=" (pretty) " if pretty else " ")
            Log.error("Not valid JSON: " + str(obj) + " of type " + str(type(obj)))
        return json
    except Exception, e:
        e = Except.wrap(e)
        with suppress_exception:
            json = pypy_json_encode(obj)
            return json

        Log.error("Can not encode into JSON: {{value}}", value=repr(obj), cause=e)

def pipe2value(value):
    type = value[0]
    if type == '0':
        return None
    if type == 'n':
        return value2number(value[1::])

    if type != 's' and type != 'a':
        Log.error("unknown pipe type ({{type}}) in {{value}}", type=type, value=value)

    # EXPECTING MOST STRINGS TO NOT HAVE ESCAPED CHARS
    output = _unPipe(value)
    if type == 's':
        return output

    return [pipe2value(v) for v in output.split("|")]

def getSelect(self, **kwargs):
    if self.fields:
        if len(self.fields) == 1:
            return Dict(
                name=self.full_name,
                value=self.fields[0],
                aggregate="none"
            )
        else:
            return Dict(
                name=self.full_name,
                value=self.fields,
                aggregate="none"
            )

    domain = self.getDomain(**kwargs)
    if not domain.getKey:
        Log.error("Should not happen")
    if not domain.NULL:
        Log.error("Should not happen")

    return Dict(
        name=self.full_name,
        domain=domain,
        aggregate="none"
    )

def _iter():
    g = 0
    out = DictList()
    try:
        for i, d in enumerate(data):
            out.append(d)
            if (i + 1) % max_size == 0:
                yield g, out
                g += 1
                out = DictList()
        if out:
            yield g, out
    except Exception, e:
        e = Except.wrap(e)
        if out:
            # AT LEAST TRY TO RETURN WHAT HAS BEEN PROCESSED SO FAR
            yield g, out
        Log.error("Problem inside jx.groupby", e)

def _all_default(d, default, seen=None):
    """
    ANY VALUE NOT SET WILL BE SET BY THE default
    THIS IS RECURSIVE
    """
    if default is None:
        return
    if isinstance(default, Dict):
        default = object.__getattribute__(default, "_dict")  # REACH IN AND GET THE dict
        # from pyLibrary.debugs.logs import Log
        # Log.error("strictly dict (or object) allowed: got {{type}}", type=default.__class__.__name__)

    for k, default_value in default.items():
        default_value = unwrap(default_value)  # TWO DIFFERENT Dicts CAN SHARE id() BECAUSE THEY ARE SHORT LIVED
        existing_value = _get_attr(d, [k])

        if existing_value == None:
            if default_value != None:
                if isinstance(default_value, Mapping):
                    df = seen.get(id(default_value))
                    if df is not None:
                        _set_attr(d, [k], df)
                    else:
                        copy_dict = {}
                        seen[id(default_value)] = copy_dict
                        _set_attr(d, [k], copy_dict)
                        _all_default(copy_dict, default_value, seen)
                else:
                    # ASSUME PRIMITIVE (OR LIST, WHICH WE DO NOT COPY)
                    try:
                        _set_attr(d, [k], default_value)
                    except Exception, e:
                        if PATH_NOT_FOUND not in e:
                            from pyLibrary.debugs.logs import Log
                            Log.error("Can not set attribute {{name}}", name=k, cause=e)
        elif isinstance(existing_value, list) or isinstance(default_value, list):
            _set_attr(d, [k], listwrap(existing_value) + listwrap(default_value))

def __init__(
    self,
    exchange,  # name of the Pulse exchange
    topic,  # message name pattern to subscribe to ('#' is wildcard)
    target=None,  # WILL BE CALLED WITH PULSE PAYLOADS AND ack() IF COMPLETED WITHOUT EXCEPTION
    target_queue=None,  # (aka self.queue) WILL BE FILLED WITH PULSE PAYLOADS
    host='pulse.mozilla.org',  # url to connect
    port=5671,  # tcp port
    user=None,
    password=None,
    vhost="/",
    start=0,  # USED AS STARTING POINT FOR ASSIGNING THE _meta.count ATTRIBUTE
    ssl=True,
    applabel=None,
    heartbeat=False,  # True to also get the Pulse heartbeat message
    durable=False,  # True to keep queue after shutdown
    serializer='json',
    broker_timezone='GMT',
    settings=None
):
    global count
    count = coalesce(start, 0)

    self.target_queue = target_queue
    self.pulse_target = target
    if (target_queue == None and target == None) or (target_queue != None and target != None):
        Log.error("Expecting a queue (for fast digesters) or a target (for slow digesters)")
    Thread.__init__(self, name="Pulse consumer for " + settings.exchange, target=self._worker)
    self.settings = settings
    settings.callback = self._got_result
    settings.user = coalesce(settings.user, settings.username)
    settings.applabel = coalesce(settings.applable, settings.queue, settings.queue_name)
    settings.topic = topic

    self.pulse = ModifiedGenericConsumer(settings, connect=True, **settings)
    self.start()

def __init__(
    self,
    index,  # NAME OF THE INDEX, EITHER ALIAS NAME OR FULL VERSION NAME
    type=None,  # SCHEMA NAME, (DEFAULT TO TYPE IN INDEX, IF ONLY ONE)
    alias=None,
    explore_metadata=True,  # PROBING THE CLUSTER FOR METADATA IS ALLOWED
    read_only=True,
    tjson=False,  # STORED AS TYPED JSON
    timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
    debug=False,  # DO NOT SHOW THE DEBUG STATEMENTS
    settings=None
):
    if index == None:
        Log.error("not allowed")
    if index == alias:
        Log.error("must have a unique index name")

    self.cluster_state = None
    self.debug = debug
    self.settings = settings
    self.cluster = Cluster(settings)

    try:
        full_index = self.get_index(index)
        if full_index and alias == None:
            settings.alias = settings.index
            settings.index = full_index
        if full_index == None:
            Log.error("not allowed")
        if type == None:
            # NO type PROVIDED, MAYBE THERE IS A SUITABLE DEFAULT?
            with self.cluster.metadata_locker:
                index_ = self.cluster._metadata.indices[self.settings.index]
            if not index_:
                indices = self.cluster.get_metadata().indices
                index_ = indices[self.settings.index]

            candidate_types = list(index_.mappings.keys())
            if len(candidate_types) != 1:
                Log.error("Expecting `type` parameter")
            self.settings.type = type = candidate_types[0]
    except Exception, e:
        # EXPLORING (get_metadata()) IS NOT ALLOWED ON THE PUBLIC CLUSTER
        Log.error("not expected", cause=e)

def post_json(url, **kwargs):
    """
    ASSUME RESPONSE IS IN JSON
    """
    if b"json" in kwargs:
        kwargs[b"data"] = convert.unicode2utf8(convert.value2json(kwargs[b"json"]))
    elif b'data' in kwargs:
        kwargs[b"data"] = convert.unicode2utf8(convert.value2json(kwargs[b"data"]))
    else:
        Log.error("Expecting `json` parameter")

    response = post(url, **kwargs)
    c = response.content
    try:
        details = convert.json2value(convert.utf82unicode(c))
    except Exception, e:
        Log.error("Unexpected return value {{content}}", content=c, cause=e)

    return details

def value2url(value):
    if value == None:
        Log.error("")

    if isinstance(value, Mapping):
        output = "&".join([
            value2url(k) + "=" + (value2url(v) if isinstance(v, basestring) else value2url(value2json(v)))
            for k, v in value.items()
        ])
    elif isinstance(value, unicode):
        output = "".join([_map2url[c] for c in unicode2latin1(value)])
    elif isinstance(value, str):
        output = "".join([_map2url[c] for c in value])
    elif hasattr(value, "__iter__"):
        output = ",".join(value2url(v) for v in value)
    else:
        output = unicode(value)
    return output

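# Hedged usage sketch (not from the original source): a Mapping becomes a query
# string, with each value escaped through _map2url, e.g.
#
#     value2url({"a": 1, "b": "x y"})   # expected something like "a=1&b=x%20y"
#                                       # (exact escaping depends on _map2url)
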
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER
    """
    command = wrap(command)

    # GET IDS OF DOCUMENTS
    results = self._es.search({
        "fields": [],
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": _normalize_where(command.where, self)
        }},
        "size": 200000
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = DictList()
    for k, v in command.set.items():
        if not is_keyword(k):
            Log.error("Only support simple paths for now")
        scripts.append("ctx._source." + k + " = " + expressions.qb_expression_to_ruby(v) + ";\n")
    script = "".join(scripts)

    if results.hits.hits:
        command = []
        for id in results.hits.hits._id:
            command.append({"update": {"_id": id}})
            command.append({"script": script})
        content = ("\n".join(convert.value2json(c) for c in command) + "\n").encode('utf-8')
        self._es.cluster._post(
            self._es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"}
        )

def assign(source, destination):
    for i, f in enumerate(prefix):
        source = source.get(f)
        if source is None:
            return 0, None
        if isinstance(source, list):
            return depth + i + 1, source

    f = field.value.last()
    try:
        if not f:  # NO NAME FIELD INDICATES SELECT VALUE
            destination[name] = source
        else:
            destination[name] = source.get(f)
    except Exception, e:
        Log.error("{{value}} does not have {{field}} property", value=source, field=f, cause=e)

def unique_index(data, keys=None, fail_on_dup=True):
    """
    RETURN dict THAT USES KEYS TO INDEX DATA
    ONLY ONE VALUE ALLOWED PER UNIQUE KEY
    """
    o = UniqueIndex(listwrap(keys), fail_on_dup=fail_on_dup)

    for d in data:
        try:
            o.add(d)
        except Exception, e:
            o.add(d)
            Log.error(
                "index {{index}} is not unique {{key}} maps to both {{value1}} and {{value2}}",
                index=keys,
                key=select([d], keys)[0],
                value1=o[d],
                value2=d,
                cause=e
            )
    return o

def query(self, q):
    frum = self
    if is_aggs(q):
        frum = list_aggs(frum.data, q)
    else:  # SETOP
        try:
            if q.filter != None or q.esfilter != None:
                Log.error("use 'where' clause")
        except AttributeError, e:
            pass

        if q.where is not TRUE_FILTER and not isinstance(q.where, TrueOp):
            frum = frum.filter(q.where)

        if q.sort:
            frum = frum.sort(q.sort)

        if q.select:
            frum = frum.select(q.select)

def delete(self, filter):
    self.cluster.get_metadata()

    if self.cluster.cluster_state.version.number.startswith("0.90"):
        query = {"filtered": {
            "query": {"match_all": {}},
            "filter": filter
        }}
    elif self.cluster.cluster_state.version.number.startswith("1."):
        query = {"query": {"filtered": {
            "query": {"match_all": {}},
            "filter": filter
        }}}
    else:
        raise NotImplementedError

    if self.debug:
        Log.note("Delete bugs:\n{{query}}", query=query)

    keep_trying = True
    while keep_trying:
        result = self.cluster.delete(
            self.path + "/_query",
            data=convert.value2json(query),
            timeout=60
        )
        keep_trying = False
        for name, status in result._indices.items():
            if status._shards.failed > 0:
                if status._shards.failures[0].reason.find("rejected execution (queue capacity ") >= 0:
                    keep_trying = True
                    Thread.sleep(seconds=5)
                    break

        if not keep_trying:
            for name, status in result._indices.items():
                if status._shards.failed > 0:
                    Log.error(
                        "ES shard(s) report Failure to delete from {{index}}: {{message}}. Query was {{query}}",
                        index=name,
                        query=query,
                        message=status._shards.failures[0].reason
                    )

def add(self, val):
    val = dictwrap(val)
    key = value2key(self._keys, val)
    if key == None:
        Log.error("Expecting key to be not None")

    d = self._data.get(key)
    if d is None:
        self._data[key] = unwrap(val)
        self.count += 1
    elif d is not val:
        if self.fail_on_dup:
            Log.error("key {{key|json}} already filled", key=key)
        else:
            Log.warning(
                "key {{key|json}} already filled\nExisting\n{{existing|json|indent}}\nValue\n{{value|json|indent}}",
                key=key,
                existing=d,
                value=val
            )

def _all_lines(self, encoding="utf8"):
    try:
        iterator = self.raw.stream(4096, decode_content=False)

        if self.headers.get('content-encoding') == 'gzip':
            return ibytes2ilines(icompressed2ibytes(iterator), encoding=encoding)
        elif self.headers.get('content-type') == 'application/zip':
            return ibytes2ilines(icompressed2ibytes(iterator), encoding=encoding)
        elif self.url.endswith(".gz"):
            return ibytes2ilines(icompressed2ibytes(iterator), encoding=encoding)
        else:
            return ibytes2ilines(iterator, encoding=encoding, closer=self.close)
    except Exception, e:
        Log.error("Can not read content", cause=e)

def index(data, keys=None):
    # return dict that uses keys to index data
    o = Index(keys)

    if isinstance(data, Cube):
        if data.edges[0].name == keys[0]:
            # QUICK PATH
            names = list(data.data.keys())
            for d in (
                set_default(dot.zip(names, r), {keys[0]: p})
                for r, p in zip(zip(*data.data.values()), data.edges[0].domain.partitions.value)
            ):
                o.add(d)
            return o
        else:
            Log.error("Can not handle indexing cubes at this time")

    for d in data:
        o.add(d)
    return o

def parse_time_expression(value):
    def simple_date(sign, dig, type, floor):
        if dig or sign:
            from pyLibrary.debugs.logs import Log
            Log.error("can not accept a multiplier on a datetime")

        if floor:
            return Date(type).floor(Duration(floor))
        else:
            return Date(type)

    terms = re.match(r'(\d*[|\w]+)\s*([+-]\s*\d*[|\w]+)*', value).groups()

    sign, dig, type = re.match(r'([+-]?)\s*(\d*)([|\w]+)', terms[0]).groups()
    if "|" in type:
        type, floor = type.split("|")
    else:
        floor = None

    if type in MILLI_VALUES.keys():
        value = Duration(dig + type)
    else:
        value = simple_date(sign, dig, type, floor)

    for term in terms[1:]:
        if not term:
            continue
        sign, dig, type = re.match(r'([+-])\s*(\d*)([|\w]+)', term).groups()
        if "|" in type:
            type, floor = type.split("|")
        else:
            floor = None

        op = {"+": "__add__", "-": "__sub__"}[sign]
        if type in MILLI_VALUES.keys():
            if floor:
                from pyLibrary.debugs.logs import Log
                Log.error("floor (|) of duration not accepted")
            value = value.__getattribute__(op)(Duration(dig + type))
        else:
            value = value.__getattribute__(op)(simple_date(sign, dig, type, floor))

    return value

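# Hedged usage sketch (assumes Date/Duration accept names like "today", "now", "day", "hour"):
#
#     parse_time_expression(u"today-2day")   # expected: a Date two days before today
#     parse_time_expression(u"now|hour")     # expected: now, floored to the hour
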