def go(self):
    """
    ACTIVATE SIGNAL (DOES NOTHING IF SIGNAL IS ALREADY ACTIVATED)
    """
    DEBUG and self._name and Log.note("GO! {{name|quote}}", name=self.name)

    if self._go:
        return

    with self.lock:
        if self._go:
            return
        self._go = True

    DEBUG and self._name and Log.note("internal GO! {{name|quote}}", name=self.name)
    jobs, self.job_queue = self.job_queue, None
    threads, self.waiting_threads = self.waiting_threads, None

    if threads:
        DEBUG and self._name and Log.note("Release {{num}} threads", num=len(threads))
        for t in threads:
            t.release()

    if jobs:
        for j in jobs:
            try:
                j()
            except Exception as e:
                Log.warning("Trigger on Signal.go() failed!", cause=e)
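# Hedged usage sketch (not part of the library source): it only uses the Signal
# API visible in this section (the constructor, on_go() and go()). The function
# name below is hypothetical.
def _example_signal_usage():
    done = Signal("example signal")
    done.on_go(lambda: Log.note("signal fired"))  # job queued until activation
    done.go()  # releases any waiting threads and runs the queued jobs
    done.go()  # second call is a no-op; the signal is already activated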
def __exit__(self, a, b, c):
    if self.waiting:
        if DEBUG:
            _Log.note("signaling {{num}} waiters", num=len(self.waiting))
        waiter = self.waiting.pop()
        waiter.go()
    self.lock.release()
def _reader(self, name, pipe, recieve, please_stop):
    try:
        line = "dummy"
        while not please_stop and self.service.returncode is None and line:
            line = pipe.readline().rstrip()
            if line:
                recieve.add(line)
                if self.debug:
                    Log.note("{{process}} ({{name}}): {{line}}", name=name, process=self.name, line=line)

        # GRAB A FEW MORE LINES
        max = 100
        while max:
            try:
                line = pipe.readline().rstrip()
                if line:
                    max = 100
                    recieve.add(line)
                    if self.debug:
                        Log.note("{{process}} ({{name}}): {{line}}", name=name, process=self.name, line=line)
                else:
                    max -= 1
            except Exception:
                break
    finally:
        pipe.close()

    recieve.add(THREAD_STOP)
def __enter__(self):
    Log.note("SingleInstance.lockfile = " + self.lockfile)
    if sys.platform == 'win32':
        try:
            # file already exists, we try to remove (in case previous execution was interrupted)
            if os.path.exists(self.lockfile):
                os.unlink(self.lockfile)
            self.fd = os.open(self.lockfile, os.O_CREAT | os.O_EXCL | os.O_RDWR)
        except Exception as e:
            Log.alarm("Another instance is already running, quitting.")
            sys.exit(-1)
    else:  # non Windows
        import fcntl

        self.fp = open(self.lockfile, 'w')
        try:
            fcntl.lockf(self.fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            Log.note(
                "\n"
                "**********************************************************************\n"
                "** Another instance is already running, quitting.\n"
                "**********************************************************************\n"
            )
            sys.exit(-1)
    self.initialized = True
def _upsert_column(self, c):
    # ASSUMING THE self.meta.columns.locker IS HELD
    existing_columns = self.meta.columns.find(c.es_index, c.names["."])

    if not existing_columns:
        self.meta.columns.add(c)
        self.todo.add(c)

        if ENABLE_META_SCAN:
            if DEBUG:
                Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column)
            # MARK meta.columns AS DIRTY TOO
            cols = self.meta.columns.find("meta.columns", None)
            for cc in cols:
                cc.partitions = cc.cardinality = None
                cc.last_updated = Date.now()
            self.todo.extend(cols)
    else:
        canonical = existing_columns[0]
        if canonical is not c:
            set_default(c.names, canonical.names)
            for key in Column.__slots__:
                canonical[key] = c[key]
        if DEBUG:
            Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column)
        self.todo.add(canonical)
def disconnect():
    with suppress_exception:
        self.target_queue.close()
        Log.note("stop put into queue")
    self.pulse.disconnect()
    Log.note("pulse listener was given a disconnect()")
def column_query(self, sql, param=None):
    """
    RETURN RESULTS IN [column][row_num] GRID
    """
    self._execute_backlog()
    try:
        old_cursor = self.cursor
        if not old_cursor:  # ALLOW NON-TRANSACTIONAL READS
            self.cursor = self.db.cursor()
            self.cursor.execute("SET TIME_ZONE='+00:00'")
            self.cursor.close()
            self.cursor = self.db.cursor()

        if param:
            sql = expand_template(sql, self.quote_param(param))
        sql = self.preamble + outdent(sql)
        if self.debug:
            Log.note("Execute SQL:\n{{sql}}", sql=indent(sql))

        self.cursor.execute(sql)
        grid = [[utf8_to_unicode(c) for c in row] for row in self.cursor]
        # columns = [utf8_to_unicode(d[0]) for d in coalesce(self.cursor.description, [])]
        result = zip(*grid)

        if not old_cursor:  # CLEANUP AFTER NON-TRANSACTIONAL READS
            self.cursor.close()
            self.cursor = None

        return result
    except Exception as e:
        if isinstance(e, InterfaceError) or e.message.find("InterfaceError") >= 0:
            Log.error("Did you close the db connection?", e)
        Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
def not_monitor(self, please_stop):
    Log.alert("metadata scan has been disabled")
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        column = self.todo.pop()
        if column == THREAD_STOP:
            break

        if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
            DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
            column.last_updated = Date.now()
            continue
        elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
            # DO NOT UPDATE FRESH COLUMN METADATA
            DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now() - Date(column.last_updated)).seconds)
            continue

        with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05):
            if untype_path(column.name) in ["build.type", "run.type"]:
                try:
                    self._update_cardinality(column)
                except Exception as e:
                    Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            else:
                column.last_updated = Date.now()
def not_monitor(self, please_stop):
    Log.alert("metadata scan has been disabled")
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        c = self.todo.pop()
        if c == THREAD_STOP:
            break
        if not c.last_updated or c.last_updated >= Date.now() - TOO_OLD:
            continue

        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "partitions",
                ],
                "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
            })
        if DEBUG:
            Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
def forall(self, sql, param=None, _execute=None):
    assert _execute
    num = 0

    self._execute_backlog()
    try:
        old_cursor = self.cursor
        if not old_cursor:  # ALLOW NON-TRANSACTIONAL READS
            self.cursor = self.db.cursor()

        if param:
            sql = expand_template(sql, self.quote_param(param))
        sql = self.preamble + outdent(sql)
        if self.debug:
            Log.note("Execute SQL:\n{{sql}}", sql=indent(sql))
        self.cursor.execute(sql)

        columns = tuple([utf8_to_unicode(d[0]) for d in self.cursor.description])
        for r in self.cursor:
            num += 1
            _execute(wrap(dict(zip(columns, [utf8_to_unicode(c) for c in r]))))

        if not old_cursor:  # CLEANUP AFTER NON-TRANSACTIONAL READS
            self.cursor.close()
            self.cursor = None

    except Exception as e:
        Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)

    return num
def delete_record(self, filter):
    if self.settings.read_only:
        Log.error("Index opened in read only mode, no changes allowed")
    self.cluster.get_metadata()

    if self.cluster.cluster_state.version.number.startswith("0.90"):
        query = {"filtered": {
            "query": {"match_all": {}},
            "filter": filter
        }}
    elif self.cluster.cluster_state.version.number.startswith("1."):
        query = {"query": {"filtered": {
            "query": {"match_all": {}},
            "filter": filter
        }}}
    else:
        raise NotImplementedError

    if self.debug:
        Log.note("Delete bugs:\n{{query}}", query=query)

    result = self.cluster.delete(
        self.path + "/_query",
        data=convert.value2json(query),
        timeout=600,
        params={"consistency": self.settings.consistency}
    )

    for name, status in result._indices.items():
        if status._shards.failed > 0:
            Log.error("Failure to delete from {{index}}", index=name)
def __init__(self, name, params, cwd=None, env=None, debug=False, shell=False, bufsize=-1):
    self.name = name
    self.service_stopped = Signal("stopped signal for " + strings.quote(name))
    self.stdin = Queue("stdin for process " + strings.quote(name), silent=True)
    self.stdout = Queue("stdout for process " + strings.quote(name), silent=True)
    self.stderr = Queue("stderr for process " + strings.quote(name), silent=True)

    try:
        self.debug = debug or DEBUG
        self.service = service = subprocess.Popen(
            params,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=bufsize,
            cwd=cwd if isinstance(cwd, (basestring, NullType, NoneType)) else cwd.abspath,
            env=unwrap(set_default(env, os.environ)),
            shell=shell
        )

        self.please_stop = Signal()
        self.please_stop.on_go(self._kill)
        self.thread_locker = Lock()
        self.children = [
            Thread.run(self.name + " stdin", self._writer, service.stdin, self.stdin, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " stdout", self._reader, "stdout", service.stdout, self.stdout, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " stderr", self._reader, "stderr", service.stderr, self.stderr, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " waiter", self._monitor, parent_thread=self),
        ]
    except Exception as e:
        Log.error("Can not call", e)

    if self.debug:
        Log.note("{{process}} START: {{command}}", process=self.name, command=" ".join(map(strings.quote, params)))
def _execute_backlog(self):
    if not self.backlog:
        return

    (backlog, self.backlog) = (self.backlog, [])
    if self.db.__module__.startswith("pymysql"):
        # BUG IN PYMYSQL: CAN NOT HANDLE MULTIPLE STATEMENTS
        # https://github.com/PyMySQL/PyMySQL/issues/157
        for b in backlog:
            sql = self.preamble + b
            try:
                if self.debug:
                    Log.note("Execute SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(b)
            except Exception as e:
                Log.error("Can not execute sql:\n{{sql}}", sql=sql, cause=e)

        self.cursor.close()
        self.cursor = self.db.cursor()
    else:
        for i, g in jx.groupby(backlog, size=MAX_BATCH_SIZE):
            sql = self.preamble + ";\n".join(g)
            try:
                if self.debug:
                    Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(sql)
                self.cursor.close()
                self.cursor = self.db.cursor()
            except Exception as e:
                Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
def write_profiles(main_thread_profile):
    if cprofiler_stats is None:
        return

    from pyLibrary import convert
    from mo_files import File

    cprofiler_stats.add(pstats.Stats(main_thread_profile.cprofiler))
    stats = cprofiler_stats.pop_all()

    Log.note("aggregating {{num}} profile stats", num=len(stats))
    acc = stats[0]
    for s in stats[1:]:
        acc.add(s)

    stats = [
        {
            "num_calls": d[1],
            "self_time": d[2],
            "total_time": d[3],
            "self_time_per_call": d[2] / d[1],
            "total_time_per_call": d[3] / d[1],
            "file": (f[0] if f[0] != "~" else "").replace("\\", "/"),
            "line": f[1],
            "method": f[2].lstrip("<").rstrip(">")
        }
        for f, d in iteritems(acc.stats)
    ]
    stats_file = File(FILENAME, suffix=convert.datetime2string(datetime.now(), "_%Y%m%d_%H%M%S"))
    stats_file.write(convert.list2tab(stats))
    Log.note("profile written to {{filename}}", filename=stats_file.abspath)
def open_test_instance(name, filename=None, es=None, kwargs=None):
    if filename != None:
        Log.note(
            "Using {{filename}} as {{type}}",
            filename=filename,
            type=name
        )
        return FakeES(filename=filename)
    else:
        Log.note(
            "Using ES cluster at {{host}} as {{type}}",
            host=es.host,
            type=name
        )
        cluster = Cluster(es)
        try:
            old_index = cluster.get_index(es)
            cluster.delete_index(old_index.settings.index)
        except Exception as e:
            if "Can not find index" not in e:
                Log.error("unexpected", cause=e)

        output = cluster.create_index(limit_replicas=True, limit_replicas_warning=False, kwargs=es)
        output.delete_all_but_self()
        output.add_alias(es.index)
        return output
def delete_index(self, index_name):
    if not isinstance(index_name, unicode):
        Log.error("expecting an index name")
    if self.debug:
        Log.note("Deleting index {{index}}", index=index_name)

    # REMOVE ALL ALIASES TOO
    aliases = [a for a in self.get_aliases() if a.index == index_name and a.alias != None]
    if aliases:
        self.post(
            path="/_aliases",
            data={"actions": [{"remove": a} for a in aliases]}
        )

    url = self.settings.host + ":" + unicode(self.settings.port) + "/" + index_name
    try:
        response = http.delete(url)
        if response.status_code != 200:
            Log.error("Expecting a 200, got {{code}}", code=response.status_code)
        details = mo_json.json2value(utf82unicode(response.content))
        if self.debug:
            Log.note("delete response {{response}}", response=details)
        return response
    except Exception as e:
        Log.error("Problem with call to {{url}}", url=url, cause=e)
def _got_result(self, data, message):
    global count

    data = wrap(data)
    with count_locker:
        Log.note("{{count}} from {{exchange}}", count=count, exchange=self.pulse.exchange)
        data._meta.count = count
        data._meta.exchange = self.pulse.exchange
        count += 1

    if self.settings.debug:
        Log.note("{{data}}", data=data)

    if self.target_queue != None:
        try:
            self.target_queue.add(data)
            message.ack()
        except Exception as e:
            e = Except.wrap(e)
            if not self.target_queue.closed:  # EXPECTED TO HAPPEN, THIS THREAD MAY HAVE BEEN AWAY FOR A WHILE
                raise e
    else:
        try:
            self.pulse_target(data)
            message.ack()
        except Exception as e:
            Log.warning("Problem processing pulse (see `data` in structured log)", data=data, cause=e)
def __init__(self, conn=None, tuid_service=None, start_workers=True, new_table=False, kwargs=None):
    try:
        self.config = kwargs
        self.conn = conn if conn else sql.Sql(self.config.database.name)
        self.hg_cache = HgMozillaOrg(kwargs=self.config.hg_cache, use_cache=True) if self.config.hg_cache else Null

        self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService(
            kwargs=self.config.tuid, conn=self.conn, clogger=self
        )
        self.rev_locker = Lock()
        self.working_locker = Lock()

        if new_table:
            with self.conn.transaction() as t:
                t.execute("DROP TABLE IF EXISTS csetLog")

        self.init_db()
        self.next_revnum = coalesce(self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1)
        self.csets_todo_backwards = Queue(name="Clogger.csets_todo_backwards")
        self.deletions_todo = Queue(name="Clogger.deletions_todo")
        self.maintenance_signal = Signal(name="Clogger.maintenance_signal")

        if 'tuid' in self.config:
            self.config = self.config.tuid

        self.disable_backfilling = False
        self.disable_tipfilling = False
        self.disable_deletion = False
        self.disable_maintenance = False

        self.backfill_thread = None
        self.tipfill_thread = None
        self.deletion_thread = None
        self.maintenance_thread = None

        # Make sure we are filled before allowing queries
        numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
        if numrevs < MINIMUM_PERMANENT_CSETS:
            Log.note("Filling in csets to hold {{minim}} csets.", minim=MINIMUM_PERMANENT_CSETS)
            oldest_rev = 'tip'
            with self.conn.transaction() as t:
                tmp = t.query("SELECT min(revnum), revision FROM csetLog").data[0][1]
                if tmp:
                    oldest_rev = tmp
            self._fill_in_range(
                MINIMUM_PERMANENT_CSETS - numrevs,
                oldest_rev,
                timestamp=False
            )

        Log.note(
            "Table is filled with at least {{minim}} entries.",
            minim=MINIMUM_PERMANENT_CSETS
        )

        if start_workers:
            self.start_workers()
    except Exception as e:
        Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))
def copy(self, keys, source, sample_only_filter=None, sample_size=None, done_copy=None):
    """
    :param keys: THE KEYS TO LOAD FROM source
    :param source: THE SOURCE (USUALLY S3 BUCKET)
    :param sample_only_filter: SOME FILTER, IN CASE YOU DO NOT WANT TO SEND EVERYTHING
    :param sample_size: FOR RANDOM SAMPLE OF THE source DATA
    :param done_copy: CALLBACK, ADDED TO queue, TO FINISH THE TRANSACTION
    :return: LIST OF SUB-keys PUSHED INTO ES
    """
    num_keys = 0
    queue = None
    pending = []  # FOR WHEN WE DO NOT HAVE QUEUE YET
    for key in keys:
        timer = Timer("Process {{key}}", param={"key": key})
        try:
            with timer:
                for rownum, line in enumerate(source.read_lines(strip_extension(key))):
                    if not line:
                        continue

                    if rownum > 0 and rownum % 1000 == 0:
                        Log.note("Ingested {{num}} records from {{key}} in bucket {{bucket}}", num=rownum, key=key, bucket=source.name)

                    row, please_stop = fix(rownum, line, source, sample_only_filter, sample_size)
                    num_keys += 1

                    if queue == None:
                        queue = self._get_queue(row)
                        if queue == None:
                            pending.append(row)
                            if len(pending) > 1000:
                                self._get_queue(row)
                                Log.error("first 1000 (key={{key}}) records have no indication what index to put data", key=tuple(keys)[0])
                            continue
                        elif queue is DATA_TOO_OLD:
                            break
                        if pending:
                            queue.extend(pending)
                            pending = []

                    queue.add(row)

                    if please_stop:
                        break
        except Exception as e:
            done_copy = None
            Log.warning("Could not process {{key}} after {{duration|round(places=2)}} seconds", key=key, duration=timer.duration.seconds, cause=e)

    if done_copy:
        if queue == None:
            done_copy()
        else:
            queue.add(done_copy)

    if pending:
        Log.error("Did not find an index to place the data for key={{key}}", key=tuple(keys)[0])

    Log.note("{{num}} keys from {{key|json}} added", num=num_keys, key=keys)
    return num_keys
def __init__(self, name=None):
    if DEBUG:
        Log.note("New signal {{name|quote}}", name=name)
    self._name = name
    self.lock = _allocate_lock()
    self._go = False
    self.job_queue = None
    self.waiting_threads = None
def add(self, doc, queue=None):
    if queue == None:
        queue = self._get_queue(doc)
        if queue == None:
            Log.note("Document not added: Too old")
            return

    queue.add(doc)
def flush(self):
    try:
        self.cluster.post("/" + self.settings.index + "/_flush", data={"wait_if_ongoing": True, "forced": False})
    except Exception as e:
        if "FlushNotAllowedEngineException" in e:
            Log.note("Flush is ignored")
        else:
            Log.error("Problem flushing", cause=e)
def print_proc_memory_used(self, loc):
    self.prev_mem = self.curr_mem
    self.curr_mem = self.get_process_memory()
    Log.note(
        "Used memory since last call: {{mem}}",
        loc=loc,
        mem=self.curr_mem - self.prev_mem
    )
def create_index(
    self,
    index,
    alias=None,
    create_timestamp=None,
    schema=None,
    limit_replicas=None,
    read_only=False,
    tjson=False,
    kwargs=None
):
    if not alias:
        alias = kwargs.alias = kwargs.index
        index = kwargs.index = proto_name(alias, create_timestamp)

    if kwargs.alias == index:
        Log.error("Expecting index name to conform to pattern")

    if kwargs.schema_file:
        Log.error('schema_file attribute not supported. Use {"$ref":<filename>} instead')

    if schema == None:
        Log.error("Expecting a schema")
    elif isinstance(schema, basestring):
        schema = mo_json.json2value(schema, leaves=True)
    else:
        schema = mo_json.json2value(convert.value2json(schema), leaves=True)

    if limit_replicas:
        # DO NOT ASK FOR TOO MANY REPLICAS
        health = self.get("/_cluster/health")
        if schema.settings.index.number_of_replicas >= health.number_of_nodes:
            Log.warning(
                "Reduced number of replicas: {{from}} requested, {{to}} realized",
                {"from": schema.settings.index.number_of_replicas},
                to=health.number_of_nodes - 1
            )
            schema.settings.index.number_of_replicas = health.number_of_nodes - 1

    self.post(
        "/" + index,
        data=schema,
        headers={"Content-Type": "application/json"}
    )

    # CONFIRM INDEX EXISTS
    while True:
        try:
            state = self.get("/_cluster/state", retry={"times": 5}, timeout=3)
            if index in state.metadata.indices:
                break
            Log.note("Waiting for index {{index}} to appear", index=index)
        except Exception as e:
            Log.warning("Problem while waiting for index {{index}} to appear", index=index, cause=e)
        Till(seconds=1).wait()
    Log.alert("Made new index {{index|quote}}", index=index)

    es = Index(kwargs=kwargs)
    return es
def __exit__(self, type, value, traceback):
    self.end = time()
    self.interval = self.end - self.start

    if self.debug:
        param = wrap(self.param)
        param.duration = timedelta(seconds=self.interval)
        if not self.silent:
            Log.note("Timer end : " + self.template + " (took {{duration}})", self.param, stack_depth=1)
def _replace_ref(node, url):
    if url.path.endswith("/"):
        url.path = url.path[:-1]

    if isinstance(node, Mapping):
        ref = None
        output = {}
        for k, v in node.items():
            if k == "$ref":
                ref = URL(v)
            else:
                output[k] = _replace_ref(v, url)

        if not ref:
            return output

        node = output

        if not ref.scheme and not ref.path:
            # DO NOT TOUCH LOCAL REF YET
            output["$ref"] = ref
            return output

        if not ref.scheme:
            # SCHEME RELATIVE IMPLIES SAME PROTOCOL AS LAST TIME, WHICH
            # REQUIRES THE CURRENT DOCUMENT'S SCHEME
            ref.scheme = url.scheme

        # FIND THE SCHEME AND LOAD IT
        if ref.scheme in scheme_loaders:
            new_value = scheme_loaders[ref.scheme](ref, url)
        else:
            raise Log.error("unknown protocol {{scheme}}", scheme=ref.scheme)

        if ref.fragment:
            new_value = mo_dots.get_attr(new_value, ref.fragment)

        DEBUG and Log.note("Replace {{ref}} with {{new_value}}", ref=ref, new_value=new_value)

        if not output:
            output = new_value
        elif isinstance(output, text_type):
            Log.error("Can not handle set_default({{output}},{{new_value}})", output=output, new_value=new_value)
        else:
            output = unwrap(set_default(output, new_value))

        DEBUG and Log.note("Return {{output}}", output=output)

        return output
    elif isinstance(node, list):
        output = [_replace_ref(n, url) for n in node]
        # if all(p[0] is p[1] for p in zip(output, node)):
        #     return node
        return output

    return node
def _stderr(self, please_stop):
    while not please_stop:
        try:
            line = self.process.stderr.pop(till=please_stop)
            if line == THREAD_STOP:
                please_stop.go()
                break
            Log.note("Error line from {{name}}({{pid}}): {{line}}", line=line, name=self.process.name, pid=self.process.pid)
        except Exception as e:
            Log.error("could not process line", cause=e)
def _daemon(self, please_stop):
    while not please_stop:
        timestamp = Date.now()
        with self.lock:
            decay = METRIC_DECAY_RATE ** (timestamp - self.last_request).seconds
            request_rate = self.request_rate = decay * self.request_rate
            self.last_request = timestamp

        Log.note("{{name}} request rate: {{rate|round(places=2)}} requests per second", name=self.name, rate=request_rate)
        (please_stop | Till(seconds=METRIC_REPORT_PERIOD.seconds)).wait()
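# Hedged illustration (not part of the source): shows the effect of the
# exponential decay used above. METRIC_DECAY_RATE is not shown in this section,
# so 0.9 below is an assumed value for demonstration only.
def _example_decay(rate, seconds_since_last, decay_rate=0.9):
    # e.g. with decay_rate=0.9 and 10 idle seconds: 0.9 ** 10 ~= 0.349 of the old rate remains
    return (decay_rate ** seconds_since_last) * rate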
def __exit__(self, type, value, traceback):
    self.end = time()
    self.interval = self.end - self.start
    self.agg += self.interval
    self.param.duration = timedelta(seconds=self.interval)

    if not self.silent:
        if self.too_long == 0:
            Log.note("Timer end : " + self.template + " (took {{duration}})", default_params=self.param, stack_depth=1)
        elif self.interval >= self.too_long:
            Log.note("Time too long: " + self.template + " ({{duration}})", default_params=self.param, stack_depth=1)
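# Hedged usage sketch (not part of the library): Timer is used as a context
# manager elsewhere in this section, e.g. `with Timer("query"):` and
# `Timer("Process {{key}}", param={"key": key})`. `load_records` is hypothetical.
def _example_timer_usage(key):
    with Timer("Process {{key}}", param={"key": key}, too_long=0.05):
        load_records(key)  # per the __exit__ above, logged only if it takes 0.05s or more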
def stop(self):
    try:
        self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
        self.thread.join()
        Log.note("joined on thread")
    except Exception as e:
        Log.note("problem in threaded logger" + str(e))

    with suppress_exception:
        self.queue.close()
def add_version(flask_app):
    """
    ADD ROUTING TO HANDLE REQUEST FOR /__version__
    :param flask_app: THE (Flask) APP
    :return:
    """
    try:
        rev = coalesce(git.get_revision(), "")
        branch = "https://github.com/mozilla/ActiveData/tree/" + coalesce(git.get_branch())

        version_info = value2json(
            {
                "source": "https://github.com/mozilla/ActiveData/tree/" + rev,
                "branch": branch,
                "commit": rev,
            },
            pretty=True,
        ).encode('utf8') + text("\n")

        Log.note("Using github version\n{{version}}", version=version_info)

        @register_thread
        @cors_wrapper
        def version():
            return Response(
                version_info,
                status=200,
                headers={"Content-Type": mimetype.JSON}
            )

        flask_app.add_url_rule(
            str("/__version__"),
            None,
            version,
            defaults={},
            methods=[str("GET"), str("POST")],
        )
    except Exception as e:
        Log.error("Problem setting up listeners for dockerflow", cause=e)
def make_database(num, please_stop):
    db = sqlite3.connect(
        database=":memory:" if num == 1 else "this is a test file" + text(num) + ".sqlite",
        check_same_thread=False,
        isolation_level=None,
    )
    db.execute("DROP TABLE IF EXISTS digits" + text(num))
    db.execute("DROP TABLE IF EXISTS thousand" + text(num))
    db.execute("DROP TABLE IF EXISTS million" + text(num))
    db.execute("DROP TABLE IF EXISTS billion" + text(num))
    db.execute("CREATE TABLE digits" + text(num) + " (value TEXT)")

    Log.note("ten")
    db.execute(
        "INSERT INTO digits" + text(num) + " VALUES ('1'), ('2'), ('3'), ('4'), ('5'), ('6'), ('7'), ('8'), ('9')"
    )
    db.commit()

    Log.note("thousand")
    db.execute(
        "CREATE TABLE thousand" + text(num) + " AS SELECT d1.value || d2.value || d3.value AS value FROM digits" + text(num) + " d1, digits" + text(num) + " d2, digits" + text(num) + " d3"
    )

    Log.note("million")
    db.execute(
        "CREATE TABLE million" + text(num) + " AS SELECT d1.value || d2.value AS value FROM thousand" + text(num) + " d1, thousand" + text(num) + " d2"
    )

    Log.note("billion")
    db.execute(
        "CREATE TABLE billion" + text(num) + " AS SELECT d1.value || d2.value || d3.value AS value FROM thousand" + text(num) + " d1, thousand" + text(num) + " d2, thousand" + text(num) + " d3"
    )
    Log.note("done")
def safe_size(source):
    """
    READ THE source UP TO SOME LIMIT, THEN COPY TO A FILE IF TOO BIG
    RETURN A str() OR A FileString()
    """
    if source is None:
        return None

    total_bytes = 0
    bytes = []
    b = source.read(MIN_READ_SIZE)
    while b:
        total_bytes += len(b)
        bytes.append(b)
        if total_bytes > MAX_STRING_SIZE:
            try:
                data = FileString(TemporaryFile())
                for bb in bytes:
                    data.write(bb)
                del bytes
                del bb
                b = source.read(MIN_READ_SIZE)
                while b:
                    total_bytes += len(b)
                    data.write(b)
                    b = source.read(MIN_READ_SIZE)
                data.seek(0)
                Log.note("Using file of size {{length}} instead of str()", length=total_bytes)
                return data
            except Exception as e:
                Log.error("Could not write file > {{num}} bytes", num=total_bytes, cause=e)
        b = source.read(MIN_READ_SIZE)
def open_test_instance(name, filename=None, es=None, kwargs=None):
    if filename != None:
        Log.note("Using {{filename}} as {{type}}", filename=filename, type=name)
        return FakeES(filename=filename)
    else:
        Log.note("Using ES cluster at {{host}} as {{type}}", host=es.host, type=name)
        cluster = Cluster(es)
        try:
            old_index = cluster.get_index(es)
            cluster.delete_index(old_index.settings.index)
        except Exception as e:
            if "Can not find index" not in e:
                Log.error("unexpected", cause=e)

        output = cluster.create_index(limit_replicas=True, limit_replicas_warning=False, kwargs=es)
        output.delete_all_but_self()
        output.add_alias(es.index)
        return output
def commit(self):
    with self.lock:
        if self.closed:
            Log.error("Queue is closed, commit not allowed")

        try:
            self._add_pending({"add": {"status.start": self.start}})
            for i in range(self.db.status.start, self.start):
                self._add_pending({"remove": str(i)})

            if self.db.status.end - self.start < 10 or Random.range(0, 1000) == 0:  # FORCE RE-WRITE TO LIMIT FILE SIZE
                # SIMPLY RE-WRITE FILE
                if DEBUG:
                    Log.note("Re-write {{num_keys}} keys to persistent queue", num_keys=self.db.status.end - self.start)
                for k in self.db.keys():
                    if k == "status" or int(k) >= self.db.status.start:
                        continue
                    Log.error("Not expecting {{key}}", key=k)
                self._commit()

                self.file.write(mo_json.value2json({"add": self.db}) + "\n")
            else:
                self._commit()
        except Exception as e:
            raise e
def test_multiple_agg_on_same_field(self):
    if self.not_real_service():
        return

    test = wrap({"query": {
        "from": {
            "type": "elasticsearch",
            "settings": {
                "host": ES_CLUSTER_LOCATION,
                "index": "unittest",
                "type": "test_result"
            }
        },
        "select": [
            {
                "name": "max_bytes",
                "value": "run.stats.bytes",
                "aggregate": "max"
            },
            {
                "name": "count",
                "value": "run.stats.bytes",
                "aggregate": "count"
            }
        ]
    }})

    query = convert.unicode2utf8(convert.value2json(test.query))
    # EXECUTE QUERY
    with Timer("query"):
        response = http.get(self.service_url, data=query)
        if response.status_code != 200:
            error(response)
    result = convert.json2value(convert.utf82unicode(response.all_content))

    Log.note("result\n{{result|indent}}", {"result": result})
def open_test_instance(name, settings):
    if settings.filename:
        Log.note("Using {{filename}} as {{type}}", filename=settings.filename, type=name)
        return FakeES(settings)
    else:
        Log.note("Using ES cluster at {{host}} as {{type}}", host=settings.host, type=name)
        cluster = Cluster(settings)
        try:
            old_index = cluster.get_index(kwargs=settings)
            old_index.delete()
        except Exception as e:
            if "Can not find index" not in e:
                Log.error("unexpected", cause=e)

        es = cluster.create_index(limit_replicas=True, limit_replicas_warning=False, kwargs=settings)
        es.delete_all_but_self()
        es.add_alias(settings.index)
        return es
def add(self, value):
    with self.lock:
        if self.closed:
            Log.error("Queue is closed")

        if value is THREAD_STOP:
            DEBUG and Log.note("Stop is seen in persistent queue")
            self.please_stop.go()
            return

        self._add_pending({"add": {str(self.db.status.end): value}})
        self.db.status.end += 1
        self._add_pending({"add": {"status.end": self.db.status.end}})
        self._commit()
    return self
def value2json(obj, pretty=False, sort_keys=False, keep_whitespace=True):
    """
    :param obj: THE VALUE TO TURN INTO JSON
    :param pretty: True TO MAKE A MULTI-LINE PRETTY VERSION
    :param sort_keys: True TO SORT KEYS
    :param keep_whitespace: False TO strip() THE WHITESPACE IN THE VALUES
    :return:
    """
    if FIND_LOOPS:
        obj = scrub(obj, scrub_text=_keep_whitespace if keep_whitespace else trim_whitespace())
    try:
        json = json_encoder(obj, pretty=pretty)
        if json == None:
            Log.note(str(type(obj)) + " is not valid{{type}}JSON", type=" (pretty) " if pretty else " ")
            Log.error("Not valid JSON: " + str(obj) + " of type " + str(type(obj)))
        return json
    except Exception as e:
        e = Except.wrap(e)
        try:
            json = pypy_json_encode(obj)
            return json
        except Exception:
            pass
        Log.error("Can not encode into JSON: {{value}}", value=text(repr(obj)), cause=e)
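# Hedged usage sketch (not part of the source): the call pattern mirrors how
# value2json() is used elsewhere in this section, e.g. value2json({...}, pretty=True)
# in add_version(). The function name and sample document are illustrative only.
def _example_value2json():
    doc = {"name": "test", "count": 3}
    Log.note("compact: {{json}}", json=value2json(doc))
    Log.note("pretty:\n{{json}}", json=value2json(doc, pretty=True))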
def download_perfherder(desc, repo, id, dummy, framework):
    sig_result = http.get_json(
        "https://treeherder.mozilla.org/api/project/" + repo + "/performance/signatures/?format=json&framework=" + str(framework) + "&id=" + str(id)
    )

    signature = first(sig_result.keys())
    data_result = http.get_json("https://treeherder.mozilla.org/api/project/" + repo + "/performance/data/?signatures=" + signature)

    Log.note(
        "{{result|json}}",
        result={
            "name": desc,
            "data": jx.run({
                "from": ListContainer("data", data_result[signature]),
                "sort": "push_timestamp",
                "select": "value"
            }).data
        },
    )
def test_wrap_1():
    switch = [
        lambda: Data(i=Random.int(2000)),
        lambda: {"i": Random.int(2000)},
        lambda: FlatList([{"i": Random.int(2000)}]),
        lambda: [{"i": Random.int(2000)}]
    ]

    inputs = [
        switch[min(len(switch) - 1, int(floor(-log(Random.float(), 2))))]()
        for i in range(NUM_INPUT)
    ]

    for i in range(NUM_REPEAT):
        results = []
        gc.collect()
        with Profiler("more struct: slow_wrap"):
            for v in inputs:
                results.append(slow_wrap(v))

        results = []
        gc.collect()
        with Profiler("more struct: wrap"):
            for v in inputs:
                results.append(wrap(v))

        results = []
        gc.collect()
        with Profiler("more struct: baseline"):
            for v in inputs:
                results.append(baseline(v))

        Log.note("Done {{i}} of {{num}}", {"i": i, "num": NUM_REPEAT})
def send_queries(self, subtest, places=6):
    subtest = wrap(subtest)

    try:
        # EXECUTE QUERY
        num_expectations = 0
        for k, v in subtest.items():
            if k.startswith("expecting_"):  # WHAT FORMAT ARE WE REQUESTING
                format = k[len("expecting_"):]
            elif k == "expecting":  # NO FORMAT REQUESTED (TO TEST DEFAULT FORMATS)
                format = None
            else:
                continue

            num_expectations += 1
            expected = v

            subtest.query.format = format
            subtest.query.meta.testing = True  # MARK ALL QUERIES FOR TESTING SO FULL METADATA IS AVAILABLE BEFORE QUERY EXECUTION

            query = convert.unicode2utf8(convert.value2json(subtest.query))
            # EXECUTE QUERY
            response = self.try_till_response(self.service_url, data=query)

            if response.status_code != 200:
                error(response)
            result = convert.json2value(convert.utf82unicode(response.all_content))

            # HOW TO COMPARE THE OUT-OF-ORDER DATA?
            compare_to_expected(subtest.query, result, expected, places)
            Log.note("Test result compares well")
        if num_expectations == 0:
            Log.error("Expecting test {{name|quote}} to have property named 'expecting_*' for testing the various format clauses", {"name": subtest.name})
    except Exception as e:
        Log.error("Failed test {{name|quote}}", {"name": subtest.name}, e)
def test_time_subtraction(self):
    """
    IF THIS FAILS, MAYBE THE JSON ROUNDING ENGINE IS ENABLED
    """
    today = Date("2018-10-20")
    Log.note("Notice {{date}} is {{unix}} and rounding will affect that number", date=today, unix=today.unix)

    data = [
        {"a": today, "t": Date("2018-10-20").unix, "v": 2},
        {"a": today, "t": Date("2018-10-19").unix, "v": 2},
        {"a": today, "t": Date("2018-10-18").unix, "v": 3},
        {"a": today, "t": Date("2018-10-17").unix, "v": 5},
        {"a": today, "t": Date("2018-10-16").unix, "v": 7},
        {"a": today, "t": Date("2018-10-15").unix, "v": 11},
        {"a": today, "t": None, "v": 27},
        {"a": today, "t": Date("2018-10-19").unix, "v": 13},
        {"a": today, "t": Date("2018-10-18").unix, "v": 17},
        {"a": today, "t": Date("2018-10-16").unix, "v": 19},
        {"a": today, "t": Date("2018-10-15").unix, "v": 23}
    ]
    test = {
        "data": data,
        "query": {
            "from": TEST_TABLE,
            "select": ["a", "t", "v", {"name": "diff", "value": {"sub": ["t", "a"]}}],
            "limit": 100
        },
        "expecting_list": {
            "meta": {"format": "list"},
            "data": [
                {"a": Date(r.a).unix, "t": Date(r.t).unix, "v": r.v, "diff": (Date(r.t) - Date(r.a)).seconds}
                for r in to_data(data)
            ]
        }
    }
    self.utils.execute_tests(test)
def test_failures_by_directory(self):
    if self.not_real_service():
        return

    test = wrap({
        "query": {
            "from": {
                "type": "elasticsearch",
                "settings": {
                    "host": ES_CLUSTER_LOCATION,
                    "index": "unittest",
                    "type": "test_result"
                }
            },
            "select": [{"aggregate": "count"}],
            "edges": ["result.test", "result.ok"],
            "where": {"prefix": {"result.test": "/"}},
            "format": "table"
        }
    })

    query = unicode2utf8(convert.value2json(test.query))
    # EXECUTE QUERY
    with Timer("query"):
        response = http.get(self.testing.query, data=query)
        if response.status_code != 200:
            error(response)
    result = json2value(utf82unicode(response.all_content))

    Log.note("result\n{{result|indent}}", {"result": result})
def test_chunk_timing(self):
    if self.not_real_service():
        return

    test = wrap({"query": {
        "from": {
            "type": "elasticsearch",
            "settings": {
                "host": ES_CLUSTER_LOCATION,
                "index": "unittest",
                "type": "test_result"
            }
        },
        "select": {"value": "run.stats.duration", "aggregate": "average"},
        "edges": [
            {"name": "chunk", "value": ["run.suite", "run.chunk"]}
        ],
        "where": {"and": [
            {"term": {"etl.id": 0}},
            {"gte": {"timestamp": Date.floor(Date.now() - (Duration.DAY * 7), Duration.DAY).milli / 1000}}
        ]},
        "format": "cube",
        "samples": {
            "limit": 30
        }
    }})

    query = convert.unicode2utf8(convert.value2json(test.query))
    # EXECUTE QUERY
    with Timer("query"):
        response = self.utils.try_till_response(self.service_url, data=query)
        if response.status_code != 200:
            error(response)
    result = convert.json2value(convert.utf82unicode(response.all_content))

    Log.note("result\n{{result|indent}}", {"result": result})
def loop(is_ready, please_stop):
    with locker:
        while not got_signal:
            locker.wait(till=Till(seconds=0.01))
            is_ready.go()
            Log.note("{{thread}} is ready", thread=Thread.current().name)
        Log.note("outside loop")
        locker.wait()
        Log.note("thread is expected to get here")
def _worker(self, please_stop):
    try:
        # MAIN EXECUTION LOOP
        while not please_stop:
            command_item = self.queue.pop(till=please_stop)
            if command_item is None:
                break
            self._process_command_item(command_item)
    except Exception as e:
        e = Except.wrap(e)
        if not please_stop:
            Log.warning("Problem with sql", cause=e)
    finally:
        self.closed = True
        DEBUG and Log.note("Database is closed")
        self.db.close()
def query(self, sql, param=None, stream=False, row_tuples=False):
    """
    RETURN A LIST OF dicts

    :param sql: SQL TEMPLATE TO SEND
    :param param: PARAMETERS TO INJECT INTO SQL TEMPLATE
    :param stream: STREAM OUTPUT
    :param row_tuples: DO NOT RETURN dicts
    """
    if not self.cursor:  # ALLOW NON-TRANSACTIONAL READS
        Log.error("must perform all queries inside a transaction")
    self._execute_backlog()

    try:
        if isinstance(sql, SQL):
            sql = text(sql)
        if param:
            sql = expand_template(sql, quote_param(param))
        sql = self.preamble + outdent(sql)
        self.debug and Log.note("Execute SQL:\n{{sql}}", sql=indent(sql))
        self.cursor.execute(sql)
        if row_tuples:
            if stream:
                result = self.cursor
            else:
                result = wrap(list(self.cursor))
        else:
            columns = tuple(utf8_to_unicode(d[0]) for d in coalesce(self.cursor.description, []))

            def streamer():
                for row in self.cursor:
                    output = Data()
                    for c, v in zip(columns, row):
                        output[c] = v
                    yield output

            if stream:
                result = streamer()
            else:
                result = wrap(streamer())

        return result
    except Exception as e:
        e = Except.wrap(e)
        if "InterfaceError" in e:
            Log.error("Did you close the db connection?", e)
        Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
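# Hedged usage sketch (not part of the source): assumes `db` exposes the query()
# method above and that a transaction is already open, as the method itself
# requires. The table and column names are hypothetical; parameters are injected
# through the {{name}} template placeholders that expand_template() consumes.
def _example_query(db):
    rows = db.query(
        "SELECT name, value FROM settings WHERE id = {{id}}",
        param={"id": 42}
    )
    for row in rows:
        Log.note("{{name}} = {{value}}", name=row.name, value=row.value)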
def _test_mode_wait(query):
    """
    WAIT FOR METADATA TO ARRIVE ON INDEX
    :param query: dict() OF REQUEST BODY
    :return: nothing
    """
    try:
        m = meta.singlton
        now = Date.now()
        end_time = now + MINUTE

        # MARK COLUMNS DIRTY
        m.meta.columns.update({
            "clear": ["partitions", "count", "cardinality", "last_updated"],
            "where": {"eq": {"es_index": join_field(split_field(query["from"])[0:1])}}
        })

        # BE SURE THEY ARE ON THE todo QUEUE FOR RE-EVALUATION
        cols = [
            c
            for c in m.get_columns(table_name=query["from"], force=True)
            if c.type not in STRUCT
        ]
        for c in cols:
            Log.note("Mark {{column}} dirty at {{time}}", column=c.names["."], time=now)
            c.last_updated = now - TOO_OLD
            m.todo.push(c)

        while end_time > now:
            # GET FRESH VERSIONS
            cols = [
                c
                for c in m.get_columns(table_name=query["from"])
                if c.type not in STRUCT
            ]
            for c in cols:
                if not c.last_updated or c.cardinality == None:
                    Log.note(
                        "wait for column (table={{col.es_index}}, name={{col.es_column}}) metadata to arrive",
                        col=c
                    )
                    break
            else:
                break
            Till(seconds=1).wait()

        for c in cols:
            Log.note(
                "fresh column name={{column.name}} updated={{column.last_updated|date}} parts={{column.partitions}}",
                column=c
            )
    except Exception as e:
        Log.warning("could not pickup columns", cause=e)
def _execute_backlog(self):
    if not self.backlog:
        return

    backlog, self.backlog = self.backlog, []
    for i, g in jx.chunk(backlog, size=MAX_BATCH_SIZE):
        sql = self.preamble + ";\n".join(g)
        try:
            self.debug and Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
            self.cursor.execute(sql)
            self.cursor.close()
            self.cursor = self.db.cursor()
        except Exception as e:
            Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
def inner(changeset_id):
    # ALWAYS TRY ES FIRST
    moves = _get_changeset_from_es(self.moves, changeset_id).changeset.moves
    if moves:
        return moves

    url = URL(revision.branch.url) / "raw-rev" / changeset_id
    DEBUG and Log.note("get unified diff from {{url}}", url=url)
    try:
        # THE ENCODING DOES NOT MATTER BECAUSE WE ONLY USE THE '+', '-' PREFIXES IN THE DIFF
        moves = http.get(url).content.decode("latin1")
        return diff_to_moves(text(moves))
    except Exception as e:
        Log.warning("could not get unified diff from {{url}}", url=url, cause=e)
def test_backfilling_by_count(clogger):
    clogger.disable_backfilling = False
    clogger.disable_tipfilling = True
    clogger.disable_deletion = True
    clogger.disable_maintenance = True

    num_trys = 50
    wait_time = 2
    num_to_go_back = 10

    oldest_revnum, oldest_rev = clogger.conn.get_one("SELECT min(revnum) AS revnum, revision FROM csetLog")

    new_old_rev = None
    clog_url = HG_URL + clogger.config.hg.branch + '/' + 'json-log/' + oldest_rev
    clog_obj_list = list(clogger._get_clog(clog_url)['changesets'])
    for count, clog_obj in enumerate(clog_obj_list[1:]):
        if count >= num_to_go_back - 1:
            new_old_rev = clog_obj['node'][:12]
            break

    clogger.csets_todo_backwards.add((num_to_go_back, True))

    new_ending = None
    new_revnum = None
    while num_trys > 0:
        with clogger.conn.transaction() as t:
            new_ending = t.get_one("SELECT min(revnum) AS revnum, revision FROM csetLog")[1]
            DEBUG and Log.note("{{data}}", data=(oldest_rev, new_old_rev, new_ending))
            if new_ending == new_old_rev:
                new_revnum = t.get_one("SELECT revnum FROM csetLog WHERE revision=?", (oldest_rev,))[0]
                break
        if new_ending != new_old_rev:
            Till(seconds=wait_time).wait()
            num_trys -= 1

    assert num_trys > 0
    assert new_old_rev == new_ending

    # Check that revnum's were properly handled
    expected_revnum = oldest_revnum + num_to_go_back
    assert expected_revnum == new_revnum
def query(self, sql, param=None, stream=False, row_tuples=False):
    """
    RETURN LIST OF dicts
    """
    if not self.cursor:  # ALLOW NON-TRANSACTIONAL READS
        Log.error("must perform all queries inside a transaction")
    self._execute_backlog()

    try:
        if param:
            sql = expand_template(sql, quote_param(param))
        sql = self.preamble + outdent(sql)
        self.debug and Log.note("Execute SQL:\n{{sql}}", sql=indent(sql))

        self.cursor.execute(sql)
        if row_tuples:
            if stream:
                result = self.cursor
            else:
                result = wrap(list(self.cursor))
        else:
            columns = [utf8_to_unicode(d[0]) for d in coalesce(self.cursor.description, [])]
            if stream:
                result = (
                    wrap({c: utf8_to_unicode(v) for c, v in zip(columns, row)})
                    for row in self.cursor
                )
            else:
                result = wrap([
                    {c: utf8_to_unicode(v) for c, v in zip(columns, row)}
                    for row in self.cursor
                ])

        return result
    except Exception as e:
        e = Except.wrap(e)
        if "InterfaceError" in e:
            Log.error("Did you close the db connection?", e)
        Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
def _get_file(ref, url):
    if ref.path.startswith("~"):
        home_path = os.path.expanduser("~")
        if os.sep == "\\":
            home_path = "/" + home_path.replace(os.sep, "/")
        if home_path.endswith("/"):
            home_path = home_path[:-1]

        ref.path = home_path + ref.path[1::]
    elif not ref.path.startswith("/"):
        # CONVERT RELATIVE TO ABSOLUTE
        if ref.path[0] == ".":
            num_dot = 1
            while ref.path[num_dot] == ".":
                num_dot += 1

            parent = url.path.rstrip("/").split("/")[:-num_dot]
            ref.path = "/".join(parent) + ref.path[num_dot:]
        else:
            parent = url.path.rstrip("/").split("/")[:-1]
            ref.path = "/".join(parent) + "/" + ref.path

    path = ref.path if os.sep != "\\" else ref.path[1::].replace("/", "\\")

    try:
        DEBUG and Log.note("reading file {{path}}", path=path)
        content = File(path).read()
    except Exception as e:
        content = None
        Log.error("Could not read file {{filename}}", filename=path, cause=e)

    try:
        new_value = json2value(content, params=ref.query, flexible=True, leaves=True)
    except Exception as e:
        e = Except.wrap(e)
        try:
            new_value = ini2value(content)
        except Exception:
            raise Log.error("Can not read {{file}}", file=path, cause=e)
    new_value = _replace_ref(new_value, ref)
    return new_value
def get_session(self, session_id):
    now = Date.now().unix
    result = self.db.query(
        sql_query({
            "from": self.table,
            "where": {"eq": {"session_id": session_id}}
        })
    )
    saved_record = first(Data(zip(result.header, r)) for r in result.data)
    if not saved_record or saved_record.expires <= now:
        return Data()
    session = json2value(saved_record.data)

    DEBUG and Log.note("record from db {{session}}", session=saved_record)
    return session
def icompressed2ibytes(source):
    """
    :param source: GENERATOR OF COMPRESSED BYTES
    :return: GENERATOR OF BYTES
    """
    decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
    last_bytes_count = 0  # Track the last byte count, so we do not show too many debug lines
    bytes_count = 0
    for bytes_ in source:
        data = decompressor.decompress(bytes_)
        bytes_count += len(data)
        if mo_math.floor(last_bytes_count, 1000000) != mo_math.floor(bytes_count, 1000000):
            last_bytes_count = bytes_count
            DEBUG and Log.note("bytes={{bytes}}", bytes=bytes_count)
        yield data
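# Hedged usage sketch (not part of the source): gzip-compress a payload with the
# standard library, then stream it back through icompressed2ibytes() in small
# chunks. zlib with 16 + MAX_WBITS, as used above, understands the gzip container.
import gzip

def _example_decompress():
    raw = b"hello world\n" * 1000
    compressed = gzip.compress(raw)
    chunks = (compressed[i:i + 4096] for i in range(0, len(compressed), 4096))
    restored = b"".join(icompressed2ibytes(chunks))
    Log.note("restored {{num}} bytes", num=len(restored))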
def _setup_etl_code(self):
    sudo("apt-get install -y python2.7")

    Log.note("5")
    if not fabric_files.exists("/usr/local/bin/pip"):
        run("mkdir -p /home/ubuntu/temp")

        with cd("/home/ubuntu/temp"):
            # INSTALL FROM CLEAN DIRECTORY
            run("wget https://bootstrap.pypa.io/get-pip.py")
            sudo("rm -fr ~/.cache/pip")  # JUST IN CASE THE DIRECTORY WAS MADE
            sudo("python2.7 get-pip.py")

    Log.note("6")
    if not fabric_files.exists("/home/ubuntu/ActiveData-ETL/README.md"):
        with cd("/home/ubuntu"):
            sudo("apt-get -yf install git-core")
            run('rm -fr /home/ubuntu/ActiveData-ETL')
            run("git clone https://github.com/klahnakoski/ActiveData-ETL.git")
            run("mkdir -p /home/ubuntu/ActiveData-ETL/results/logs")

    Log.note("7")
    with cd("/home/ubuntu/ActiveData-ETL"):
        run("git checkout etl")
        # pip install -r requirements.txt HAS TROUBLE IMPORTING SOME LIBS
        sudo("rm -fr ~/.cache/pip")  # JUST IN CASE THE DIRECTORY WAS MADE
        sudo("pip install future")
        sudo("pip install BeautifulSoup")
        sudo("pip install MozillaPulse")
        sudo("pip install boto")
        sudo("pip install requests")
        sudo("pip install taskcluster")
        sudo("apt-get install -y python-dev")  # REQUIRED FOR psutil
        sudo("apt-get install -y build-essential")  # REQUIRED FOR psutil
        sudo("pip install psutil")
        sudo("pip install pympler")
        sudo("pip install -r requirements.txt")

    Log.note("8")
    sudo("apt-get -y install python-psycopg2")
def monitor(self, please_stop):
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        try:
            if not self.todo:
                with self.meta.columns.locker:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated < Date.now() - TOO_OLD) and c.type not in STRUCT
                    ]
                    if old_columns:
                        if DEBUG:
                            Log.note("Old columns with dates {{dates|json}}", dates=wrap(old_columns).last_updated)
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error("")
                    else:
                        if DEBUG:
                            Log.note("no more metadata to update")

            column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
            if column:
                if DEBUG:
                    Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
                if column.type in STRUCT:
                    with self.meta.columns.locker:
                        column.last_updated = Date.now()
                    continue
                elif column.last_updated >= Date.now() - TOO_OLD:
                    continue
                try:
                    self._update_cardinality(column)
                    if DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX):
                        Log.note("updated {{column.name}}", column=column)
                except Exception as e:
                    Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
        except Exception as e:
            Log.warning("problem in cardinality monitor", cause=e)
def do_all(self):
    # ENSURE PARENT TRANSACTION IS UP TO DATE
    if self.parent:
        self.parent.do_all()

    try:
        # GET THE REMAINING COMMANDS
        with self.locker:
            todo = self.todo[self.complete:]
            self.complete = len(self.todo)

        # RUN THEM
        for c in todo:
            DEBUG and Log.note(FORMAT_COMMAND, command=c.command)
            self.db.db.execute(c.command)
            if c.command in [COMMIT, ROLLBACK]:
                Log.error("logic error")
    except Exception as e:
        Log.error("problem running commands", current=c, cause=e)
def do_all(self):
    # ENSURE PARENT TRANSACTION IS UP TO DATE
    c = None
    try:
        if self.parent == self:
            Log.warning("Transaction's parent is equal to itself.")
        if self.parent:
            self.parent.do_all()
        # GET THE REMAINING COMMANDS
        with self.locker:
            todo = self.todo[self.complete:]
            self.complete = len(self.todo)

        # RUN THEM
        for c in todo:
            DEBUG and Log.note(FORMAT_COMMAND, command=c.command)
            self.db.db.execute(c.command)
    except Exception as e:
        Log.error("problem running commands", current=c, cause=e)