def _build_list_sql(self, db, first, batch_size):
    # TODO: ENSURE THE LAST COLUMN IS THE id
    if first:
        dim = len(self._extract.field)
        where = SQL_OR.join(
            sql_iso(sql_and(
                quote_column(f) + ineq(i, e, dim) + db.quote_value(Date(v) if t == "time" else v)
                for e, (f, v, t) in enumerate(zip(
                    self._extract.field[0:i + 1:],
                    first,
                    self._extract.type[0:i + 1:]
                ))
            ))
            for i in range(dim)
        )
    else:
        where = SQL_TRUE

    selects = []
    for t, f in zip(self._extract.type, self._extract.field):
        if t == "time":
            selects.append("CAST" + sql_iso(sql_alias(quote_column(f), SQL("DATETIME(6)"))))
        else:
            selects.append(quote_column(f))

    sql = (
        SQL_SELECT + sql_list(selects) +
        SQL_FROM + self.settings.snowflake.fact_table +
        SQL_WHERE + where +
        SQL_ORDERBY + sql_list(quote_column(f) for f in self._extract.field) +
        SQL_LIMIT + db.quote_value(batch_size)
    )
    return sql
def __init__(self, message="ping", every="second", start=None, until=None):
    if is_text(message):
        self.message = show_message(message)
    else:
        self.message = message
    self.every = Duration(every)

    if isinstance(until, Signal):
        self.please_stop = until
    elif until == None:
        self.please_stop = Signal()
    else:
        self.please_stop = Till(Duration(until).seconds)

    self.thread = None
    if start:
        self.thread = Thread.run(
            "repeat",
            _repeat,
            self.message,
            self.every,
            Date(start),
            parent_thread=MAIN_THREAD,
            please_stop=self.please_stop,
        ).release()
def to_es_script(self, schema):
    return EsScript(
        type=NUMBER, expr=text_type(Date(self.value).unix), frum=self, schema=schema
    )
def __init__(
    self,
    interval,  # TIME INTERVAL BETWEEN RUNS
    starting,  # THE TIME TO START THE INTERVAL COUNT
    max_runtime=MAX_RUNTIME,  # LIMIT HOW LONG THE PROCESS IS ALIVE
    wait_for_shutdown=WAIT_FOR_SHUTDOWN,  # LIMIT PATIENCE WHEN ASKING FOR SHUTDOWN, THEN SEND KILL
    process=None,
):
    self.duration = Duration(interval)
    self.starting = coalesce(Date(starting), Date.now())
    self.max_runtime = Duration(max_runtime)
    self.wait_for_shutdown = Duration(wait_for_shutdown)

    # Process parameters
    self.process = process

    # STATE
    self.last_started = None
    self.last_finished = None
    self.run_count = 0
    self.fail_count = 0
    self.current = None
    self.terminator = None  # SIGNAL TO KILL THE PROCESS
    self.next_run = self._next_run()
    self.next = Till(till=self.next_run)
    self.next.then(self.run)
def setUpClass(self):
    while True:
        try:
            es = test_jx.global_settings.backend_es
            http.get_json(URL(es.host, port=es.port))
            break
        except Exception as e:
            e = Except.wrap(e)
            if "No connection could be made because the target machine actively refused it" in e or "Connection refused" in e:
                Log.alert("Problem connecting")
            else:
                Log.error("Server raised exception", e)

    # REMOVE OLD INDEXES
    cluster = elasticsearch.Cluster(test_jx.global_settings.backend_es)
    aliases = cluster.get_aliases()
    for a in aliases:
        try:
            if a.index.startswith("testing_"):
                create_time = Date(
                    a.index[-15:], "%Y%m%d_%H%M%S"
                )  # EXAMPLE testing_0ef53e45b320160118_180420
                if create_time < Date.now() - 10 * MINUTE:
                    cluster.delete_index(a.index)
        except Exception as e:
            Log.warning("Problem removing {{index|quote}}", index=a.index, cause=e)
def test_groupby_expression_and_sort(self):
    test = {
        "data": [
            {"a": Date("2018-04-01 12:34:00").unix, "value": 1},
            {"a": Date("2018-04-01 13:34:00").unix, "value": 3},
            {"a": Date("2018-04-01 15:34:00").unix, "value": 4},
            {"a": Date("2018-04-01 08:34:00").unix, "value": 6},
            {"a": Date("2018-04-02 00:34:00").unix, "value": 7},
            {"value": 99},
            {"a": Date("2018-04-02 01:34:00").unix, "value": 8},
            {"a": Date("2018-04-02 02:44:00").unix, "value": 9},
            {"a": Date("2018-04-02 04:54:00").unix, "value": 10},
            {"a": Date("2018-04-02 14:04:00").unix, "value": 11}
        ],
        "query": {
            "from": TEST_TABLE,
            "groupby": {
                "name": "date",
                "value": {"floor": [{"div": ["a", 86400]}]}
            },
            "sort": {"value": {"floor": [{"div": ["a", 86400]}]}}
        },
        "expecting_list": {
            "meta": {"format": "list"},
            "data": [
                {"date": 17622, "count": 4},
                {"date": 17623, "count": 5},
                {"count": 1}
            ]
        },
        "expecting_table": {
            "meta": {"format": "table"},
            "header": ["date", "count"],
            "data": [
                [17622, 4],
                [17623, 5],
                [NULL, 1]
            ]
        },
        "expecting_cube": {
            "meta": {"format": "cube"},
            "edges": [{
                "name": "date",
                "domain": {
                    "type": "set",
                    "partitions": [
                        {"value": 17622},
                        {"value": 17623}
                    ]
                }
            }],
            "data": {
                "count": [4, 5, 1]
            }
        }
    }
    self.utils.execute_tests(test)
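The expected `date` buckets follow directly from the floor-divide in the query; a hedged arithmetic check (the epoch values below assume `Date(...).unix` is UTC seconds):

# floor(Date("2018-04-01 12:34:00").unix / 86400) == floor(1522586040 / 86400) == 17622
# floor(Date("2018-04-02 00:34:00").unix / 86400) == floor(1522629240 / 86400) == 17623
# the row without "a" has no bucket value, which is why the trailing NULL group has count 1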
def __init__(self, config):
    self.config = config = wrap(config)
    config.range.min = Date(config.range.min)
    config.range.max = Date(config.range.max)
    config.start = Date(config.start)
    config.interval = Duration(config.interval)
    config.branches = listwrap(config.branches)
    self.destination = bigquery.Dataset(config.destination).get_or_create_table(
        config.destination
    )

    # CALCULATE THE PREVIOUS RUN
    mozci_version = self.version("mozci")
    self.etl_config_table = jx_sqlite.Container(
        config.config_db
    ).get_or_create_facts("etl-range")
    done_result = wrap(self.etl_config_table.query()).data
    prev_done = done_result[0]
    if len(done_result) and prev_done.mozci_version == mozci_version:
        self.done = Data(
            mozci_version=mozci_version,
            min=Date(coalesce(prev_done.min, config.start, "today-2day")),
            max=Date(coalesce(prev_done.max, config.start, "today-2day")),
        )
    else:
        self.done = Data(
            mozci_version=mozci_version,
            min=Date(coalesce(config.start, "today-2day")),
            max=Date(coalesce(config.start, "today-2day")),
        )
        self.etl_config_table.add(self.done)
def __init__(self, config):
    self.config = config = wrap(config)
    config.range.min = Date(config.range.min)
    config.range.max = Date(config.range.max)
    config.start = Date(config.start)
    config.interval = Duration(config.interval)
    config.branches = listwrap(config.branches)
    self.destination = bigquery.Dataset(config.destination).get_or_create_table(
        config.destination
    )

    # CALCULATE THE PREVIOUS RUN
    mozci_version = self.version("mozci")
    prev_done = self.get_state()
    if prev_done and prev_done.mozci_version == mozci_version:
        self.done = Data(
            mozci_version=mozci_version,
            min=Date(coalesce(prev_done.min, config.start, "today-2day")),
            max=Date(coalesce(prev_done.max, config.start, "today-2day")),
        )
    else:
        self.done = Data(
            mozci_version=mozci_version,
            min=Date(coalesce(config.start, "today-2day")),
            max=Date(coalesce(config.start, "today-2day")),
        )
        self.set_state()
def next(self, value):
    v = Date(value[0])
    if self.last_value.floor(self.duration) > v:
        Log.error("Expecting strictly increasing")
    self.last_value = v

    key = Math.round((v.floor(self.duration) - self.start) / self.duration, decimal=0)
    if key != self.batch:
        self.child.reset()
        self.batch = key

    c = self.child.next(value[1:])
    return [self.batch] + c
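A hedged illustration of the batch-key arithmetic above; the dates and the DAY-sized duration are hypothetical, and it assumes dividing one Duration by another yields a plain number:

# start    = Date("2020-01-01"), duration = DAY
# v        = Date("2020-01-03 05:00:00")
# key      = Math.round((v.floor(DAY) - start) / DAY, decimal=0)   # -> 2
# every row stamped on 2020-01-03 gets batch key 2; when the key changes, child.reset() is called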
def __init__(self, instance_manager, disable_prices=False, kwargs=None):
    self.settings = kwargs
    self.instance_manager = instance_manager
    aws_args = dict(
        region_name=kwargs.aws.region,
        aws_access_key_id=unwrap(kwargs.aws.aws_access_key_id),
        aws_secret_access_key=unwrap(kwargs.aws.aws_secret_access_key),
    )
    self.ec2_conn = boto.ec2.connect_to_region(**aws_args)
    self.vpc_conn = boto.vpc.connect_to_region(**aws_args)
    self.price_locker = Lock()
    self.prices = None
    self.price_lookup = None
    self.no_capacity = {}
    self.no_capacity_file = File(kwargs.price_file).parent / "no capacity.json"
    self.done_making_new_spot_requests = Signal()
    self.net_new_locker = Lock()
    self.net_new_spot_requests = UniqueIndex(("id",))  # SPOT REQUESTS FOR THIS SESSION
    self.watcher = None
    self.active = None

    self.settings.uptime.bid_percentile = coalesce(
        self.settings.uptime.bid_percentile, self.settings.bid_percentile
    )
    self.settings.uptime.history = coalesce(Date(self.settings.uptime.history), DAY)
    self.settings.uptime.duration = coalesce(
        Duration(self.settings.uptime.duration), Date("5minute")
    )
    self.settings.max_percent_per_type = coalesce(self.settings.max_percent_per_type, 1)

    if ENABLE_SIDE_EFFECTS and instance_manager and instance_manager.setup_required():
        self._start_life_cycle_watcher()
    if not disable_prices:
        self.pricing()
def test_django_cannot_encode_datetime(extract_job_settings):
    """
    DJANGO DOES NOT ENCODE THE DATETIME PROPERLY
    """
    epoch = Date(Date.EPOCH).datetime
    get_ids = SQL(
        str(
            (
                Job.objects
                .filter(Q(last_modified__gt=epoch) | (Q(last_modified=epoch) & Q(id__gt=0)))
                .annotate()
                .values("id")
                .order_by("last_modified", "id")[:2000]
            ).query
        )
    )
    source = MySQL(extract_job_settings.source.database)

    with pytest.raises(Exception):
        with source.transaction():
            list(source.query(get_ids, stream=True, row_tuples=True))
def not_monitor(self, please_stop):
    Log.alert("metadata scan has been disabled")
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        column = self.todo.pop()
        if column == THREAD_STOP:
            break
        # if untype_path(column.name) in ["build.type", "run.type"]:
        #     Log.note("found")

        if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
            DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
            column.last_updated = Date.now()
            continue
        elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
            # DO NOT UPDATE FRESH COLUMN METADATA
            DEBUG and Log.note(
                "{{column.es_column}} is still fresh ({{ago}} ago)",
                column=column,
                ago=(Date.now() - Date(column.last_updated)).seconds
            )
            continue

        with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05):
            if untype_path(column.name) in ["build.type", "run.type"]:
                try:
                    self._update_cardinality(column)
                except Exception as e:
                    Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            else:
                column.last_updated = Date.now()
def average_weekly(y, year):
    # RETURN AVERAGE OVER YEAR ENDING JULY 1
    min = Date(year).floor(YEAR) - 6 * MONTH
    max = min + YEAR
    max_seen = min
    acc = 0
    for value, (start, stop) in zip(deaths[y], _death_dates):
        if is_nan(value):
            continue
        if min < stop < max:
            max_seen = stop
            if min < start < max:
                acc += value
            else:
                ratio = (stop - min) / WEEK
                acc += value * ratio
        elif min < start < max:
            max_seen = max
            ratio = (max - start) / WEEK
            acc += value * ratio
    if is_nan(acc):
        Log.error("not expected")
    return acc * WEEK / (max_seen - min)
def monitor(self, please_stop):
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        try:
            if not self.todo:
                with self.meta.columns.locker:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated < Date.now() - TOO_OLD)
                        and c.type not in STRUCT
                    ]
                    if old_columns:
                        if DEBUG:
                            Log.note(
                                "Old columns {{names|json}} last updated {{dates|json}}",
                                names=wrap(old_columns).es_column,
                                dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                            )
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error("")
                    else:
                        if DEBUG:
                            Log.note("no more metadata to update")

            column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
            if DEBUG:
                Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
            if column:
                if column.es_index in self.index_does_not_exist:
                    with self.meta.columns.locker:
                        self.meta.columns.update({
                            "clear": ".",
                            "where": {"eq": {"es_index": column.es_index}}
                        })
                    continue
                if column.type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
                    with self.meta.columns.locker:
                        column.last_updated = Date.now()
                    continue
                elif column.last_updated >= Date.now() - TOO_OLD:
                    continue
                try:
                    self._update_cardinality(column)
                    if DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX):
                        Log.note("updated {{column.name}}", column=column)
                except Exception as e:
                    Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
        except Exception as e:
            Log.warning("problem in cardinality monitor", cause=e)
population_yaxis = {"range": [0, populations_max_y]}

fig = go.Figure(data=[
    go.Bar(name="0-44", x=populations[DATE_COLUMN], y=populations["00"]),
    go.Bar(name="45-64", x=populations[DATE_COLUMN], y=populations["45"]),
    go.Bar(name="65-84", x=populations[DATE_COLUMN], y=populations["65"]),
    go.Bar(name="85+", x=populations[DATE_COLUMN], y=populations["85"]),
])
fig.update_layout(
    title="Population, " + PROVINCE_NAME,
    barmode="stack",
    yaxis=population_yaxis
)
fig.show()

recent_year_index = populations.shape[0] - 1  # INDEX OF LAST POPULATION COUNT
recent_year_name = populations.refPer[recent_year_index][:4]

_population_dates = [Date(d) for i, d in enumerate(populations[DATE_COLUMN])]


def get_population(y, date):
    """
    RETURN POPULATION AT GIVEN DATE
    :param date:
    :param y: WHICH POPULATION
    """
    for i, next in enumerate(_population_dates):
        if date < next:
            prev = _population_dates[i - 1]
            y1, y2 = populations[y][i - 1 : i + 1]
            ratio = (date - prev) / (next - prev)
            return (y2 - y1) * ratio + y1
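A hedged sketch of the interpolation performed by get_population; the dates and counts below are invented for illustration only:

# _population_dates = [Date("2019-07-01"), Date("2020-07-01")]
# populations["65"] = [100_000, 104_000]
# get_population("65", Date("2020-01-01"))
#   ratio = (Date("2020-01-01") - Date("2019-07-01")) / (Date("2020-07-01") - Date("2019-07-01"))  ~= 0.50
#   result = (104_000 - 100_000) * ratio + 100_000  ~= 102_000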
def not_monitor(self, please_stop):
    Log.alert("metadata scan has been disabled")
    please_stop.then(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        pair = self.todo.pop()
        if pair is THREAD_STOP:
            break
        column, after = pair

        with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05):
            if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                # DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                continue
            elif after and column.last_updated > after:
                continue  # COLUMN IS STILL YOUNG
            elif column.last_updated > Date.now() - TOO_OLD and column.cardinality > 0:
                # DO NOT UPDATE FRESH COLUMN METADATA
                DEBUG and Log.note(
                    "{{column.es_column}} is still fresh ({{ago}} ago)",
                    column=column,
                    ago=(Date.now() - Date(column.last_updated)).seconds
                )
                continue

            if untype_path(column.name) in KNOWN_MULTITYPES:
                try:
                    self._update_cardinality(column)
                except Exception as e:
                    Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
                continue

            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
def monitor(self, please_stop):
    please_stop.then(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        try:
            if not self.todo:
                # LOOK FOR OLD COLUMNS WE CAN RE-SCAN
                now = Date.now()
                last_good_update = now - MAX_COLUMN_METADATA_AGE
                old_columns = [
                    c
                    for c in self.meta.columns
                    if (c.last_updated < last_good_update)
                    and c.jx_type not in STRUCT
                    and c.es_index != META_COLUMNS_NAME
                ]
                if old_columns:
                    DEBUG and Log.note(
                        "Old columns {{names|json}} last updated {{dates|json}}",
                        names=wrap(old_columns).es_column,
                        dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                    )
                    self.todo.extend(
                        (c, max(last_good_update, c.last_updated)) for c in old_columns
                    )
                else:
                    DEBUG and Log.note("no more metadata to update")
                    META_COLUMNS_DESC.last_updated = now

            pair = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
            if pair:
                if pair is THREAD_STOP:
                    continue
                column, after = pair
                now = Date.now()
                with Timer("review {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                    if column.es_index in self.index_does_not_exist:
                        DEBUG and Log.note(
                            "{{column.es_column}} of {{column.es_index}} does not exist",
                            column=column
                        )
                        self.meta.columns.update({
                            "clear": ".",
                            "where": {"eq": {"es_index": column.es_index}}
                        })
                        continue
                    if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                        # DEBUG and Log.note("{{column.es_column}} is a struct, not scanned", column=column)
                        column.last_updated = now
                        continue
                    elif column.cardinality is None:
                        pass  # NO CARDINALITY MEANS WE MUST GET UPDATE IT
                    elif after and column.last_updated < after:
                        pass  # COLUMN IS TOO OLD
                    elif column.last_updated < now - TOO_OLD:
                        pass  # COLUMN IS WAY TOO OLD
                    else:
                        # DO NOT UPDATE FRESH COLUMN METADATA
                        DEBUG and Log.note(
                            "{{column.es_column}} is still fresh ({{ago}} ago)",
                            column=column,
                            ago=(now - Date(column.last_updated))
                        )
                        continue

                    try:
                        self._update_cardinality(column)
                        (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        if '"status":404' in e:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                            })
                        else:
                            Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
                META_COLUMNS_DESC.last_updated = now
        except Exception as e:
            Log.warning("problem in cardinality monitor", cause=e)
def __init__(self, start, duration, child):
    self.duration = Duration(duration)
    self.start = self.last_value = Date(start).floor(self.duration)
    self.batch = 0
    self.child = child
def life_cycle_watcher(please_stop):
    bad_requests = Data()
    setup_threads = []
    last_get = Date.now()
    setup_in_progress = set()

    while not please_stop:
        spot_requests = self._get_managed_spot_requests()
        instances = wrap({
            i.id: i
            for r in self.ec2_conn.get_all_instances()
            for i in r.instances
        })

        # INSTANCES THAT REQUIRE SETUP
        time_to_stop_trying = {}
        please_setup = [
            (i, r)
            for i, r in [(instances[r.instance_id], r) for r in spot_requests]
            if i.id
            and (not i.tags.get("Name") or i.tags.get("Name") == self.settings.ec2.instance.name + " (setup)")
            and i.id not in setup_in_progress
            and i._state.name == "running"
            and Date.now() > Date(i.launch_time) + DELAY_BEFORE_SETUP
        ]
        for i, r in please_setup:
            if not time_to_stop_trying.get(i.id):
                time_to_stop_trying[i.id] = Date.now() + TIME_FROM_RUNNING_TO_LOGIN
            if Date.now() > time_to_stop_trying[i.id]:
                # FAIL TO SETUP AFTER x MINUTES, THEN TERMINATE INSTANCE
                self.ec2_conn.terminate_instances(instance_ids=[i.id])
                with self.net_new_locker:
                    self.net_new_spot_requests.remove(r.id)
                Log.warning(
                    "Problem with setup of {{instance_id}}. Time is up. Instance TERMINATED!",
                    instance_id=i.id
                )
                continue

            try:
                p = self.settings.utility[i.instance_type]
                if p == None:
                    try:
                        self.ec2_conn.terminate_instances(instance_ids=[i.id])
                        with self.net_new_locker:
                            self.net_new_spot_requests.remove(r.id)
                    finally:
                        Log.error(
                            "Can not setup unknown {{instance_id}} of type {{type}}",
                            instance_id=i.id,
                            type=i.instance_type
                        )

                i.markup = p
                i.add_tag("Name", self.settings.ec2.instance.name + " (setup)")
                setup_in_progress.add(i.id)
                t = Thread.run(
                    "setup for " + text(i.id),
                    track_setup,
                    self.instance_manager.setup,
                    r,
                    i,
                    p
                )
                if SINGLE_THREAD_SETUP:
                    t.join()
                setup_threads.append(t)
            except Exception as e:
                i.add_tag("Name", "")
                Log.warning("Unexpected failure on startup", instance_id=i.id, cause=e)

        if Date.now() - last_get > 5 * SECOND:
            # REFRESH STALE
            spot_requests = self._get_managed_spot_requests()
            last_get = Date.now()

        pending = wrap([
            r for r in spot_requests
            if r.status.code in PENDING_STATUS_CODES
        ])
        give_up = wrap([
            r for r in spot_requests
            if (r.status.code in PROBABLY_NOT_FOR_A_WHILE | TERMINATED_STATUS_CODES)
            and r.id not in bad_requests
        ])
        ignore = wrap([
            r for r in spot_requests
            if r.status.code in MIGHT_HAPPEN
        ])  # MIGHT HAPPEN, BUT NO NEED TO WAIT FOR IT

        if self.done_making_new_spot_requests:
            with self.net_new_locker:
                expired = Date.now() - self.settings.run_interval + 2 * MINUTE
                for ii in list(self.net_new_spot_requests):
                    if Date(ii.create_time) < expired:
                        # SOMETIMES REQUESTS NEVER GET INTO THE MAIN LIST OF REQUESTS
                        self.net_new_spot_requests.remove(ii)
                for g in ignore:
                    self.net_new_spot_requests.remove(g.id)
                pending = UniqueIndex(("id",), data=pending)
                pending = pending | self.net_new_spot_requests

        if give_up:
            self.ec2_conn.cancel_spot_instance_requests(request_ids=give_up.id)
            Log.note(
                "Cancelled spot requests {{spots}}, {{reasons}}",
                spots=give_up.id,
                reasons=give_up.status.code
            )
            for g in give_up:
                bad_requests[g.id] += 1
                if g.id in self.net_new_spot_requests:
                    self.net_new_spot_requests.remove(g.id)
                if g.status.code == "capacity-not-available":
                    self.no_capacity[g.launch_specification.instance_type] = Date.now()
                if g.status.code == "bad-parameters":
                    self.no_capacity[g.launch_specification.instance_type] = Date.now()
                    Log.warning(
                        "bad parameters while requesting type {{type}}",
                        type=g.launch_specification.instance_type
                    )

        if not pending and self.done_making_new_spot_requests:
            Log.note("No more pending spot requests")
            break
        elif pending:
            Log.note("waiting for spot requests: {{pending}}", pending=[p.id for p in pending])

        (Till(seconds=10) | please_stop).wait()

    with Timer("Save no capacity to file"):
        table = [
            {"instance_type": k, "last_failure": v}
            for k, v in self.no_capacity.items()
        ]
        self.no_capacity_file.write(value2json(table, pretty=True))

    # WAIT FOR SETUP TO COMPLETE
    for t in setup_threads:
        t.join()

    Log.note("life cycle watcher has stopped")
def monitor(self, please_stop):
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        try:
            if not self.todo:
                old_columns = [
                    c
                    for c in self.meta.columns
                    if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None)
                    and c.jx_type not in STRUCT
                ]
                if old_columns:
                    DEBUG and Log.note(
                        "Old columns {{names|json}} last updated {{dates|json}}",
                        names=wrap(old_columns).es_column,
                        dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                    )
                    self.todo.extend(old_columns)
                else:
                    DEBUG and Log.note("no more metadata to update")

            column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
            if column:
                if column is THREAD_STOP:
                    continue

                with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                    if column.es_index in self.index_does_not_exist:
                        DEBUG and Log.note("{{column.es_column}} does not exist", column=column)
                        self.meta.columns.update({
                            "clear": ".",
                            "where": {"eq": {"es_index": column.es_index}}
                        })
                        continue
                    if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                        DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                        column.last_updated = Date.now()
                        continue
                    elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                        # DO NOT UPDATE FRESH COLUMN METADATA
                        DEBUG and Log.note(
                            "{{column.es_column}} is still fresh ({{ago}} ago)",
                            column=column,
                            ago=(Date.now() - Date(column.last_updated)).seconds
                        )
                        continue

                    try:
                        self._update_cardinality(column)
                        (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        if '"status":404' in e:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                            })
                        else:
                            Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
        except Exception as e:
            Log.warning("problem in cardinality monitor", cause=e)
def _get_spot_prices_from_aws(self):
    with Timer("Read no capacity file"):
        try:
            # FILE IS LIST OF {instance_type, last_failure} OBJECTS
            content = self.no_capacity_file.read()
            self.no_capacity = dict(
                (r.instance_type, r.last_failure)
                for r in convert.json2value(content, flexible=False, leaves=False)
            )
        except Exception as e:
            self.no_capacity = {}

    with Timer("Read pricing file"):
        try:
            content = File(self.settings.price_file).read()
            cache = convert.json2value(content, flexible=False, leaves=False)
        except Exception as e:
            cache = FlatList()

    cache = ListContainer(name=None, data=cache)
    most_recents = jx.run({
        "from": cache,
        "edges": ["instance_type", "availability_zone"],
        "select": {"value": "timestamp", "aggregate": "max"}
    })

    zones = self._get_valid_availability_zones()
    prices = set(cache)
    with Timer("Get pricing from AWS"):
        for instance_type in self.settings.utility.keys():
            for zone in zones:
                if cache:
                    most_recent = most_recents[{
                        "instance_type": instance_type,
                        "availability_zone": zone
                    }].timestamp
                    start_at = MAX([Date(most_recent), Date.today() - WEEK])
                else:
                    start_at = Date.today() - WEEK

                if DEBUG_PRICING:
                    Log.note(
                        "get pricing for {{instance_type}} starting at {{start_at}}",
                        instance_type=instance_type,
                        start_at=start_at
                    )

                next_token = None
                while True:
                    resultset = self.ec2_conn.get_spot_price_history(
                        product_description=coalesce(self.settings.product, "Linux/UNIX (Amazon VPC)"),
                        instance_type=instance_type,
                        availability_zone=zone,
                        start_time=start_at.format(ISO8601),
                        next_token=next_token
                    )
                    next_token = resultset.next_token

                    for p in resultset:
                        prices.add(wrap({
                            "availability_zone": p.availability_zone,
                            "instance_type": p.instance_type,
                            "price": p.price,
                            "product_description": p.product_description,
                            "region": p.region.name,
                            "timestamp": Date(p.timestamp).unix
                        }))

                    if not next_token:
                        break

    with Timer("Save prices to file"):
        new_prices = jx.filter(prices, {"gte": {"timestamp": {"date": "today-2day"}}})

        def stream():  # IT'S A LOT OF PRICES, STREAM THEM TO FILE
            prefix = "[\n"
            for p in new_prices:
                yield prefix
                yield convert.value2json(p)
                prefix = ",\n"
            yield "]"

        File(self.settings.price_file).write(stream())

    return ListContainer(name="prices", data=prices)
def pull_all_remaining(self, please_stop):
    try:
        try:
            content = File(self.settings.extract.last).read_json()
            if len(content) == 1:
                Log.note("Got a manually generated file {{filename}}", filename=self.settings.extract.last)
                start_point = tuple(content[0])
                first_value = [
                    self._extract.start[0] + (start_point[0] * DAY),
                    start_point[1]
                ]
            else:
                Log.note("Got a machine generated file {{filename}}", filename=self.settings.extract.last)
                start_point, first_value = content
                start_point = tuple(start_point)
            Log.note("First value is {{start1|date}}, {{start2}}", start1=first_value[0], start2=first_value[1])
        except Exception as _:
            Log.error(
                "Expecting a file {{filename}} with the last good S3 bucket etl id in array form eg: [[954, 0]]",
                filename=self.settings.extract.last
            )
            start_point = tuple(self._extract.start)
            first_value = Null

        counter = Counter(start=0)
        for t, s, b, f, i in reversed(zip(
            self._extract.type,
            self._extract.start,
            self._extract.batch,
            listwrap(first_value) + DUMMY_LIST,
            range(len(self._extract.start))
        )):
            if t == "time":
                counter = DurationCounter(start=s, duration=b, child=counter)
                first_value[i] = Date(f)
            else:
                counter = BatchCounter(start=s, size=b, child=counter)

        batch_size = self._extract.batch.last() * 2 * self.settings.extract.threads
        with MySQL(**self.settings.snowflake.database) as db:
            while not please_stop:
                sql = self._build_list_sql(db, first_value, batch_size + 1)
                pending = []
                counter.reset(start_point)
                with Timer("Grab a block of ids for processing"):
                    with closing(db.db.cursor()) as cursor:
                        acc = []
                        cursor.execute(sql)
                        count = 0
                        for row in cursor:
                            detail_key = counter.next(row)
                            key = tuple(detail_key[:-1])
                            count += 1
                            if key != start_point:
                                if first_value:
                                    if not acc:
                                        Log.error(
                                            "not expected, {{filename}} is probably set too far in the past",
                                            filename=self.settings.extract.last
                                        )
                                    pending.append({
                                        "start_point": start_point,
                                        "first_value": first_value,
                                        "data": acc
                                    })
                                acc = []
                                start_point = key
                                first_value = row
                            acc.append(row[-1])  # ASSUME LAST COLUMN IS THE FACT TABLE id

                Log.note("adding {{num}} for processing", num=len(pending))
                self.queue.extend(pending)

                if count < batch_size:
                    self.queue.add(THREAD_STOP)
                    break
    except Exception as e:
        Log.warning("Problem pulling data", cause=e)
    finally:
        self.done_pulling.go()
        Log.note("pulling new data is done")
def sql_time(time):
    return sql_call("TIMESTAMP_MICROS", quote_value(int(Date(time).unix * 1000000)))
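A rough sketch of the SQL this helper emits, assuming quote_value renders a bare integer and sql_call composes FUNC(arg):

# sql_time("2020-01-01")  ~>  TIMESTAMP_MICROS(1577836800000000)
#   since Date("2020-01-01").unix == 1577836800.0, scaled to microseconds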
def _path(timestamp):
    return Date(timestamp).format("%Y/%m/%d")
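A quick sanity check of the path layout, assuming Date accepts epoch seconds or ISO-style strings and format follows strftime codes:

# _path(1577836800)    ~> "2020/01/01"
# _path("2020-01-03")  ~> "2020/01/03"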
def complex_job(
    transactional_db, generic_reference_data, test_repository, extract_job_settings, now
):
    fc = FailureClassification.objects.create(id=1, name="not classified")
    repository_group = RepositoryGroup.objects.create(name="common")
    repo = Repository.objects.create(name="autoland", repository_group=repository_group)

    push = Push.objects.create(
        **{
            "author": "*****@*****.**",
            "repository": repo,
            "revision": "ae6bb3a1066959a8c43d003a3caab0af769455bf",
            "time": unix2datetime(1578427105).replace(tzinfo=None),
        }
    )
    Commit.objects.create(
        push=push,
        revision="ae6bb3a1066959a8c43d003a3caab0af769455bf",
        author="*****@*****.**",
        comments="no comment",
    )
    Commit.objects.create(
        push=push,
        revision="0123456789012345678901234567890123456789",
        author="*****@*****.**",
        comments="no comment2",
    )

    debug = Option.objects.create(name="debug")
    oc = OptionCollection.objects.create(option_collection_hash=Random.base64(5), option=debug)

    job = Job.objects.create(
        autoclassify_status=1,
        guid=Random.base64(20),
        repository=test_repository,
        push_id=push.id,
        signature=generic_reference_data.signature,
        build_platform=generic_reference_data.build_platform,
        machine_platform=generic_reference_data.machine_platform,
        machine=generic_reference_data.machine,
        option_collection_hash=oc.option_collection_hash,
        job_type=generic_reference_data.job_type,
        job_group=generic_reference_data.job_group,
        product=generic_reference_data.product,
        failure_classification_id=fc.id,
        who="*****@*****.**",
        reason="scheduled",
        result="success",
        state="completed",
        submit_time=unix2datetime(1578427253).replace(tzinfo=None),
        start_time=unix2datetime(1578430841).replace(tzinfo=None),
        last_modified=unix2datetime(1578432686.364459).replace(tzinfo=None),
        end_time=unix2datetime(1578432680).replace(tzinfo=None),
        tier=1,
    )

    text_log_step = TextLogStep.objects.create(
        job=job,
        **{
            "finished_line_number": 88739,
            "name": "Unnamed step",
            "result": 7,
            "started_line_number": 0,
        },
    )
    TextLogError.objects.create(step=text_log_step, line="line contents here", line_number=619845839)
    TextLogError.objects.create(step=text_log_step, line="ERROR! more line contents", line_number=6)

    TaskclusterMetadata.objects.create(job=job, retry_id=0, task_id="WWb9ExAvQUa78ku0DIxdSQ")

    JobLog.objects.create(
        **{
            "job_id": job.id,
            "name": "builds-4h",
            "status": 1,
            "url": "https://example.com/api/queue/v1/task/WWb9ExAvQUa78ku0DIxdSQ/runs/0/artifacts/public/logs/live_backing.log",
        }
    )
    job_logs1 = JobLog.objects.create(
        **{
            "job_id": job.id,
            "name": "errorsummary_json",
            "status": 1,
            "url": "https://example.com/api/queue/v1/task/WWb9ExAvQUa78ku0DIxdSQ/runs/0/artifacts/public/test_info/wpt_errorsummary.log",
        }
    )

    bcf = ClassifiedFailure.objects.create(**{"bug_number": 1234567,})
    bcf.created = Date("2020-01-17 12:00:00").datetime
    bcf.save()

    FailureLine.objects.create(
        job_log=job_logs1,
        **{
            "action": "test_groups",
            "best_classification": bcf,
            "best_is_verified": True,
            "repository": repo,
            "job_guid": job.guid,
            "line": 15,
            "modified": 0,
            "stackwalk_stderr": 1578432686,
            "stackwalk_stdout": 1578432686,
        },
    )
    FailureLine.objects.create(
        job_log=job_logs1,
        **{
            "action": "crash",
            "best_classification": bcf,
            "best_is_verified": False,
            "repository": repo,
            "job_guid": job.guid,
            "line": 24031,
            "modified": 0,
            "signature": "@ mozilla::dom::CustomElementData::SetCustomElementDefinition(mozilla::dom::CustomElementDefinition*)",
            "stackwalk_stderr": 1578432686,
            "stackwalk_stdout": 1578432686,
            "test": "/custom-elements/upgrading.html",
        },
    )
    return job
def __init__(self, kwargs=None):
    self.settings = kwargs
    self.schema = SnowflakeSchema(self.settings.snowflake)
    self._extract = extract = kwargs.extract

    # SOME PREP
    get_git_revision()

    # VERIFY WE DO NOT HAVE TOO MANY OTHER PROCESSES WORKING ON STUFF
    with MySQL(**kwargs.snowflake.database) as db:
        processes = None
        try:
            processes = jx.filter(
                db.query("show processlist"),
                {"and": [
                    {"neq": {"Command": "Sleep"}},
                    {"neq": {"Info": "show processlist"}}
                ]}
            )
        except Exception as e:
            Log.warning("no database", cause=e)

        if processes:
            if DEBUG:
                Log.warning("Processes are running\n{{list|json}}", list=processes)
            else:
                Log.error("Processes are running\n{{list|json}}", list=processes)

    extract.type = listwrap(extract.type)
    extract.start = listwrap(extract.start)
    extract.batch = listwrap(extract.batch)
    extract.field = listwrap(extract.field)
    if any(len(extract.type) != len(other) for other in [extract.start, extract.batch, extract.field]):
        Log.error("Expecting same number of dimensions for `type`, `start`, `batch`, and `field` in the `extract` inner object")
    for i, t in enumerate(extract.type):
        if t == "time":
            extract.start[i] = Date(extract.start[i])
            extract.batch[i] = Duration(extract.batch[i])
        elif t == "number":
            pass
        else:
            Log.error('Expecting `extract.type` to be "number" or "time"')

    extract.threads = coalesce(extract.threads, 1)
    self.done_pulling = Signal()
    self.queue = Queue("all batches", max=2 * coalesce(extract.threads, 1), silent=True)

    self.bucket = s3.Bucket(self.settings.destination)
    self.notify = aws.Queue(self.settings.notify)
    Thread.run("get records", self.pull_all_remaining)