def __init__(self, message="ping", every="second", start=None, until=None):
    if is_text(message):
        self.message = show_message(message)
    else:
        self.message = message
    self.every = Duration(every)

    if isinstance(until, Signal):
        self.please_stop = until
    elif until == None:
        self.please_stop = Signal()
    else:
        self.please_stop = Till(Duration(until).seconds)

    self.thread = None
    if start:
        self.thread = Thread.run(
            "repeat",
            _repeat,
            self.message,
            self.every,
            Date(start),
            parent_thread=MAIN_THREAD,
            please_stop=self.please_stop,
        ).release()
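# A hedged usage sketch, not taken from the source: this constructor's signature
# matches the Repeat helper that the last snippet in this listing uses as a
# context manager ("with Repeat('waiting for ADR', every='10second')"), so the
# class name here is an inference; the import path appears in a later snippet.
# `every` accepts anything Duration() can parse; `until` may be an existing
# Signal, a duration, or None.
from time import sleep

from mo_threads.repeat import Repeat


def run_long_task():
    sleep(12)  # stand-in for the slow work being monitored


with Repeat("still working...", every="5second"):
    run_long_task()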
def __init__(
    self,
    interval,  # TIME INTERVAL BETWEEN RUNS
    starting,  # THE TIME TO START THE INTERVAL COUNT
    max_runtime=MAX_RUNTIME,  # LIMIT HOW LONG THE PROCESS IS ALIVE
    wait_for_shutdown=WAIT_FOR_SHUTDOWN,  # LIMIT PATIENCE WHEN ASKING FOR SHUTDOWN, THEN SEND KILL
    process=None,
):
    self.duration = Duration(interval)
    self.starting = coalesce(Date(starting), Date.now())
    self.max_runtime = Duration(max_runtime)
    self.wait_for_shutdown = Duration(wait_for_shutdown)

    # Process parameters
    self.process = process

    # STATE
    self.last_started = None
    self.last_finished = None
    self.run_count = 0
    self.fail_count = 0
    self.current = None
    self.terminator = None  # SIGNAL TO KILL THE PROCESS
    self.next_run = self._next_run()
    self.next = Till(till=self.next_run)
    self.next_run.then(self.run)
def __init__(
    self,
    host,
    index,
    port=9200,
    type="log",
    queue_size=1000,
    batch_size=100,
    kwargs=None,
):
    """
    settings ARE FOR THE ELASTICSEARCH INDEX
    """
    kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds

    self.es = Cluster(kwargs).get_or_create_index(
        schema=json2value(value2json(SCHEMA), leaves=True),
        limit_replicas=True,
        typed=True,
        kwargs=kwargs,
    )
    self.batch_size = batch_size
    self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
    self.queue = Queue("debug logs to es", max=queue_size, silent=True)
    self.worker = Thread.run("add debug logs to es", self._insert_loop)
def __init__(
    self,
    host,
    index,
    port=9200,
    type="log",
    queue_size=1000,
    batch_size=100,
    refresh_interval="1second",
    kwargs=None,
):
    """
    settings ARE FOR THE ELASTICSEARCH INDEX
    """
    kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
    kwargs.host = randoms.sample(listwrap(host), 1)[0]

    rollover_interval = coalesce(kwargs.rollover.interval, kwargs.rollover.max, "year")
    rollover_max = coalesce(kwargs.rollover.max, kwargs.rollover.interval, "year")

    schema = set_default(
        kwargs.schema,
        {"mappings": {kwargs.type: {"properties": {"~N~": {"type": "nested"}}}}},
        json2value(value2json(SCHEMA), leaves=True),
    )

    self.es = RolloverIndex(
        rollover_field={"get": [{"first": "."}, {"literal": "timestamp"}]},
        rollover_interval=rollover_interval,
        rollover_max=rollover_max,
        schema=schema,
        limit_replicas=True,
        typed=True,
        read_only=False,
        kwargs=kwargs,
    )
    self.batch_size = batch_size
    self.queue = Queue("debug logs to es", max=queue_size, silent=True)
    self.worker = Thread.run("add debug logs to es", self._insert_loop)
def __init__(self, field, interval=DAY, expire=NEVER, flake=Null, kwargs=None):
    column = first(flake.leaves(field))
    if not column:
        Log.error("expecting {{field}} in snowflake for partitioning", field=field)
    self.field = column.es_column
    self.interval = Duration(interval)
    self.expire = Duration(expire)
    if not isinstance(self.interval, Duration) or not isinstance(self.expire, Duration):
        Log.error("expecting durations")
def __init__(self, config):
    self.config = config = wrap(config)
    config.range.min = Date(config.range.min)
    config.range.max = Date(config.range.max)
    config.start = Date(config.start)
    config.interval = Duration(config.interval)
    config.branches = listwrap(config.branches)
    self.destination = bigquery.Dataset(config.destination).get_or_create_table(
        config.destination
    )

    # CALCULATE THE PREVIOUS RUN
    mozci_version = self.version("mozci")
    self.etl_config_table = jx_sqlite.Container(
        config.config_db
    ).get_or_create_facts("etl-range")
    done_result = wrap(self.etl_config_table.query()).data
    prev_done = done_result[0]
    if len(done_result) and prev_done.mozci_version == mozci_version:
        self.done = Data(
            mozci_version=mozci_version,
            min=Date(coalesce(prev_done.min, config.start, "today-2day")),
            max=Date(coalesce(prev_done.max, config.start, "today-2day")),
        )
    else:
        self.done = Data(
            mozci_version=mozci_version,
            min=Date(coalesce(config.start, "today-2day")),
            max=Date(coalesce(config.start, "today-2day")),
        )
        self.etl_config_table.add(self.done)
def queue_consumer(pull_queue, please_stop=None):
    queue = aws.Queue(pull_queue)
    time_offset = None
    request_count = 0

    while not please_stop:
        request = queue.pop(till=please_stop)
        if please_stop:
            break
        if not request:
            Log.note("Nothing in queue, pausing for 5 seconds...")
            (please_stop | Till(seconds=5)).wait()
            continue

        if SKIP_TRY_REQUESTS and 'try' in request.where['and'].eq.branch:
            Log.note("Skipping try revision.")
            queue.commit()
            continue

        now = Date.now().unix
        if time_offset is None:
            time_offset = now - request.meta.request_time

        next_request = request.meta.request_time + time_offset
        if next_request > now:
            Log.note("Next request in {{wait_time}}", wait_time=Duration(seconds=next_request - now))
            Till(till=next_request).wait()

        Thread.run("request " + text_type(request_count), one_request, request)
        request_count += 1
        queue.commit()
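# A small worked sketch of the pacing arithmetic above (illustrative numbers,
# not from the source). The first request fixes time_offset, and every later
# request is replayed that same offset after its recorded request_time, so the
# original inter-arrival spacing is preserved.
request_time_first, now_first = 100, 500
time_offset = now_first - request_time_first      # 400
request_time_second = 130
next_request = request_time_second + time_offset  # 530, i.e. 30s after the first replay
assert next_request - now_first == request_time_second - request_time_first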
def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        with SingleInstance(flavor_id=settings.args.filename):
            settings.run_interval = Duration(settings.run_interval)
            for u in settings.utility:
                u.discount = coalesce(u.discount, 0)
                # MARKUP drives WITH EXPECTED device MAPPING
                num_ephemeral_volumes = ephemeral_storage[u.instance_type]["num"]
                for i, d in enumerate(d for d in u.drives if not d.device):
                    letter = convert.ascii2char(98 + num_ephemeral_volumes + i)
                    d.device = "/dev/xvd" + letter

            settings.utility = UniqueIndex(["instance_type"], data=settings.utility)
            instance_manager = new_instance(settings.instance)
            m = SpotManager(instance_manager, kwargs=settings)

            if ENABLE_SIDE_EFFECTS:
                m.update_spot_requests()

            if m.watcher:
                m.watcher.join()
    except Exception as e:
        Log.warning("Problem with spot manager", cause=e)
    finally:
        Log.stop()
        MAIN_THREAD.stop()
def __init__(self,
             from_address,
             to_address,
             subject,
             region,
             aws_access_key_id=None,
             aws_secret_access_key=None,
             cc=None,
             log_type="ses",
             average_interval=HOUR,
             kwargs=None):
    """
    SEND WARNINGS AND ERRORS VIA EMAIL

    settings = {
        "log_type": "ses",
        "from_address": "*****@*****.**",
        "to_address": "*****@*****.**",
        "cc": [
            {"to_address": "*****@*****.**", "where": {"eq": {"template": "gr"}}}
        ],
        "subject": "[ALERT][STAGING] Problem in ETL",
        "aws_access_key_id": "userkey",
        "aws_secret_access_key": "secret",
        "region": "us-west-2"
    }
    """
    assert kwargs.log_type == "ses", "Expecting settings to be of type 'ses'"
    self.settings = kwargs
    self.accumulation = []
    self.cc = listwrap(cc)
    self.next_send = Date.now() + MINUTE
    self.locker = Lock()
    self.settings.average_interval = Duration(kwargs.average_interval)
def __init__(self, config):
    self.config = config = wrap(config)
    config.range.min = Date(config.range.min)
    config.range.max = Date(config.range.max)
    config.start = Date(config.start)
    config.interval = Duration(config.interval)
    config.branches = listwrap(config.branches)
    self.destination = bigquery.Dataset(
        config.destination).get_or_create_table(config.destination)

    # CALCULATE THE PREVIOUS RUN
    mozci_version = self.version("mozci")
    prev_done = self.get_state()
    if prev_done and prev_done.mozci_version == mozci_version:
        self.done = Data(
            mozci_version=mozci_version,
            min=Date(coalesce(prev_done.min, config.start, "today-2day")),
            max=Date(coalesce(prev_done.max, config.start, "today-2day")),
        )
    else:
        self.done = Data(
            mozci_version=mozci_version,
            min=Date(coalesce(config.start, "today-2day")),
            max=Date(coalesce(config.start, "today-2day")),
        )
    self.set_state()
def _request_spot_instances(self, price, availability_zone_group, instance_type, kwargs):
    kwargs.self = None
    kwargs.kwargs = None

    # m3 INSTANCES ARE NOT ALLOWED PLACEMENT GROUP
    if instance_type.startswith("m3."):
        kwargs.placement_group = None

    kwargs.network_interfaces = NetworkInterfaceCollection(
        *(
            NetworkInterfaceSpecification(**i)
            for i in listwrap(kwargs.network_interfaces)
            if self.vpc_conn.get_all_subnets(
                subnet_ids=i.subnet_id,
                filters={"availabilityZone": availability_zone_group})
        ))

    if len(kwargs.network_interfaces) == 0:
        Log.error(
            "No network interface specifications found for {{availability_zone}}!",
            availability_zone=kwargs.availability_zone_group)

    block_device_map = BlockDeviceMapping()

    # GENERIC BLOCK DEVICE MAPPING
    for dev, dev_settings in kwargs.block_device_map.items():
        block_device_map[dev] = BlockDeviceType(delete_on_termination=True, **dev_settings)
    kwargs.block_device_map = block_device_map

    # INCLUDE EPHEMERAL STORAGE IN BlockDeviceMapping
    num_ephemeral_volumes = ephemeral_storage[instance_type]["num"]
    for i in range(num_ephemeral_volumes):
        letter = convert.ascii2char(98 + i)  # START AT "b"
        kwargs.block_device_map["/dev/sd" + letter] = BlockDeviceType(
            ephemeral_name='ephemeral' + text(i),
            delete_on_termination=True)

    if kwargs.expiration:
        kwargs.valid_until = (Date.now() + Duration(kwargs.expiration)).format(ISO8601)
        kwargs.expiration = None

    # ATTACH NEW EBS VOLUMES
    for i, drive in enumerate(self.settings.utility[instance_type].drives):
        letter = convert.ascii2char(98 + i + num_ephemeral_volumes)
        device = drive.device = coalesce(drive.device, "/dev/sd" + letter)
        d = drive.copy()
        d.path = None  # path AND device PROPERTY IS NOT ALLOWED IN THE BlockDeviceType
        d.device = None
        if d.size:
            kwargs.block_device_map[device] = BlockDeviceType(
                delete_on_termination=True, **d)

    output = list(self.ec2_conn.request_spot_instances(**kwargs))
    return output
def setup(
    self,
    instance,  # THE boto INSTANCE OBJECT FOR THE MACHINE TO SETUP
    utility    # THE utility OBJECT FOUND IN CONFIG
):
    with self.locker:
        if not self.settings.setup_timeout:
            Log.error("expecting instance.setup_timeout to prevent setup from locking")

        def worker(please_stop):
            cpu_count = int(round(utility.cpu))

            with hide('output'):
                Log.note("setup {{instance}}", instance=instance.id)
                self._config_fabric(instance)

                Log.note("update packages on {{instance}} ip={{ip}}",
                         instance=instance.id,
                         ip=instance.ip_address)
                try:
                    self._update_ubuntu_packages()
                except Exception as e:
                    Log.warning("Can not setup {{instance}}, type={{type}}",
                                instance=instance.id,
                                type=instance.instance_type,
                                cause=e)
                    return

                Log.note("setup etl on {{instance}}", instance=instance.id)
                self._setup_etl_code()
                Log.note("setup grcov on {{instance}}", instance=instance.id)
                self._setup_grcov()
                Log.note("add config file on {{instance}}", instance=instance.id)
                self._add_private_file()
                Log.note("setup supervisor on {{instance}}", instance=instance.id)
                self._setup_etl_supervisor(cpu_count)
                Log.note("setup done {{instance}}", instance=instance.id)

        worker_thread = Thread.run(
            "etl setup started at " + unicode(Date.now().format()), worker)

        (Till(timeout=Duration(self.settings.setup_timeout).seconds) | worker_thread.stopped).wait()

        if not worker_thread.stopped:
            Log.error("critical failure in thread {{name|quote}}", name=worker_thread.name)
        worker_thread.join()
def __init__(self,
             from_address,
             to_address,
             subject,
             region,
             aws_access_key_id=None,
             aws_secret_access_key=None,
             cc=None,
             log_type="ses",
             max_interval=HOUR,
             kwargs=None):
    assert kwargs.log_type == "ses", "Expecting settings to be of type 'ses'"
    self.settings = kwargs
    self.accumulation = []
    self.cc = listwrap(cc)
    self.next_send = Date.now() + MINUTE
    self.locker = Lock()
    self.settings.max_interval = Duration(kwargs.max_interval)
def __init__(
    self,
    from_address,
    to_address,
    subject,
    host,
    username,
    password,
    port=465,
    use_ssl=1,
    cc=None,
    log_type="email",
    max_interval=HOUR,
    kwargs=None
):
    """
    SEND WARNINGS AND ERRORS VIA EMAIL

    settings = {
        "log_type": "email",
        "from_address": "*****@*****.**",
        "to_address": "*****@*****.**",
        "cc": [
            {"to_address": "*****@*****.**", "where": {"eq": {"template": "gr"}}}
        ],
        "subject": "Problem in Pulse Logger",
        "host": "mail.mozilla.com",
        "port": 465,
        "username": "******",
        "password": "******",
        "use_ssl": 1
    }
    """
    assert kwargs.log_type == "email", "Expecting settings to be of type 'email'"
    self.settings = kwargs
    self.accumulation = []
    self.cc = listwrap(cc)
    self.next_send = Date.now() + MINUTE
    self.locker = Lock()
    self.settings.max_interval = Duration(kwargs.max_interval)
def __init__(self, instance_manager, disable_prices=False, kwargs=None):
    self.settings = kwargs
    self.instance_manager = instance_manager
    aws_args = dict(
        region_name=kwargs.aws.region,
        aws_access_key_id=unwrap(kwargs.aws.aws_access_key_id),
        aws_secret_access_key=unwrap(kwargs.aws.aws_secret_access_key))
    self.ec2_conn = boto.ec2.connect_to_region(**aws_args)
    self.vpc_conn = boto.vpc.connect_to_region(**aws_args)
    self.price_locker = Lock()
    self.prices = None
    self.price_lookup = None
    self.no_capacity = {}
    self.no_capacity_file = File(kwargs.price_file).parent / "no capacity.json"
    self.done_making_new_spot_requests = Signal()
    self.net_new_locker = Lock()
    self.net_new_spot_requests = UniqueIndex(("id",))  # SPOT REQUESTS FOR THIS SESSION
    self.watcher = None
    self.active = None

    self.settings.uptime.bid_percentile = coalesce(
        self.settings.uptime.bid_percentile, self.settings.bid_percentile)
    self.settings.uptime.history = coalesce(
        Date(self.settings.uptime.history), DAY)
    self.settings.uptime.duration = coalesce(
        Duration(self.settings.uptime.duration), Date("5minute"))
    self.settings.max_percent_per_type = coalesce(
        self.settings.max_percent_per_type, 1)

    if ENABLE_SIDE_EFFECTS and instance_manager and instance_manager.setup_required():
        self._start_life_cycle_watcher()
    if not disable_prices:
        self.pricing()
def main():
    since = Date.today() - Duration(SCATTER_RANGE)

    if config.database.host not in listwrap(config.analysis.expected_database_host):
        Log.error("Expecting database to be one of {{expected}}",
                  expected=config.analysis.expected_database_host)

    if not config.analysis.interesting:
        Log.alert(
            "Expecting config file to have `analysis.interesting` with a json expression. All series are included."
        )

    # SETUP DESTINATION
    deviant_summary = bigquery.Dataset(config.deviant_summary).get_or_create_table(
        read_only=True, kwargs=config.deviant_summary)

    if config.args.id:
        # EXIT EARLY AFTER WE GOT THE SPECIFIC IDS
        if len(config.args.id) < 4:
            step_detector.SHOW_CHARTS = True
        for signature_hash in config.args.id:
            process(
                signature_hash,
                since=since,
                source=config.database,
                deviant_summary=deviant_summary,
                show=True,
            )
        return

    # DOWNLOAD
    if config.args.download:
        # GET INTERESTING SERIES
        where_clause = BQLang[jx_expression(config.analysis.interesting)].to_bq(
            deviant_summary.schema)

        # GET ALL KNOWN SERIES
        docs = list(deviant_summary.sql_query(f"""
            SELECT * EXCEPT (_rank, values)
            FROM (
                SELECT *, row_number() over (partition by id order by last_updated desc) as _rank
                FROM {quote_column(deviant_summary.full_name)}
            ) a
            WHERE _rank=1 and {sql_iso(where_clause)}
            LIMIT {quote_value(DOWNLOAD_LIMIT)}
        """))
        if len(docs) == DOWNLOAD_LIMIT:
            Log.warning("Not all signatures downloaded")
        File(config.args.download).write(list2tab(docs, separator=","))

    # DEVIANT
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "overall_dev_score"}, "sort": "desc"},
        limit=config.args.deviant,
        show_old=False,
        show_distribution=True,
    )

    # MODAL
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort="overall_dev_score",
        limit=config.args.modal,
        where={"eq": {"overall_dev_status": "MODAL"}},
        show_distribution=True,
    )

    # OUTLIERS
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": "overall_dev_score", "sort": "desc"},
        limit=config.args.outliers,
        where={"eq": {"overall_dev_status": "OUTLIERS"}},
        show_distribution=True,
    )

    # SKEWED
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "overall_dev_score"}, "sort": "desc"},
        limit=config.args.skewed,
        where={"eq": {"overall_dev_status": "SKEWED"}},
        show_distribution=True,
    )

    # OK
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "overall_dev_score"}, "sort": "desc"},
        limit=config.args.ok,
        where={"eq": {"overall_dev_status": "OK"}},
        show_distribution=True,
    )

    # NOISE
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "relative_noise"}, "sort": "desc"},
        where={"gte": {"num_pushes": 30}},
        limit=config.args.noise,
    )

    # EXTRA
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "max_extra_diff"}, "sort": "desc"},
        where={"lte": {"num_new_segments": 7}},
        limit=config.args.extra,
    )

    # MISSING
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "max_missing_diff"}, "sort": "desc"},
        where={"lte": {"num_old_segments": 6}},
        limit=config.args.missing,
    )

    # PATHOLOGICAL
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": "num_segments", "sort": "desc"},
        limit=config.args.pathological,
    )
def __init__(self, start, duration, child):
    self.duration = Duration(duration)
    self.start = self.last_value = Date(start).floor(self.duration)
    self.batch = 0
    self.child = child
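# A brief sketch (assumed, not from the source) of the flooring above:
# Date.floor(duration) snaps a timestamp down to a whole-interval boundary, so
# batching always starts on a round edge of the chosen Duration.
from mo_times import Date, Duration

start = Date.now().floor(Duration("day"))  # midnight (GMT) at the start of today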
def _cleaner(self, please_stop):
    while not please_stop:
        (please_stop | Till(seconds=Duration(WRITE_INTERVAL).total_seconds())).wait()
        self.clean()
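# A minimal sketch (not from the source) of the wait idiom used by _cleaner:
# OR-ing a shutdown Signal with a Till timer gives a signal that fires on
# whichever happens first, so the loop wakes on schedule yet exits promptly on
# shutdown. The "2second" interval stands in for the module's WRITE_INTERVAL.
from mo_threads import Signal, Till
from mo_times import Duration

please_stop = Signal()
(please_stop | Till(seconds=Duration("2second").total_seconds())).wait()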
def parse_time_interval(self, interval: str) -> timedelta:
    duration = Duration(interval)
    return timedelta(seconds=duration.total_seconds())
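# A small usage sketch (assumed, not from the source): Duration accepts interval
# strings such as "30second" or "2hour", and the conversion above turns them
# into a standard datetime.timedelta for code that does not know mo_times.
from datetime import timedelta

from mo_times import Duration

delta = timedelta(seconds=Duration("2hour").total_seconds())
assert delta == timedelta(hours=2)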
from mo_threads import Lock, Signal, Thread, Till
from mo_threads.threads import MAIN_THREAD
from mo_times import DAY, Date, Duration, HOUR, MINUTE, SECOND, Timer, WEEK
from pyLibrary import convert
from pyLibrary.meta import cache, new_instance

_please_import = http

SINGLE_THREAD_SETUP = False
ENABLE_SIDE_EFFECTS = True
ALLOW_SHUTDOWN = True
DEBUG_PRICING = True
TIME_FROM_RUNNING_TO_LOGIN = 7 * MINUTE
ERROR_ON_CALL_TO_SETUP = "Problem with setup()"
DELAY_BEFORE_SETUP = 1 * MINUTE  # PROBLEM WITH CONNECTING ONLY HAPPENS WITH BIGGER ES MACHINES
CAPACITY_NOT_AVAILABLE_RETRY = Duration("day")  # SOME MACHINES ARE NOT AVAILABLE


class SpotManager(object):
    @override
    def __init__(self, instance_manager, disable_prices=False, kwargs=None):
        self.settings = kwargs
        self.instance_manager = instance_manager
        aws_args = dict(
            region_name=kwargs.aws.region,
            aws_access_key_id=unwrap(kwargs.aws.aws_access_key_id),
            aws_secret_access_key=unwrap(kwargs.aws.aws_secret_access_key))
        self.ec2_conn = boto.ec2.connect_to_region(**aws_args)
        self.vpc_conn = boto.vpc.connect_to_region(**aws_args)
        self.price_locker = Lock()
        self.prices = None
    set_default,
)
from mo_json import value2json, json2value
from mo_logs import startup, constants, Log
from mo_threads import Process, Till
from mo_threads.repeat import Repeat
from mo_times import Date, Duration, Timer, MINUTE
from pyLibrary.env import git
from pyLibrary.meta import extend

MAX_RUNTIME = "50minute"  # STOP PROCESSING AFTER THIS GIVEN TIME
DEFAULT_START = "today-2day"
LOOK_BACK = 30
LOOK_FORWARD = 30

CACHY_STATE = "cia-tasks/etl/schedules"
CACHY_RETENTION = Duration("30day") / MINUTE
SHOW_S3_CACHE_HIT = True

SECRET_PREFIX = "project/cia/smart-scheduling"
SECRET_NAMES = [
    "destination.account_info",
]


def inject_secrets(config):
    """
    INJECT THE SECRETS INTO THE CONFIGURATION
    :param config: CONFIG DATA

    ************************************************************************
    ** ENSURE YOU HAVE AN ENVIRONMENT VARIABLE SET:
    ** TASKCLUSTER_ROOT_URL = https://community-tc.services.mozilla.com
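# A short sketch (assumed, not from the source) of the CACHY_RETENTION
# arithmetic above: dividing one Duration by another is expected to yield a
# plain ratio, so Duration("30day") / MINUTE is the retention expressed in
# minutes.
from mo_times import Duration, MINUTE

retention_minutes = Duration("30day") / MINUTE
assert retention_minutes == 30 * 24 * 60  # 43200 minutes in 30 days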
def process(
    about_deviant,
    since,
    source,
    deviant_summary,
    show=False,
    show_limit=MAX_POINTS,
    show_old=False,
    show_distribution=None,
):
    """
    :param about_deviant: Record describing the signature (expects `id` and `overall_dev_status`)
    :param since: Only data after this date
    :param show:
    :param show_limit:
    :param show_old:
    :param show_distribution:
    :return:
    """
    sig_id = about_deviant.id
    if not isinstance(sig_id, int):
        Log.error("expecting id")

    # GET SIGNATURE DETAILS
    sig = get_signature(db_config=source, signature_id=sig_id)

    # GET DATA POINTS FOR SIGNATURE
    data = get_dataum(source, sig.id, since=since, limit=show_limit)

    min_date = since.unix
    pushes = jx.sort(
        [
            {
                "value": median(rows.value),
                "runs": rows,
                "push": {"time": unwrap(t)["push.time"]},
            }
            for t, rows in jx.groupby(data, "push.time")
            if t["push\\.time"] > min_date
        ],
        "push.time",
    )

    values = list(pushes.value)
    title = "-".join(
        map(
            str,
            [
                sig.id,
                sig.framework,
                sig.suite,
                sig.test,
                sig.repository,
                sig.platform,
                about_deviant.overall_dev_status,
            ],
        ))
    # EG https://treeherder.mozilla.org/perf.html#/graphs?highlightAlerts=1&series=mozilla-central,fee739b45f7960e4a520d8e0bd781dd9d0a3bec4,1,10&timerange=31536000
    url = "https://treeherder.mozilla.org/perf.html#/graphs?" + value2url_param({
        "highlightAlerts": 1,
        "series": [
            sig.repository, sig.id, 1, coalesce(sig.framework_id, sig.framework)
        ],
        "timerange": Duration(TREEHERDER_RANGE).seconds
    })
    Log.note("With {{title}}: {{url}}", title=title, url=url)

    with Timer("find segments"):
        new_segments, new_diffs = find_segments(
            values, sig.alert_change_type, sig.alert_threshold)

    # USE PERFHERDER ALERTS TO IDENTIFY OLD SEGMENTS
    old_segments = tuple(
        sorted(
            set(
                [i for i, p in enumerate(pushes) if any(r.alert.id for r in p.runs)]
                + [0, len(pushes)])))
    old_medians = [0.0] + [
        np.median(values[s:e])
        for s, e in zip(old_segments[:-1], old_segments[1:])
    ]
    old_diffs = np.array(
        [b / a - 1 for a, b in zip(old_medians[:-1], old_medians[1:])] + [0])

    if len(new_segments) == 1:
        overall_dev_status = None
        overall_dev_score = None
        last_mean = None
        last_std = None
        last_dev_status = None
        last_dev_score = None
        relative_noise = None
        Log.note("not ")
    else:
        # NOISE OF LAST SEGMENT
        s, e = new_segments[-2], new_segments[-1]
        last_segment = np.array(values[s:e])
        ignore = IGNORE_TOP
        trimmed_segment = last_segment[np.argsort(last_segment)[ignore:-ignore]]
        last_mean = np.mean(trimmed_segment)
        last_std = np.std(trimmed_segment)
        last_dev_status, last_dev_score = deviance(trimmed_segment)
        relative_noise = last_std / last_mean

        # FOR EACH SEGMENT, NORMALIZE MEAN AND VARIANCE
        normalized = []
        for s, e in jx.pairs(new_segments):
            data = np.array(values[s:e])
            norm = (data + last_mean - np.mean(data)) * last_std / np.std(data)
            normalized.extend(norm)

        overall_dev_status, overall_dev_score = deviance(normalized)
        Log.note(
            "\n\tdeviance = {{deviance}}\n\tnoise={{std}}\n\tpushes={{pushes}}\n\tsegments={{num_segments}}",
            title=title,
            deviance=(overall_dev_status, overall_dev_score),
            std=relative_noise,
            pushes=len(values),
            num_segments=len(new_segments) - 1,
        )

        if show_distribution:
            histogram(trimmed_segment, title=last_dev_status + "=" + text(last_dev_score))

    max_extra_diff = None
    max_missing_diff = None
    _is_diff = is_diff(new_segments, old_segments)
    if _is_diff:
        # FOR MISSING POINTS, CALC BIGGEST DIFF
        max_extra_diff = mo_math.MAX(
            abs(d)
            for s, d in zip(new_segments, new_diffs)
            if all(not (s - TOLERANCE <= o <= s + TOLERANCE) for o in old_segments))
        max_missing_diff = mo_math.MAX(
            abs(d)
            for s, d in zip(old_segments, old_diffs)
            if all(not (s - TOLERANCE <= n <= s + TOLERANCE) for n in new_segments))

        Log.alert(
            "Disagree max_extra_diff={{max_extra_diff|round(places=3)}}, max_missing_diff={{max_missing_diff|round(places=3)}}",
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
        )
        Log.note("old={{old}}, new={{new}}", old=old_segments, new=new_segments)
    else:
        Log.note("Agree")

    if show and len(pushes):
        show_old and assign_colors(values, old_segments, title="OLD " + title)
        assign_colors(values, new_segments, title="NEW " + title)
        if url:
            webbrowser.open(url)

    if isinstance(deviant_summary, bigquery.Table):
        Log.note("BigQuery summary not updated")
        return

    deviant_summary.upsert(
        where={"eq": {"id": sig.id}},
        doc=Data(
            id=sig_id,
            title=title,
            num_pushes=len(values),
            num_segments=len(new_segments) - 1,
            relative_noise=relative_noise,
            overall_dev_status=overall_dev_status,
            overall_dev_score=overall_dev_score,
            last_mean=last_mean,
            last_std=last_std,
            last_dev_status=last_dev_status,
            last_dev_score=last_dev_score,
            last_updated=Date.now(),
            is_diff=_is_diff,
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
            num_new_segments=len(new_segments),
            num_old_segments=len(old_segments),
        ),
    )
def __init__(self, kwargs=None):
    self.settings = kwargs
    self.schema = SnowflakeSchema(self.settings.snowflake)
    self._extract = extract = kwargs.extract

    # SOME PREP
    get_git_revision()

    # VERIFY WE DO NOT HAVE TOO MANY OTHER PROCESSES WORKING ON STUFF
    with MySQL(**kwargs.snowflake.database) as db:
        processes = None
        try:
            processes = jx.filter(
                db.query("show processlist"),
                {"and": [
                    {"neq": {"Command": "Sleep"}},
                    {"neq": {"Info": "show processlist"}}
                ]})
        except Exception as e:
            Log.warning("no database", cause=e)

        if processes:
            if DEBUG:
                Log.warning("Processes are running\n{{list|json}}", list=processes)
            else:
                Log.error("Processes are running\n{{list|json}}", list=processes)

    extract.type = listwrap(extract.type)
    extract.start = listwrap(extract.start)
    extract.batch = listwrap(extract.batch)
    extract.field = listwrap(extract.field)
    if any(
        len(extract.type) != len(other)
        for other in [extract.start, extract.batch, extract.field]
    ):
        Log.error(
            "Expecting same number of dimensions for `type`, `start`, `batch`, and `field` in the `extract` inner object"
        )
    for i, t in enumerate(extract.type):
        if t == "time":
            extract.start[i] = Date(extract.start[i])
            extract.batch[i] = Duration(extract.batch[i])
        elif t == "number":
            pass
        else:
            Log.error('Expecting `extract.type` to be "number" or "time"')

    extract.threads = coalesce(extract.threads, 1)
    self.done_pulling = Signal()
    self.queue = Queue("all batches", max=2 * coalesce(extract.threads, 1), silent=True)

    self.bucket = s3.Bucket(self.settings.destination)
    self.notify = aws.Queue(self.settings.notify)
    Thread.run("get records", self.pull_all_remaining)
def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        # SHUNT PYTHON LOGGING TO MAIN LOGGING
        capture_logging()
        # SHUNT ADR LOGGING TO MAIN LOGGING
        # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.add
        capture_loguru()

        if config.taskcluster:
            inject_secrets(config)

        @extend(Configuration)
        def update(self, config):
            """
            Update the configuration object with new parameters
            :param config: dict of configuration
            """
            for k, v in config.items():
                if v != None:
                    self._config[k] = v

            self._config["sources"] = sorted(
                map(os.path.expanduser, set(self._config["sources"])))

            # Use the NullStore by default. This allows us to control whether
            # caching is enabled or not at runtime.
            self._config["cache"].setdefault("stores", {"null": {"driver": "null"}})
            object.__setattr__(self, "cache", CustomCacheManager(self._config))

            for _, store in self._config["cache"]["stores"].items():
                if store.path and not store.path.endswith("/"):
                    # REQUIRED, OTHERWISE FileStore._create_cache_directory() WILL LOOK AT PARENT DIRECTORY
                    store.path = store.path + "/"

        if SHOW_S3_CACHE_HIT:
            s3_get = S3Store._get

            @extend(S3Store)
            def _get(self, key):
                with Timer("get {{key}} from S3", {"key": key}, verbose=False) as timer:
                    output = s3_get(self, key)
                    if output is not None:
                        timer.verbose = True
                    return output

        # UPDATE ADR CONFIGURATION
        with Repeat("waiting for ADR", every="10second"):
            adr.config.update(config.adr)

            # DUMMY TO TRIGGER CACHE
            make_push_objects(
                from_date=Date.today().format(),
                to_date=Date.now().format(),
                branch="autoland")

        outatime = Till(seconds=Duration(MAX_RUNTIME).total_seconds())
        outatime.then(lambda: Log.alert("Out of time, exit early"))
        Schedulers(config).process(outatime)
    except Exception as e:
        Log.warning("Problem with etl! Shutting down.", cause=e)
    finally:
        Log.stop()