def __init__(self, edge, query, limit):
    """
    Prepare a decoder for an edge whose domain is described by dimension fields.

    :param edge: the edge being decoded; its `allowNulls` is forced to False
    :param query: the query; `query.limit` is the fallback for the domain limit
    :param limit: handed straight to `AggsDecoder.__init__`
    """
    AggsDecoder.__init__(self, edge, query, limit)
    edge.allowNulls = False
    self.fields = edge.domain.dimension.fields
    self.domain = self.edge.domain

    # DOMAIN LIMIT FIRST, THEN QUERY LIMIT, THEN 10; NEVER MORE THAN MAX_LIMIT
    requested = coalesce(self.domain.limit, query.limit, 10)
    self.domain.limit = mo_math.min(requested, MAX_LIMIT)
    self.parts = []
def __init__(self, edge, query, limit):
    """
    Prepare a decoder that buckets on the value of a scripted expression.

    :param edge: the edge being decoded; its domain receives the tempered limit
    :param query: the query; supplies the fallback limit and the sort clauses
    :param limit: handed straight to `AggsDecoder.__init__`
    """
    AggsDecoder.__init__(self, edge, query, limit)
    self.domain = edge.domain

    # DOMAIN LIMIT FIRST, THEN QUERY LIMIT, THEN 10; NEVER MORE THAN MAX_LIMIT
    requested = coalesce(self.domain.limit, query.limit, 10)
    self.domain.limit = mo_math.min(requested, MAX_LIMIT)

    self.parts = []
    self.key2index = {}
    self.computed_domain = False

    # COMPILE THE EDGE VALUE ONCE; EVERYTHING BELOW IS DERIVED FROM THE SCRIPT
    painless = Painless[self.edge.value].partial_eval()
    self.script = painless.to_es_script(self.schema)
    self.pull = pull_functions[self.script.data_type]
    self.missing = self.script.miss.partial_eval()
    self.exists = NotOp(self.missing).partial_eval()

    # WHEN SORT VALUE AND EDGE VALUE MATCHES, WE SORT BY TERM
    matching_sorts = [s for s in query.sort if s.value == edge.value]
    if not matching_sorts:
        self.es_order = None
    else:
        direction = {1: "asc", -1: "desc"}[matching_sorts[0].sort]
        self.es_order = {"_term": direction}
def __init__(self, edge, query, limit):
    """
    Prepare a decoder for an edge whose domain is described by dimension fields.

    :param edge: the edge being decoded; its `allowNulls` is forced to False
    :param query: the query; `query.limit` is the fallback for the domain limit
    :param limit: handed straight to `AggsDecoder.__init__`
    """
    AggsDecoder.__init__(self, edge, query, limit)
    edge.allowNulls = False
    self.fields = edge.domain.dimension.fields
    self.domain = self.edge.domain

    # DOMAIN LIMIT FIRST, THEN QUERY LIMIT, THEN 10; NEVER MORE THAN MAX_LIMIT
    uncapped = coalesce(self.domain.limit, query.limit, 10)
    self.domain.limit = mo_math.min(uncapped, MAX_LIMIT)
    self.parts = []
def temper_limit(proposed_limit, query):
    """
    PICK A SUITABLE LIMIT FOR THE QUERY

    :param proposed_limit: preferred limit, may be missing
    :param query: the query; `query.limit` is the next choice
    :return: for bulk queries, the first available of (proposed_limit,
             query.limit) with no cap; otherwise the first available of
             (proposed_limit, query.limit, DEFAULT_LIMIT) capped at MAX_LIMIT
    """
    # IMPORTED HERE, NOT AT MODULE LEVEL, AS IN THE ORIGINAL
    from jx_elasticsearch.es52.agg_bulk import is_bulk_agg
    from jx_elasticsearch.es52.set_bulk import is_bulk_set

    bulk = is_bulk_agg(Null, query) or is_bulk_set(Null, query)
    if bulk:
        # BULK QUERIES ARE NOT CAPPED
        return coalesce(proposed_limit, query.limit)

    uncapped = coalesce(proposed_limit, query.limit, DEFAULT_LIMIT)
    return mo_math.min(uncapped, MAX_LIMIT)
def wrap(query, container, namespace):
    """
    NORMALIZE QUERY SO IT CAN STILL BE JSON

    :param query: a QueryOp (returned untouched), None (returned untouched),
                  or a JSON-like query description
    :param container: used to look up the table named by query['from']
    :param namespace: not used by this function body
    :return: a QueryOp with select, edges/groupby, where, window, sort and
             limit normalized
    """
    if is_op(query, QueryOp) or query == None:
        return query

    # NOTE(review): `wrap` below is presumably the data-wrapping helper, not this
    # function (which would recurse) - confirm this def is resolved inside a class
    query = wrap(query)
    table = container.get_table(query['from'])
    schema = table.schema

    capped_limit = mo_math.min(MAX_LIMIT, coalesce(query.limit, DEFAULT_LIMIT))
    op = QueryOp(frum=table, format=query.format, limit=capped_limit)

    # SELECT: EXPLICIT SELECT WINS, AGGREGATES GET THE DEFAULT, OTHERWISE "."
    if query.select or isinstance(query.select, (Mapping, list)):
        op.select = _normalize_selects(query.select, query.frum, schema=schema)
    elif query.edges or query.groupby:
        op.select = DEFAULT_SELECT
    else:
        op.select = _normalize_selects(".", query.frum)

    # EDGES AND GROUPBY ARE MUTUALLY EXCLUSIVE
    if query.groupby and query.edges:
        Log.error("You can not use both the `groupby` and `edges` clauses in the same query!")
    elif query.edges:
        op.edges = _normalize_edges(query.edges, limit=op.limit, schema=schema)
        op.groupby = Null
    elif query.groupby:
        op.edges = Null
        op.groupby = _normalize_groupby(query.groupby, limit=op.limit, schema=schema)
    else:
        op.edges = Null
        op.groupby = Null

    where_clause = {"and": listwrap(query.where)}
    op.where = _normalize_where(where_clause, schema=schema)
    op.window = [_normalize_window(w) for w in listwrap(query.window)]
    op.having = None
    op.sort = _normalize_sort(query.sort)

    limit_is_valid = mo_math.is_integer(op.limit) and not op.limit < 0
    if not limit_is_valid:
        Log.error("Expecting limit >= 0")

    op.isLean = query.isLean
    return op
def __getslice__(self, i, j):
    """
    Read the bytes in [i, j) from the underlying file and decode them.

    :param i: offset to seek to before reading
    :param j: end offset (exclusive); clipped to len(self)
    :return: the decoded text of the slice
    Log.error is called when the slice is larger than 2**28 bytes, or when the
    read/decode fails (the original exception is attached as the cause).
    """
    j = mo_math.min(j, len(self))
    # GUARD ON THE SIZE OF THE SLICE (j - i), NOT ON THE END OFFSET;
    # A SMALL SLICE FAR INTO A LARGE FILE IS FINE
    if j - i > 2 ** 28:
        Log.error("Slice of {{num}} bytes is too big", num=j - i)
    try:
        self.file.seek(i)
        output = self.file.read(j - i).decode(self.encoding)
        return output
    except Exception as e:
        Log.error(
            "Can not read file slice at {{index}}, with encoding {{encoding}}",
            index=i,
            encoding=self.encoding,
            cause=e
        )
def __getslice__(self, i, j):
    """
    Read the bytes in [i, j) from the underlying file and decode them.

    :param i: offset to seek to before reading
    :param j: end offset (exclusive); clipped to len(self)
    :return: the decoded text of the slice
    Log.error is called when the slice is larger than 2**28 bytes, or when the
    read/decode fails (the original exception is attached as the cause).
    """
    j = mo_math.min(j, len(self))
    # GUARD ON THE SIZE OF THE SLICE (j - i), NOT ON THE END OFFSET;
    # A SMALL SLICE FAR INTO A LARGE FILE IS FINE
    if j - i > 2 ** 28:
        Log.error("Slice of {{num}} bytes is too big", num=j - i)
    try:
        self.file.seek(i)
        output = self.file.read(j - i).decode(self.encoding)
        return output
    except Exception as e:
        Log.error(
            "Can not read file slice at {{index}}, with encoding {{encoding}}",
            index=i,
            encoding=self.encoding,
            cause=e
        )
def wrap(query, container, namespace):
    """
    NORMALIZE QUERY SO IT CAN STILL BE JSON

    :param query: a QueryOp (returned untouched), None (returned untouched),
                  or a JSON-like query description
    :param container: used to look up the table named by query['from']
    :param namespace: not used by this function body
    :return: a QueryOp with select, edges/groupby, where, window, sort and
             limit normalized
    """
    if is_op(query, QueryOp) or query == None:
        return query

    # NOTE(review): `wrap` below is presumably the data-wrapping helper, not this
    # function (which would recurse) - confirm this def is resolved inside a class
    query = wrap(query)
    table = container.get_table(query['from'])
    schema = table.schema

    capped_limit = mo_math.min(MAX_LIMIT, coalesce(query.limit, DEFAULT_LIMIT))
    op = QueryOp(frum=table, format=query.format, limit=capped_limit)

    # SELECT: EXPLICIT SELECT WINS, AGGREGATES GET THE DEFAULT, OTHERWISE "."
    if query.select or isinstance(query.select, (Mapping, list)):
        op.select = _normalize_selects(query.select, query.frum, schema=schema)
    elif query.edges or query.groupby:
        op.select = DEFAULT_SELECT
    else:
        op.select = _normalize_selects(".", query.frum)

    # EDGES AND GROUPBY ARE MUTUALLY EXCLUSIVE
    if query.groupby and query.edges:
        Log.error("You can not use both the `groupby` and `edges` clauses in the same query!")
    elif query.edges:
        op.edges = _normalize_edges(query.edges, limit=op.limit, schema=schema)
        op.groupby = Null
    elif query.groupby:
        op.edges = Null
        op.groupby = _normalize_groupby(query.groupby, limit=op.limit, schema=schema)
    else:
        op.edges = Null
        op.groupby = Null

    op.where = _normalize_where(query.where, schema=schema)
    op.window = [_normalize_window(w) for w in listwrap(query.window)]
    op.having = None
    op.sort = _normalize_sort(query.sort)

    limit_is_valid = mo_math.is_integer(op.limit) and not op.limit < 0
    if not limit_is_valid:
        Log.error("Expecting limit >= 0")

    op.isLean = query.isLean
    return op
def __init__(self, edge, query, limit):
    """
    Prepare a decoder that buckets on the value of a scripted expression.

    :param edge: the edge being decoded; its domain receives the tempered limit
    :param query: the query; supplies the fallback limit and the sort clauses
    :param limit: handed straight to `AggsDecoder.__init__`
    """
    AggsDecoder.__init__(self, edge, query, limit)
    self.domain = edge.domain

    # DOMAIN LIMIT FIRST, THEN QUERY LIMIT, THEN 10; NEVER MORE THAN MAX_LIMIT
    uncapped = coalesce(self.domain.limit, query.limit, 10)
    self.domain.limit = mo_math.min(uncapped, MAX_LIMIT)

    self.parts = []
    self.key2index = {}
    self.computed_domain = False

    # COMPILE THE EDGE VALUE ONCE; EVERYTHING BELOW IS DERIVED FROM THE SCRIPT
    simplified = Painless[self.edge.value].partial_eval()
    self.script = simplified.to_es_script(self.schema)
    self.pull = pull_functions[self.script.data_type]
    self.missing = self.script.miss.partial_eval()
    self.exists = NotOp(self.missing).partial_eval()

    # WHEN SORT VALUE AND EDGE VALUE MATCHES, WE SORT BY TERM
    same_value_sorts = [s for s in query.sort if s.value == edge.value]
    if not same_value_sorts:
        self.es_order = None
    else:
        first = same_value_sorts[0]
        self.es_order = {"_term": {1: "asc", -1: "desc"}[first.sort]}
def __init__(self, edge, query, limit):
    """
    Prepare a decoder that expands an object-valued edge into its leaf columns.

    :param edge: the edge being decoded; its domain is REPLACED with one that
                 lists the leaf es columns as dimension fields
    :param query: the query; supplies the schema (via `query.frum`) and the
                  fallback limit
    :param limit: handed straight to `AggsDecoder.__init__`
    """
    AggsDecoder.__init__(self, edge, query, limit)

    # PICK THE PREFIX, AND HOW LEAF NAMES ARE MADE RELATIVE TO IT
    if is_op(edge.value, LeavesOp):
        prefix = edge.value.term.var

        def to_name(k):
            return literal_field(relative_field(k, prefix))
    else:
        prefix = edge.value.var

        def to_name(k):
            return relative_field(k, prefix)

    # ONE (output name, es column) PAIR PER LEAF UNDER THE PREFIX
    pairs = [
        (to_name(untype_path(c.name)), c.es_column)
        for c in query.frum.schema.leaves(prefix)
    ]
    self.put, self.fields = transpose(*pairs)

    new_domain = wrap({"dimension": {"fields": self.fields}})
    self.domain = new_domain
    self.edge.domain = new_domain

    uncapped = coalesce(self.domain.limit, query.limit, 10)
    self.domain.limit = mo_math.min(uncapped, MAX_LIMIT)
    self.parts = []
    self.key2index = {}
    self.computed_domain = False
def __init__(self, edge, query, limit):
    """
    Prepare a decoder that expands an object-valued edge into its leaf columns.

    :param edge: the edge being decoded; its domain is REPLACED with one that
                 lists the leaf es columns as dimension fields
    :param query: the query; supplies the schema (via `query.frum`) and the
                  fallback limit
    :param limit: handed straight to `AggsDecoder.__init__`
    """
    AggsDecoder.__init__(self, edge, query, limit)

    # PICK THE PREFIX, AND HOW LEAF NAMES ARE MADE RELATIVE TO IT
    if is_op(edge.value, LeavesOp):
        prefix = edge.value.term.var

        def relative_name(k):
            return literal_field(relative_field(k, prefix))
    else:
        prefix = edge.value.var

        def relative_name(k):
            return relative_field(k, prefix)

    # ONE (output name, es column) PAIR PER LEAF UNDER THE PREFIX
    name_and_column = [
        (relative_name(untype_path(c.name)), c.es_column)
        for c in query.frum.schema.leaves(prefix)
    ]
    self.put, self.fields = transpose(*name_and_column)

    leaf_domain = wrap({"dimension": {"fields": self.fields}})
    self.domain = leaf_domain
    self.edge.domain = leaf_domain

    requested = coalesce(self.domain.limit, query.limit, 10)
    self.domain.limit = mo_math.min(requested, MAX_LIMIT)
    self.parts = []
    self.key2index = {}
    self.computed_domain = False
on_clause = SQL_AND.join( quote_column(edge_alias, k) + SQL_EQ + sql for k, (t, sql) in zip(domain_names, edge_values)) null_on_clause = None elif query_edge.domain.type == "range": domain_name = "d" + text(edge_index) + "c0" domain_names = [ domain_name ] # ONLY EVER SEEN ONE DOMAIN VALUE, DOMAIN TUPLES CERTAINLY EXIST d = query_edge.domain if d.max == None or d.min == None or d.min == d.max: Log.error("Invalid range: {{range|json}}", range=d) if len(edge_names) == 1: domain = self._make_range_domain(domain=d, column_name=domain_name) limit = mo_math.min(query.limit, query_edge.domain.limit) domain += (SQL_ORDERBY + sql_list(vals) + SQL_LIMIT + text(limit)) where = None join_type = SQL_LEFT_JOIN if query_edge.allowNulls else SQL_INNER_JOIN on_clause = SQL_AND.join( quote_column(edge_alias) + SQL_DOT + k + " <= " + v + SQL_AND + v + " < (" + quote_column(edge_alias) + SQL_DOT + k + SQL_PLUS + text(d.interval) + ")" for k, (t, v) in zip(domain_names, edge_values)) null_on_clause = None elif query_edge.range: query_edge.allowNulls = False domain = self._make_range_domain(domain=d, column_name=domain_name)
def update_spot_requests(self):
    """
    Reconcile the managed spot requests with the budget and required utility.

    Collects the managed spot requests whose status code is in one of the
    tracked status sets, logs the hourly exposure, then sheds utility
    (`save_money`, `remove_instances`) and/or requests more (`add_instances`).
    Finally tags every new spot request with the configured instance name
    and signals `done_making_new_spot_requests`.
    """
    spot_requests = self._get_managed_spot_requests()

    # ADD UP THE CURRENT REQUESTED INSTANCES
    all_instances = UniqueIndex("id", data=self._get_managed_instances())
    self.active = active = wrap([
        r for r in spot_requests
        if r.status.code in RUNNING_STATUS_CODES | PENDING_STATUS_CODES |
        PROBABLY_NOT_FOR_A_WHILE | MIGHT_HAPPEN
    ])

    # ITERATE OVER A COPY SO ENTRIES CAN BE REMOVED FROM active (AND self.active)
    for a in active.copy():
        # CANCELLED REQUEST WHOSE INSTANCE IS NOT AMONG THE MANAGED INSTANCES
        if a.status.code == "request-canceled-and-instance-running" and all_instances[
                a.instance_id] == None:
            active.remove(a)

    used_budget = 0  # SUM OF (BID PRICE - DISCOUNT)
    current_spending = 0  # SUM OF (CURRENT PRICE, OR BID PRICE IF UNKNOWN, - DISCOUNT)
    for a in active:
        about = self.price_lookup[a.launch_specification.instance_type,
                                  a.launch_specification.placement]
        discount = coalesce(about.type.discount, 0)
        Log.note(
            "Active Spot Request {{id}}: {{type}} {{instance_id}} in {{zone}} @ {{price|round(decimal=4)}}",
            id=a.id,
            type=a.launch_specification.instance_type,
            zone=a.launch_specification.placement,
            instance_id=a.instance_id,
            price=a.price - discount)
        used_budget += a.price - discount
        current_spending += coalesce(about.current_price, a.price) - discount

    Log.note(
        "Total Exposure: ${{budget|round(decimal=4)}}/hour (current price: ${{current|round(decimal=4)}}/hour)",
        budget=used_budget,
        current=current_spending)

    remaining_budget = self.settings.budget - used_budget

    current_utility = coalesce(
        SUM(self.price_lookup[
            r.launch_specification.instance_type,
            r.launch_specification.placement].type.utility
            for r in active), 0)
    utility_required = self.instance_manager.required_utility(
        current_utility)
    net_new_utility = utility_required - current_utility

    Log.note(
        "have {{current_utility}} utility running; need {{need_utility}} more utility",
        current_utility=current_utility,
        need_utility=net_new_utility)

    # OVER BUDGET: save_money() RETURNS THE ADJUSTED BUDGET AND UTILITY
    if remaining_budget < 0:
        remaining_budget, net_new_utility = self.save_money(
            remaining_budget, net_new_utility)

    # TOO MUCH UTILITY: TOLERATE THE ALLOWED OVERAGE, THEN REMOVE THE REST
    if net_new_utility < 0:
        if self.settings.allowed_overage:
            net_new_utility = mo_math.min(
                net_new_utility +
                self.settings.allowed_overage * utility_required, 0)
        net_new_utility = self.remove_instances(net_new_utility)

    # TOO LITTLE UTILITY: REQUEST MORE, BUT NO MORE THAN max_new_utility AT ONCE
    if net_new_utility > 0:
        net_new_utility = mo_math.min(net_new_utility,
                                      self.settings.max_new_utility)
        net_new_utility, remaining_budget = self.add_instances(
            net_new_utility, remaining_budget)

    # STILL SHORT AFTER add_instances()
    if net_new_utility > 0:
        Log.alert(
            "Can not fund {{num|round(places=2)}} more utility (all utility costs more than ${{expected|round(decimal=2)}}/hour). Remaining budget is ${{budget|round(decimal=2)}} ",
            num=net_new_utility,
            expected=self.settings.max_utility_price,
            budget=remaining_budget)

    # Give EC2 a chance to notice the new requests before tagging them.
    Till(seconds=3).wait()
    with self.net_new_locker:
        for req in self.net_new_spot_requests:
            req.add_tag("Name", self.settings.ec2.instance.name)

    Log.note("All requests for new utility have been made")
    self.done_making_new_spot_requests.go()
def add_instances(self, net_new_utility, remaining_budget):
    """
    Walk the price list and place spot requests until utility or budget runs out.

    :param net_new_utility: utility still wanted
    :param remaining_budget: budget still available (log messages quote it in $/hour)
    :return: (net_new_utility, remaining_budget) reduced by whatever was requested
    """
    prices = self.pricing()

    for p in prices:
        if net_new_utility <= 0 or remaining_budget <= 0:
            break

        if p.current_price == None:
            Log.note("{{type}} has no current price",
                     type=p.type.instance_type)
            continue

        # SKIP WHEN THE WHOLE TYPE, OR THIS ZONE FOR THE TYPE, IS BLACKLISTED
        if self.settings.utility[p.type.instance_type].blacklist or \
                p.availability_zone in listwrap(self.settings.utility[p.type.instance_type].blacklist_zones):
            Log.note("{{type}} in {{zone}} skipped due to blacklist",
                     type=p.type.instance_type,
                     zone=p.availability_zone)
            continue

        # DO NOT BID HIGHER THAN WHAT WE ARE WILLING TO PAY
        max_acceptable_price = p.type.utility * self.settings.max_utility_price + p.type.discount
        max_bid = mo_math.min(p.higher_price, max_acceptable_price,
                              remaining_budget)
        min_bid = p.price_80

        if min_bid > max_acceptable_price:
            Log.note(
                "Price of ${{price}}/hour on {{type}}: Over remaining acceptable price of ${{remaining}}/hour",
                type=p.type.instance_type,
                price=min_bid,
                remaining=max_acceptable_price)
            continue
        elif min_bid > remaining_budget:
            Log.note(
                "Did not bid ${{bid}}/hour on {{type}}: Over budget of ${{remaining_budget}}/hour",
                type=p.type.instance_type,
                bid=min_bid,
                remaining_budget=remaining_budget)
            continue
        elif min_bid > max_bid:
            Log.error("not expected")

        # HOW MANY MACHINES OF THIS TYPE WOULD COVER THE MISSING UTILITY
        naive_number_needed = int(
            mo_math.round(float(net_new_utility) / float(p.type.utility),
                          decimal=0))
        limit_total = None
        if self.settings.max_percent_per_type < 1:
            # COUNT OF THIS TYPE IN THIS ZONE
            current_count = sum(
                1 for a in self.active
                if a.launch_specification.instance_type == p.type.instance_type
                and a.launch_specification.placement == p.availability_zone)
            # COUNT OF EVERYTHING IN THIS ZONE
            all_count = sum(
                1 for a in self.active
                if a.launch_specification.placement == p.availability_zone)
            all_count = max(all_count, naive_number_needed)
            limit_total = int(
                mo_math.floor(
                    (all_count * self.settings.max_percent_per_type -
                     current_count) /
                    (1 - self.settings.max_percent_per_type)))

        # limit_total STAYS None WHEN THERE IS NO PER-TYPE PERCENTAGE CAP
        num = mo_math.min(naive_number_needed, limit_total,
                          self.settings.max_requests_per_type)
        if num < 0:
            Log.note(
                "{{type}} is over {{limit|percent}} of instances, no more requested",
                limit=self.settings.max_percent_per_type,
                type=p.type.instance_type)
            continue
        elif num == 1:
            # SINGLE BID: AT LEAST 10% OVER CURRENT PRICE, NEVER OVER THE ACCEPTABLE PRICE
            min_bid = mo_math.min(
                mo_math.max(p.current_price * 1.1, min_bid),
                max_acceptable_price)
            price_interval = 0
        else:
            # SPREAD THE BIDS FROM min_bid UPWARD IN EQUAL STEPS
            price_interval = mo_math.min(min_bid / 10,
                                         (max_bid - min_bid) / (num - 1))

        for i in range(num):
            bid_per_machine = min_bid + (i * price_interval)
            if bid_per_machine < p.current_price:
                Log.note(
                    "Did not bid ${{bid}}/hour on {{type}}: Under current price of ${{current_price}}/hour",
                    type=p.type.instance_type,
                    bid=bid_per_machine - p.type.discount,
                    current_price=p.current_price)
                continue
            if bid_per_machine - p.type.discount > remaining_budget:
                Log.note(
                    "Did not bid ${{bid}}/hour on {{type}}: Over remaining budget of ${{remaining}}/hour",
                    type=p.type.instance_type,
                    bid=bid_per_machine - p.type.discount,
                    remaining=remaining_budget)
                continue
            # DO NOT RETRY A TYPE THAT RECENTLY REPORTED "No capacity"
            last_no_capacity_message = self.no_capacity.get(
                p.type.instance_type, Null)
            if last_no_capacity_message > Date.now(
            ) - CAPACITY_NOT_AVAILABLE_RETRY:
                Log.note(
                    "Did not bid on {{type}}: \"No capacity\" last seen at {{last_time|datetime}}",
                    type=p.type.instance_type,
                    last_time=last_no_capacity_message)
                continue

            try:
                if self.settings.ec2.request.count == None or self.settings.ec2.request.count != 1:
                    Log.error(
                        "Spot Manager can only request machine one-at-a-time"
                    )

                new_requests = self._request_spot_instances(
                    price=bid_per_machine,
                    availability_zone_group=p.availability_zone,
                    instance_type=p.type.instance_type,
                    kwargs=copy(self.settings.ec2.request))
                Log.note(
                    "Request {{num}} instance {{type}} in {{zone}} with utility {{utility}} at ${{price}}/hour",
                    num=len(new_requests),
                    type=p.type.instance_type,
                    zone=p.availability_zone,
                    utility=p.type.utility,
                    price=bid_per_machine)
                # ACCOUNT FOR WHAT WAS JUST REQUESTED
                net_new_utility -= p.type.utility * len(new_requests)
                remaining_budget -= (bid_per_machine -
                                     p.type.discount) * len(new_requests)
                with self.net_new_locker:
                    for ii in new_requests:
                        self.net_new_spot_requests.add(ii)
            except Exception as e:
                Log.warning(
                    "Request instance {{type}} failed because {{reason}}",
                    type=p.type.instance_type,
                    reason=e.message,
                    cause=e)
                # THE ACCOUNT-WIDE SPOT LIMIT IS HIT; FURTHER REQUESTS ARE POINTLESS
                if "Max spot instance count exceeded" in e.message:
                    Log.note("No further spot requests will be attempted.")
                    return net_new_utility, remaining_budget

    return net_new_utility, remaining_budget
def process_one(self, start, end, branch, please_stop):
    """
    Fetch the pushes for one (start, end) window of a branch and store them.

    :param start: start of the window; `start.format()` is sent as from_date
    :param end: end of the window; `end.format()` is sent as to_date
    :param branch: branch name, recorded on every row
    :param please_stop: truthy when the loop should stop early
    :return: None. Whatever rows were built are sent to `self.destination`,
             even when the loop is interrupted or fails part way.
    """
    # ASSUME PREVIOUS WORK IS DONE
    # UPDATE THE DATABASE STATE
    # NOTE(review): the state is recorded BEFORE the pushes are fetched - confirm intended
    self.done.min = mo_math.min(end, self.done.min)
    self.done.max = mo_math.max(start, self.done.max)
    self.set_state()

    try:
        pushes = make_push_objects(
            from_date=start.format(), to_date=end.format(), branch=branch
        )
    except MissingDataError:
        # NOTHING TO RECORD FOR THIS WINDOW
        return
    except Exception as e:
        raise Log.error("not expected", cause=e)

    Log.note(
        "Found {{num}} pushes on {{branch}} in ({{start}}, {{end}})",
        num=len(pushes),
        start=start,
        end=end,
        branch=branch,
    )

    # THE SAME TOKEN IS USED TO FILTER AND TO SPLIT, SO A LABEL THAT ONLY
    # CONTAINS "shadow-scheduler" (NO TRAILING DASH) IS SKIPPED INSTEAD OF
    # RAISING AND DISCARDING EVERY SCHEDULER OF THE PUSH
    scheduler_marker = "shadow-scheduler-"

    data = []
    try:
        for push in pushes:
            if please_stop:
                break

            with Timer("get tasks for push {{push}}", {"push": push.id}):
                try:
                    schedulers = [
                        label.split(scheduler_marker)[1]
                        for label in push.scheduled_task_labels
                        if scheduler_marker in label
                    ]
                except Exception as e:
                    Log.warning("could not get schedulers", cause=e)
                    schedulers = []

                scheduler = []
                for s in schedulers:
                    try:
                        scheduler.append({
                            "name": s,
                            "tasks": jx.sort(push.get_shadow_scheduler_tasks(s)),
                        })
                    except Exception as e:
                        # STILL BEST-EFFORT, BUT NO LONGER SILENT
                        Log.warning(
                            "could not get tasks for scheduler {{name}} on {{push}}",
                            name=s,
                            push=push.id,
                            cause=e,
                        )

                try:
                    regressions = push.get_regressions("label").keys()
                except Exception as e:
                    regressions = []
                    Log.warning(
                        "could not get regressions for {{push}}",
                        push=push.id,
                        cause=e,
                    )

                # RECORD THE PUSH
                data.append({
                    "push": {
                        "id": push.id,
                        "date": push.date,
                        "changesets": push.revs,
                        "backedoutby": push.backedoutby,
                    },
                    "schedulers": scheduler,
                    "regressions": [{"label": name} for name in jx.sort(regressions)],
                    "branch": branch,
                    "etl": {
                        "revision": git.get_revision(),
                        "timestamp": Date.now(),
                    },
                })
    finally:
        # ADD WHATEVER WE HAVE
        with Timer("adding {{num}} records to bigquery", {"num": len(data)}):
            self.destination.extend(data)