def download_perfherder(desc, repo, id, dummy, framework):
    sig_result = http.get_json(
        "https://treeherder.mozilla.org/api/project/" + repo +
        "/performance/signatures/?format=json&framework=" + str(framework) + "&id=" + str(id)
    )

    signature = first(sig_result.keys())
    data_result = http.get_json(
        "https://treeherder.mozilla.org/api/project/" + repo +
        "/performance/data/?signatures=" + signature
    )

    Log.note(
        "{{result|json}}",
        result={
            "name": desc,
            "data": jx.run({
                "from": ListContainer("data", data_result[signature]),
                "sort": "push_timestamp",
                "select": "value"
            }).data
        },
    )
def query(self, _query):
    try:
        query = QueryOp.wrap(_query, table=self)
        for n in self.namespaces:
            query = n.convert(query)

        for s in listwrap(query.select):
            if not aggregates.get(s.aggregate):
                Log.error(
                    "ES can not aggregate {{name}} because {{aggregate|quote}} is not a recognized aggregate",
                    name=s.name,
                    aggregate=s.aggregate
                )

        frum = query["from"]
        if isinstance(frum, QueryOp):
            result = self.query(frum)
            q2 = query.copy()
            q2.frum = result
            return jx.run(q2)

        if is_deepop(self._es, query):
            return es_deepop(self._es, query)
        if is_aggsop(self._es, query):
            return es_aggsop(self._es, frum, query)
        if is_setop(self._es, query):
            return es_setop(self._es, query)
        Log.error("Can not handle")
    except Exception as e:
        e = Except.wrap(e)
        if "Data too large, data for" in e:
            http.post(self._es.cluster.path + "/_cache/clear")
            Log.error("Problem (Tried to clear Elasticsearch cache)", e)
        Log.error("problem", e)
def get_raw_json(path):
    active_data_timer = Timer("total duration")
    body = flask.request.get_data()
    try:
        with active_data_timer:
            args = scrub_args(flask.request.args)
            limit = args.limit if args.limit else 10
            args.limit = None

            frum = find_container(path, after=None)
            result = jx.run(
                {
                    "from": path,
                    "where": {"eq": args},
                    "limit": limit,
                    "format": "list"
                },
                frum
            )

            if isinstance(result, Container):  # TODO: REMOVE THIS CHECK, jx SHOULD ALWAYS RETURN Containers
                result = result.format("list")

        result.meta.active_data_response_time = active_data_timer.duration

        response_data = value2json(result.data, pretty=True).encode('utf8')
        Log.note("Response is {{num}} bytes", num=len(response_data))

        return Response(
            response_data,
            status=200
        )
    except Exception as e:
        e = Except.wrap(e)
        return send_error(active_data_timer, body, e)
def query(self, _query):
    try:
        query = QueryOp.wrap(_query, container=self, namespace=self.namespace)

        for s in listwrap(query.select):
            if s.aggregate != None and not aggregates.get(s.aggregate):
                Log.error(
                    "ES can not aggregate {{name}} because {{aggregate|quote}} is not a recognized aggregate",
                    name=s.name,
                    aggregate=s.aggregate
                )

        frum = query["from"]
        if isinstance(frum, QueryOp):
            result = self.query(frum)
            q2 = query.copy()
            q2.frum = result
            return jx.run(q2)

        if is_deepop(self.es, query):
            return es_deepop(self.es, query)
        if is_aggsop(self.es, query):
            return es_aggsop(self.es, frum, query)
        if is_setop(self.es, query):
            return es_setop(self.es, query)
        Log.error("Can not handle")
    except Exception as e:
        e = Except.wrap(e)
        if "Data too large, data for" in e:
            http.post(self.es.cluster.url / "_cache/clear")
            Log.error("Problem (Tried to clear Elasticsearch cache)", e)
        Log.error("problem", e)
def test_in_w_multi_value(self):
    data = [
        {"a": "e"},
        {"a": "c"},
        {"a": ["e"]},
        {"a": ["c"]},
        {"a": ["e", "c"]},
        {}
    ]

    result = jx.run({
        "from": ListContainer(".", data),
        "select": [
            "a",
            {"name": "is_e", "value": {"when": {"in": [{"literal": "e"}, "a"]}, "then": 1, "else": 0}},
            {"name": "not_e", "value": {"when": {"not": {"in": [{"literal": "e"}, "a"]}}, "then": 1, "else": 0}},
            {"name": "is_c", "value": {"when": {"in": [{"literal": "c"}, "a"]}, "then": 1, "else": 0}}
        ]
    })

    expected = {"data": [
        {"a": "e", "is_e": 1, "not_e": 0, "is_c": 0},
        {"a": "c", "is_e": 0, "not_e": 1, "is_c": 1},
        {"a": "e", "is_e": 1, "not_e": 0, "is_c": 0},
        {"a": "c", "is_e": 0, "not_e": 1, "is_c": 1},
        {"a": ["e", "c"], "is_e": 1, "not_e": 0, "is_c": 1},
        {"a": NULL, "is_e": 0, "not_e": 1, "is_c": 0}
    ]}

    self.assertAlmostEqual(result, expected)
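# For reference, the multi-value "in" semantics exercised by the test above,
# restated as plain Python (a hypothetical helper, not part of the jx library):
# a field may be missing, scalar, or multi-valued, and "in" matches any element.
def contains(value, literal):
    if value is None:
        return False
    if isinstance(value, list):
        return literal in value
    return value == literal

assert contains("e", "e")
assert contains(["e", "c"], "c")
assert not contains("c", "e")
assert not contains(None, "e")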
def query(self, query): # NOT EXPECTED TO BE RUN Log.error("not") with self.locker: self._update_meta() if not self._schema: self._schema = Schema(".", [ c for cs in self.data["meta.columns"].values() for c in cs ]) snapshot = self._all_columns() from jx_python.containers.list_usingPythonList import ListContainer query.frum = ListContainer("meta.columns", snapshot, self._schema) return jx.run(query)
def get(*args, **kwargs): body = kwargs.get("data") if not body: return wrap({"status_code": 400}) text = utf82unicode(body) data = json2value(text) result = jx.run(data) output_bytes = unicode2utf8(value2json(result)) return wrap({ "status_code": 200, "all_content": output_bytes, "content": output_bytes })
def get(*args, **kwargs): body = kwargs.get("data") if not body: return wrap({"status_code": 400}) text = body.decode('utf8') data = json2value(text) result = jx.run(data) output_bytes = value2json(result).encode('utf8') return wrap({ "status_code": 200, "all_content": output_bytes, "content": output_bytes })
def _get_queue(self, row):
    row = wrap(row)
    if row.json:
        row.value, row.json = json2value(row.json), None
    timestamp = Date(self.rollover_field(row.value))
    if timestamp == None:
        return Null
    elif timestamp < Date.today() - self.rollover_max:
        return DATA_TOO_OLD

    rounded_timestamp = timestamp.floor(self.rollover_interval)
    with self.locker:
        queue = self.known_queues.get(rounded_timestamp.unix)
    if queue == None:
        candidates = jx.run({
            "from": ListContainer(".", self.cluster.get_aliases()),
            "where": {"regex": {"index": self.settings.index + "\d\d\d\d\d\d\d\d_\d\d\d\d\d\d"}},
            "sort": "index"
        })
        best = None
        for c in candidates:
            c = wrap(c)
            c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
            if timestamp > c.date:
                best = c
        if not best or rounded_timestamp > best.date:
            if rounded_timestamp < wrap(candidates[-1]).date:
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
            else:
                try:
                    es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                    es.add_alias(self.settings.index)
                except Exception as e:
                    e = Except.wrap(e)
                    if "IndexAlreadyExistsException" not in e:
                        Log.error("Problem creating index", cause=e)
                    return self._get_queue(row)  # TRY AGAIN
        else:
            es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

        with suppress_exception:
            es.set_refresh_interval(seconds=60 * 5, timeout=5)

        self._delete_old_indexes(candidates)
        threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
        with self.locker:
            queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
    return queue
def test_deep_value_selector(self):
    data = [{'bug_id': 35, 'blocked': [686525, 123456]}]

    result = jx.run({
        "from": {
            "from": data,
            "path": "blocked"
        },
        "where": {"exists": {"field": "blocked"}},
        "select": [
            "blocked",
            "bug_id"
        ]
    }).data

    assert result[0].blocked == 686525
    assert result[1].blocked == 123456
def query(self, _query):
    try:
        query = QueryOp.wrap(_query, container=self, namespace=self.namespace)
        self.stats.record(query)

        for s in listwrap(query.select):
            if s.aggregate != None and not aggregates.get(s.aggregate):
                Log.error(
                    "ES can not aggregate {{name}} because {{aggregate|quote}} is"
                    " not a recognized aggregate",
                    name=s.name,
                    aggregate=s.aggregate,
                )

        frum = query["from"]
        if is_op(frum, QueryOp):
            result = self.query(frum)
            q2 = query.copy()
            q2.frum = result
            return jx.run(q2)

        if is_bulk_agg(self.es, query):
            return es_bulkaggsop(self, frum, query)
        if is_bulk_set(self.es, query):
            return es_bulksetop(self, frum, query)

        query.limit = temper_limit(query.limit, query)

        if is_aggsop(self.es, query):
            return es_aggsop(self.es, frum, query)
        if is_setop(self.es, query):
            return es_setop(self.es, query)
        Log.error("Can not handle")
    except Exception as cause:
        cause = Except.wrap(cause)
        if "Data too large, data for" in cause:
            http.post(self.es.cluster.url / "_cache/clear")
            Log.error("Problem (Tried to clear Elasticsearch cache)", cause)
        Log.error("problem", cause=cause)
def query(self, query):
    query.frum = self.__iter__()
    output = jx.run(query)
    return output
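# Minimal sketch (not from the source) of the jx.run call pattern shared by these
# snippets: wrap rows in an in-memory ListContainer, then run a query expressed as
# a plain dict.  The rows and field names below are hypothetical.
from jx_python import jx
from jx_python.containers.list_usingPythonList import ListContainer

rows = ListContainer(".", [
    {"name": "a", "value": 1},
    {"name": "b", "value": 2},
    {"name": "c", "value": 3},
])

result = jx.run({
    "from": rows,
    "where": {"gt": {"value": 1}},
    "select": "name",
    "sort": "name"
})
# result.data is expected to hold the selected names, as in the snippets above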
def pricing(self):
    with self.price_locker:
        if self.prices:
            return self.prices

        prices = self._get_spot_prices_from_aws()
        now = Date.now()

        with Timer("processing pricing data"):
            hourly_pricing = jx.run({
                "from": {
                    # AWS PRICING ONLY SENDS timestamp OF CHANGES, MATCH WITH NEXT INSTANCE
                    "from": prices,
                    "window": [
                        {
                            "name": "expire",
                            "value": {"coalesce": [{"rows": {"timestamp": 1}}, {"date": "eod"}]},
                            "edges": ["availability_zone", "instance_type"],
                            "sort": "timestamp"
                        },
                        {
                            # MAKE THIS PRICE EFFECTIVE INTO THE PAST, THIS HELPS SPREAD PRICE SPIKES OVER TIME
                            "name": "effective",
                            "value": {"sub": {"timestamp": self.settings.uptime.duration.seconds}}
                        }
                    ]
                },
                "edges": [
                    "availability_zone",
                    "instance_type",
                    {
                        "name": "time",
                        "range": {"min": "effective", "max": "expire", "mode": "inclusive"},
                        "allowNulls": False,
                        "domain": {
                            "type": "time",
                            "min": now.floor(HOUR) - self.settings.uptime.history,
                            "max": Date.now().floor(HOUR) + HOUR,
                            "interval": "hour"
                        }
                    }
                ],
                "select": [
                    {"value": "price", "aggregate": "max"},
                    {"aggregate": "count"}
                ],
                "where": {"gt": {"expire": now.floor(HOUR) - self.settings.uptime.history}},
                "window": [
                    {
                        "name": "current_price",
                        "value": "rows.last.price",
                        "edges": ["availability_zone", "instance_type"],
                        "sort": "time"
                    }
                ]
            }).data

            bid80 = jx.run({
                "from": ListContainer(name=None, data=hourly_pricing),
                "edges": [
                    {"value": "availability_zone", "allowNulls": False},
                    {
                        "name": "type",
                        "value": "instance_type",
                        "allowNulls": False,
                        "domain": {"type": "set", "key": "instance_type", "partitions": self.settings.utility}
                    }
                ],
                "select": [
                    {
                        "name": "price_80",
                        "value": "price",
                        "aggregate": "percentile",
                        "percentile": self.settings.uptime.bid_percentile
                    },
                    {"name": "max_price", "value": "price", "aggregate": "max"},
                    {"aggregate": "count"},
                    {"value": "current_price", "aggregate": "one"},
                    {"name": "all_price", "value": "price", "aggregate": "list"}
                ],
                "window": [
                    {"name": "estimated_value", "value": {"div": ["type.utility", "price_80"]}},
                    {
                        "name": "higher_price",
                        "value": lambda row, rownum, rows: find_higher(row.all_price, row.price_80)
                    }
                    # TODO: SUPPORT {"from":"all_price", "where":{"gt":[".", "price_80"]}, "select":{"aggregate":"min"}}
                ]
            })

            output = jx.sort(bid80.values(), {"value": "estimated_value", "sort": -1})

            self.prices = wrap(output)
            self.price_lookup = UniqueIndex(("type.instance_type", "availability_zone"), data=self.prices)

        return self.prices
def _get_spot_prices_from_aws(self):
    with Timer("Read no capacity file"):
        try:
            # FILE IS LIST OF {instance_type, last_failure} OBJECTS
            content = self.no_capacity_file.read()
            self.no_capacity = dict(
                (r.instance_type, r.last_failure)
                for r in convert.json2value(content, flexible=False, leaves=False)
            )
        except Exception as e:
            self.no_capacity = {}

    with Timer("Read pricing file"):
        try:
            content = File(self.settings.price_file).read()
            cache = convert.json2value(content, flexible=False, leaves=False)
        except Exception as e:
            cache = FlatList()

    cache = ListContainer(name=None, data=cache)
    most_recents = jx.run({
        "from": cache,
        "edges": ["instance_type", "availability_zone"],
        "select": {"value": "timestamp", "aggregate": "max"}
    })

    zones = self._get_valid_availability_zones()
    prices = set(cache)
    with Timer("Get pricing from AWS"):
        for instance_type in self.settings.utility.keys():
            for zone in zones:
                if cache:
                    most_recent = most_recents[{
                        "instance_type": instance_type,
                        "availability_zone": zone
                    }].timestamp
                    start_at = MAX([Date(most_recent), Date.today() - WEEK])
                else:
                    start_at = Date.today() - WEEK

                if DEBUG_PRICING:
                    Log.note(
                        "get pricing for {{instance_type}} starting at {{start_at}}",
                        instance_type=instance_type,
                        start_at=start_at
                    )

                next_token = None
                while True:
                    resultset = self.ec2_conn.get_spot_price_history(
                        product_description=coalesce(self.settings.product, "Linux/UNIX (Amazon VPC)"),
                        instance_type=instance_type,
                        availability_zone=zone,
                        start_time=start_at.format(ISO8601),
                        next_token=next_token
                    )
                    next_token = resultset.next_token

                    for p in resultset:
                        prices.add(wrap({
                            "availability_zone": p.availability_zone,
                            "instance_type": p.instance_type,
                            "price": p.price,
                            "product_description": p.product_description,
                            "region": p.region.name,
                            "timestamp": Date(p.timestamp).unix
                        }))

                    if not next_token:
                        break

    with Timer("Save prices to file"):
        new_prices = jx.filter(prices, {"gte": {"timestamp": {"date": "today-2day"}}})

        def stream():  # IT'S A LOT OF PRICES, STREAM THEM TO FILE
            prefix = "[\n"
            for p in new_prices:
                yield prefix
                yield convert.value2json(p)
                prefix = ",\n"
            yield "]"

        File(self.settings.price_file).write(stream())

    return ListContainer(name="prices", data=prices)
def jx_query(path):
    request_body = None  # SO THE ERROR HANDLER CAN REFERENCE IT EVEN IF READING THE BODY FAILS
    try:
        with Timer("total duration", verbose=DEBUG) as query_timer:
            preamble_timer = Timer("preamble", silent=True)
            with preamble_timer:
                if flask.request.headers.get("content-length", "") in ["", "0"]:
                    # ASSUME A BROWSER HIT THIS POINT, SEND text/html RESPONSE BACK
                    return Response(
                        BLANK,
                        status=400,
                        headers={"Content-Type": "text/html"}
                    )
                elif int(flask.request.headers["content-length"]) > QUERY_SIZE_LIMIT:
                    Log.error(QUERY_TOO_LARGE)

                request_body = flask.request.get_data().strip()
                text = request_body.decode('utf8')
                data = json2value(text)
                record_request(flask.request, data, None, None)
                if data.meta.testing:
                    test_mode_wait(data, MAIN_THREAD.please_stop)

            find_table_timer = Timer("find container", verbose=DEBUG)
            with find_table_timer:
                frum = find_container(data['from'], after=None)

            translate_timer = Timer("translate", verbose=DEBUG)
            with translate_timer:
                result = jx.run(data, container=frum)

                if isinstance(result, Container):  # TODO: REMOVE THIS CHECK, jx SHOULD ALWAYS RETURN Containers
                    result = result.format(data.format)

            save_timer = Timer("save", verbose=DEBUG)
            with save_timer:
                if data.meta.save:
                    try:
                        result.meta.saved_as = save_query.query_finder.save(data)
                    except Exception as e:
                        Log.warning("Unexpected save problem", cause=e)

            result.meta.timing.find_table = mo_math.round(find_table_timer.duration.seconds, digits=4)
            result.meta.timing.preamble = mo_math.round(preamble_timer.duration.seconds, digits=4)
            result.meta.timing.translate = mo_math.round(translate_timer.duration.seconds, digits=4)
            result.meta.timing.save = mo_math.round(save_timer.duration.seconds, digits=4)
            result.meta.timing.total = "{{TOTAL_TIME}}"  # TIMING PLACEHOLDER

            with Timer("jsonification", verbose=DEBUG) as json_timer:
                response_data = value2json(result).encode('utf8')

        with Timer("post timer", verbose=DEBUG):
            # IMPORTANT: WE WANT THE TIME OF THE JSON SERIALIZATION, AND HAVE IT IN THE JSON ITSELF.
            # WE CHEAT BY DOING A (HOPEFULLY FAST) STRING REPLACEMENT AT THE VERY END
            timing_replacement = (
                b'"total":' + binary_type(mo_math.round(query_timer.duration.seconds, digits=4)) +
                b', "jsonification":' + binary_type(mo_math.round(json_timer.duration.seconds, digits=4))
            )
            response_data = response_data.replace(b'"total":"{{TOTAL_TIME}}"', timing_replacement)

        Log.note("Response is {{num}} bytes in {{duration}}", num=len(response_data), duration=query_timer.duration)
        return Response(
            response_data,
            status=200,
            headers={"Content-Type": result.meta.content_type}
        )
    except Exception as e:
        e = Except.wrap(e)
        return send_error(query_timer, request_body, e)
def sql_query(path):
    with RegisterThread():
        query_timer = Timer("total duration")
        request_body = None
        try:
            with query_timer:
                preamble_timer = Timer("preamble", silent=True)
                with preamble_timer:
                    if flask.request.headers.get("content-length", "") in ["", "0"]:
                        # ASSUME A BROWSER HIT THIS POINT, SEND text/html RESPONSE BACK
                        return Response(BLANK, status=400, headers={"Content-Type": "text/html"})
                    elif int(flask.request.headers["content-length"]) > QUERY_SIZE_LIMIT:
                        Log.error("Query is too large")

                    request_body = flask.request.get_data().strip()
                    text = utf82unicode(request_body)
                    data = json2value(text)
                    record_request(flask.request, data, None, None)

                translate_timer = Timer("translate", silent=True)
                with translate_timer:
                    if not data.sql:
                        Log.error("Expecting a `sql` parameter")
                    jx_query = parse_sql(data.sql)
                    frum = find_container(jx_query['from'])
                    if data.meta.testing:
                        test_mode_wait(jx_query)
                    result = jx.run(jx_query, container=frum)
                    if isinstance(result, Container):  # TODO: REMOVE THIS CHECK, jx SHOULD ALWAYS RETURN Containers
                        result = result.format(jx_query.format)
                    result.meta.jx_query = jx_query

                save_timer = Timer("save")
                with save_timer:
                    if data.meta.save:
                        try:
                            result.meta.saved_as = save_query.query_finder.save(data)
                        except Exception as e:
                            Log.warning("Unexpected save problem", cause=e)

                result.meta.timing.preamble = mo_math.round(preamble_timer.duration.seconds, digits=4)
                result.meta.timing.translate = mo_math.round(translate_timer.duration.seconds, digits=4)
                result.meta.timing.save = mo_math.round(save_timer.duration.seconds, digits=4)
                result.meta.timing.total = "{{TOTAL_TIME}}"  # TIMING PLACEHOLDER

                with Timer("jsonification", silent=True) as json_timer:
                    response_data = unicode2utf8(value2json(result))

            with Timer("post timer", silent=True):
                # IMPORTANT: WE WANT THE TIME OF THE JSON SERIALIZATION, AND HAVE IT IN THE JSON ITSELF.
                # WE CHEAT BY DOING A (HOPEFULLY FAST) STRING REPLACEMENT AT THE VERY END
                timing_replacement = (
                    b'"total": ' + str(mo_math.round(query_timer.duration.seconds, digits=4)).encode('utf8') +
                    b', "jsonification": ' + str(mo_math.round(json_timer.duration.seconds, digits=4)).encode('utf8')
                )
                response_data = response_data.replace(b'"total":"{{TOTAL_TIME}}"', timing_replacement)

            Log.note("Response is {{num}} bytes in {{duration}}", num=len(response_data), duration=query_timer.duration)
            return Response(
                response_data,
                status=200,
                headers={"Content-Type": result.meta.content_type}
            )
        except Exception as e:
            e = Except.wrap(e)
            return send_error(query_timer, request_body, e)
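# The "{{TOTAL_TIME}}" trick used by both endpoints above, shown in isolation
# (the byte strings below are illustrative, not from the source): serialize the
# result with a placeholder for the total time, then patch the real numbers into
# the already-encoded bytes, so the cost of JSON serialization itself is counted.
payload = b'{"meta": {"timing": {"total":"{{TOTAL_TIME}}"}}}'
timing = b'"total":0.0042, "jsonification":0.0007'
patched = payload.replace(b'"total":"{{TOTAL_TIME}}"', timing)
# patched == b'{"meta": {"timing": {"total":0.0042, "jsonification":0.0007}}}'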