def test_big_result_works(self):
    """
    Ask the query service for up to 10000 failed `unittest` results from the
    last day and report how many came back.

    A response that carries a `template` is converted with `Except.new_instance`
    and handed to `Log.error` as the cause.
    """
    url = global_settings.testing.query

    # CLAUSES ARE BUILT IN THE SAME ORDER THEY APPEAR IN THE REQUEST
    not_before = {"gte": {"run.timestamp": Date.today() - DAY}}
    not_after = {"lt": {"run.timestamp": Date.today()}}
    failed_only = {"eq": {"result.ok": False}}

    query = {
        "from": "unittest",
        "where": {"and": [not_before, not_after, failed_only]},
        "format": "list",
        "limit": 10000
    }
    response = http.post_json(url, data=query)

    if response.template:
        response = Except.new_instance(response)
        Log.error("problem with call", cause=response)

    Log.note("Got {{num}} test failures", num=len(response.data))
def _delete_old_indexes(self, candidates):
    """
    Remove indexes, and the matching cached queues, that are past retention.

    An index is deleted when the time parsed from the last 15 characters of
    its name (`%Y%m%d_%H%M%S`), plus `self.rollover_interval`, is earlier than
    `Date.today() - self.rollover_max`. A failed deletion is only logged as a
    warning. Entries of `self.known_queues` are removed by the same age rule,
    using their key as a unix timestamp.

    :param candidates: iterable of objects exposing the index name as `.index`
    """
    for candidate in candidates:
        created = unicode2Date(candidate.index[-15:], "%Y%m%d_%H%M%S")
        expired = created + self.rollover_interval < Date.today() - self.rollover_max
        if not expired:
            continue
        try:
            self.cluster.delete_index(candidate.index)
        except Exception as cause:
            Log.warning("could not delete index {{index}}", index=candidate.index, cause=cause)

    # LOOP OVER A COPY OF THE ITEMS SO ENTRIES CAN BE DELETED ALONG THE WAY
    for unix_time, _ in list(self.known_queues.items()):
        expired = unix2Date(unix_time) + self.rollover_interval < Date.today() - self.rollover_max
        if expired:
            with self.locker:
                del self.known_queues[unix_time]
def _delete_old_indexes(self, candidates):
    """
    Remove indexes, and the matching cached queues, that are past retention.

    An index is deleted when the time parsed from the last 15 characters of
    its name (`%Y%m%d_%H%M%S`), plus `self.rollover_interval`, is earlier than
    `Date.today() - self.rollover_max`. A failed deletion is only logged as a
    warning. Entries of `self.known_queues` are removed by the same age rule,
    using their key as a unix timestamp.

    :param candidates: iterable of objects exposing the index name as `.index`
    """
    for candidate in candidates:
        created = unicode2Date(candidate.index[-15:], "%Y%m%d_%H%M%S")
        expired = created + self.rollover_interval < Date.today() - self.rollover_max
        if not expired:
            continue
        try:
            self.cluster.delete_index(candidate.index)
        except Exception as cause:
            Log.warning("could not delete index {{index}}", index=candidate.index, cause=cause)

    # FORGET QUEUES THAT BELONG TO EXPIRED INTERVALS
    for unix_time, _ in items(self.known_queues):
        expired = unix2Date(unix_time) + self.rollover_interval < Date.today() - self.rollover_max
        if expired:
            with self.locker:
                del self.known_queues[unix_time]
def test_two_simple(self):
    """
    Expand a template holding one filtered variable (`today|week`) and one
    plain variable (`today`), and compare against values computed from
    `Date.today()`.
    """
    now = Date.today()
    actual = replace_vars('"{{today|week}}" "{{today}}d"')

    week_start = unicode(now.floor(WEEK).unix)
    day_start = unicode(now.unix)
    expected = '"' + week_start + '" "' + day_start + 'd"'

    self.assertEqual(actual, expected)
def _get_queue(self, row):
    """
    Return the queue that accepts `row`, creating the index and queue for the
    row's time interval when none is cached yet.

    :param row: record holding either `json` (decoded into `value` here) or `value`
    :return: `Null` when the row has no timestamp, `DATA_TOO_OLD` when the
             timestamp is older than `Date.today() - self.rollover_max`,
             otherwise the threaded queue for the row's interval
    """
    row = wrap(row)
    if row.json:
        # DECODE ONCE, THEN DROP THE RAW JSON
        row.value, row.json = json2value(row.json), None
    timestamp = Date(self.rollover_field(row.value))
    if timestamp == None:
        return Null
    elif timestamp < Date.today() - self.rollover_max:
        return DATA_TOO_OLD

    # QUEUES ARE CACHED BY THE START OF THEIR INTERVAL (UNIX TIME)
    rounded_timestamp = timestamp.floor(self.rollover_interval)
    with self.locker:
        queue = self.known_queues.get(rounded_timestamp.unix)
    if queue == None:
        # ALIASES WHOSE INDEX NAME IS <settings.index><YYYYMMDD_HHMMSS>, SORTED BY NAME
        candidates = sort_using_key(
            filter(
                lambda r: re.match(
                    re.escape(self.settings.index) + r"\d\d\d\d\d\d\d\d_\d\d\d\d\d\d$",
                    r['index']
                ),
                self.cluster.get_aliases()
            ),
            key=lambda r: r['index']
        )
        # best IS THE LAST CANDIDATE (IN SORTED ORDER) DATED BEFORE timestamp
        best = None
        for c in candidates:
            c = wrap(c)
            c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
            if timestamp > c.date:
                best = c
        if not best or rounded_timestamp > best.date:
            if rounded_timestamp < wrap(candidates[-1]).date:
                # NOTE(review): best MAY STILL BE None ON THIS PATH, AND THEN
                # best.alias FAILS - CONFIRM THIS COMBINATION CAN NOT HAPPEN
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
            else:
                try:
                    es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                    es.add_alias(self.settings.index)
                except Exception as e:
                    e = Except.wrap(e)
                    # ONLY "IndexAlreadyExistsException" IS FOLLOWED BY A QUIET RETRY
                    if "IndexAlreadyExistsException" not in e:
                        Log.error("Problem creating index", cause=e)
                    return self._get_queue(row)  # TRY AGAIN
        else:
            es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

        def refresh(please_stop):
            # BEST EFFORT: A FAILURE IS ONLY NOTED IN THE LOG
            try:
                es.set_refresh_interval(seconds=60 * 10, timeout=5)
            except Exception:
                Log.note("Could not set refresh interval for {{index}}", index=es.settings.index)

        # THE REFRESH INTERVAL IS SET FROM A SEPARATE THREAD
        Thread.run("refresh", refresh)

        self._delete_old_indexes(candidates)
        threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
        with self.locker:
            queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
    return queue
def _delete_old_indexes(self, candidates):
    """
    Delete every candidate index that has aged past the retention window.

    The creation time is parsed from the last 15 characters of the index name
    (`%Y%m%d_%H%M%S`). An index is deleted when that time plus
    `self.rollover_interval` is earlier than `Date.today() - self.rollover_max`.
    A failed deletion is logged as a warning and the remaining candidates are
    still processed.

    :param candidates: iterable of objects exposing the index name as `.index`
    """
    for c in candidates:
        timestamp = unicode2Date(c.index[-15:], "%Y%m%d_%H%M%S")
        if timestamp + self.rollover_interval < Date.today() - self.rollover_max:
            try:
                self.cluster.delete_index(c.index)
            except Exception as e:  # `as` FORM IS VALID ON PYTHON 2.6+ AND PYTHON 3
                Log.warning("could not delete index {{index}}", index=c.index, cause=e)
def test_overload(self):
    """
    Expand a template while supplying explicit values for `today` and `var`.

    The expected text is built from the real `Date.today()`, so this test
    expects the supplied `today` (1000) not to replace the built-in value,
    while `var` is taken from the supplied values.
    """
    now = Date.today()
    supplied = {
        "today": 1000,
        "var": 20
    }
    actual = replace_vars('"{{today|week}}" "{{var}}"', supplied)

    week_start = unicode(now.floor(WEEK).unix)
    expected = '"' + week_start + '" "20"'

    self.assertEqual(actual, expected)
def _get_queue(self, row):
    """
    Return the queue that accepts `row`, creating the index and queue for the
    row's time interval when none is cached yet.

    :param row: record holding either `json` (decoded into `value` here) or `value`
    :return: `Null` when the row has no timestamp, `DATA_TOO_OLD` when the
             timestamp is older than `Date.today() - self.rollover_max`,
             otherwise the threaded queue for the row's interval
    """
    row = wrap(row)
    if row.json:
        # DECODE ONCE, THEN DROP THE RAW JSON
        row.value, row.json = json2value(row.json), None
    timestamp = Date(self.rollover_field(row.value))
    if timestamp == None:
        return Null
    elif timestamp < Date.today() - self.rollover_max:
        return DATA_TOO_OLD

    # QUEUES ARE CACHED BY THE START OF THEIR INTERVAL (UNIX TIME)
    rounded_timestamp = timestamp.floor(self.rollover_interval)
    with self.locker:
        queue = self.known_queues.get(rounded_timestamp.unix)
    if queue == None:
        # ALIASES WHOSE INDEX NAME MATCHES <settings.index><YYYYMMDD_HHMMSS>, SORTED BY NAME
        # RAW STRING: `\d` IS NOT A VALID ESCAPE IN A PLAIN STRING LITERAL
        candidates = jx.run({
            "from": ListContainer(".", self.cluster.get_aliases()),
            "where": {"regex": {"index": self.settings.index + r"\d\d\d\d\d\d\d\d_\d\d\d\d\d\d"}},
            "sort": "index"
        })
        # best IS THE LAST CANDIDATE (IN SORTED ORDER) DATED BEFORE timestamp
        best = None
        for c in candidates:
            c = wrap(c)
            c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
            if timestamp > c.date:
                best = c
        if not best or rounded_timestamp > best.date:
            if rounded_timestamp < wrap(candidates[-1]).date:
                # NOTE(review): best MAY STILL BE None ON THIS PATH, AND THEN
                # best.alias FAILS - CONFIRM THIS COMBINATION CAN NOT HAPPEN
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
            else:
                try:
                    es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                    es.add_alias(self.settings.index)
                except Exception as e:
                    e = Except.wrap(e)
                    # ONLY "IndexAlreadyExistsException" IS FOLLOWED BY A QUIET RETRY
                    if "IndexAlreadyExistsException" not in e:
                        Log.error("Problem creating index", cause=e)
                    return self._get_queue(row)  # TRY AGAIN
        else:
            es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

        # BEST EFFORT: FAILURE TO SET THE REFRESH INTERVAL IS IGNORED
        with suppress_exception:
            es.set_refresh_interval(seconds=60 * 5, timeout=5)

        self._delete_old_indexes(candidates)
        threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
        with self.locker:
            queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
    return queue
def _get_queue(self, row):
    """
    Return the queue that accepts `row`, creating the index and queue for the
    row's time interval when none is cached yet.

    :param row: record holding either `json` (decoded into `value` here) or `value`
    :return: `Null` when the row has no timestamp, `DATA_TOO_OLD` when the
             timestamp is older than `Date.today() - self.rollover_max`,
             otherwise the threaded queue for the row's interval
    """
    row = wrap(row)
    if row.json:
        # DECODE ONCE, THEN DROP THE RAW JSON
        row.value, row.json = json2value(row.json), None
    timestamp = Date(self.rollover_field(row.value))
    if timestamp == None:
        return Null
    elif timestamp < Date.today() - self.rollover_max:
        return DATA_TOO_OLD

    # QUEUES ARE CACHED BY THE START OF THEIR INTERVAL (UNIX TIME)
    rounded_timestamp = timestamp.floor(self.rollover_interval)
    with self.locker:
        queue = self.known_queues.get(rounded_timestamp.unix)
    if queue == None:
        # ALIASES WHOSE INDEX NAME MATCHES <settings.index><YYYYMMDD_HHMMSS>, SORTED BY NAME
        # RAW STRING: `\d` IS NOT A VALID ESCAPE IN A PLAIN STRING LITERAL
        candidates = jx.run({
            "from": ListContainer(".", self.cluster.get_aliases()),
            "where": {"regex": {"index": self.settings.index + r"\d\d\d\d\d\d\d\d_\d\d\d\d\d\d"}},
            "sort": "index"
        })
        # best IS THE LAST CANDIDATE (IN SORTED ORDER) DATED BEFORE timestamp
        best = None
        for c in candidates:
            c = wrap(c)
            c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
            if timestamp > c.date:
                best = c
        if not best or rounded_timestamp > best.date:
            if rounded_timestamp < wrap(candidates[-1]).date:
                # NOTE(review): best MAY STILL BE None ON THIS PATH, AND THEN
                # best.alias FAILS - CONFIRM THIS COMBINATION CAN NOT HAPPEN
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
            else:
                try:
                    es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                    es.add_alias(self.settings.index)
                except Exception as e:
                    e = Except.wrap(e)
                    # ONLY "IndexAlreadyExistsException" IS FOLLOWED BY A QUIET RETRY
                    if "IndexAlreadyExistsException" not in e:
                        Log.error("Problem creating index", cause=e)
                    return self._get_queue(row)  # TRY AGAIN
        else:
            es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

        # BEST EFFORT: FAILURE TO SET THE REFRESH INTERVAL IS IGNORED
        with suppress_exception:
            es.set_refresh_interval(seconds=60 * 5, timeout=5)

        self._delete_old_indexes(candidates)
        threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
        with self.locker:
            queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
    return queue
def test_last_week(self):
    """
    `today-7day` must parse to the same instant as today minus seven days.
    """
    parsed = parse("today-7day").unix
    reference = (Date.today() - DAY * 7).unix
    self.assertAlmostEqual(parsed, reference)
# You can obtain one at http://mozilla.org/MPL/2.0/. # # Author: Kyle Lahnakoski ([email protected]) # from __future__ import absolute_import, division, unicode_literals from jx_base.expressions import NULL from jx_base.query import DEFAULT_LIMIT from mo_dots import wrap from mo_logs import Log from mo_times.dates import Date from mo_times.durations import DAY, WEEK from tests.test_jx import BaseTestCase, TEST_TABLE TODAY = Date.today() test_data_1 = [{ "a": "x", "t": Date("today").unix, "v": 2 }, { "a": "x", "t": Date("today-day").unix, "v": 2 }, { "a": "x", "t": Date("today-2day").unix, "v": 3 }, { "a": "x",
# This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. # # Author: Kyle Lahnakoski ([email protected]) # from __future__ import division from __future__ import unicode_literals from mo_times.dates import Date from mo_times.durations import DAY from tests.test_jx import BaseTestCase, TEST_TABLE FROM_DATE = Date.today() - 7 * DAY TO_DATE = Date.today() simple_test_data = [{ "run": { "timestamp": Date("now-4day"), "value": 1 } }, { "run": { "timestamp": Date("now-4day"), "value": 2 } }, { "run": { "timestamp": Date("now-4day"),
def extract(self, settings, force, restart, merge):
    """
    Copy performance alert summaries from the MySQL source to the BigQuery
    destination, one chunk at a time, keeping the progress marker in Redis.

    Errors during extraction, and during the final shard merge, are logged as
    warnings rather than propagated. `Log.stop()` is called at the end.

    :param settings: configuration; this method reads `extractor.app_name`,
                     `extractor.key`, `extractor.sql`, `extractor.chunk_size`,
                     `source`, `source.database` and `destination`
    :param force: when truthy, a changed extraction SQL is logged as a warning
                  instead of an error
    :param restart: when truthy, ignore the saved state and start from (0, 0)
    :param merge: when truthy, merge destination shards before extracting
                  (shards are always merged again after extraction)
    """
    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name,
        kwargs=settings.destination).get_or_create_table(
            settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis.from_url(REDIS_URL)
        state = redis.get(settings.extractor.key)

        if restart or not state:
            # NO USABLE STATE: START OVER AND PERSIST THE FRESH MARKER
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        last_modified, alert_id = state
        last_modified = Date(last_modified)

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                last_modified=last_modified,
                alert_id=alert_id,
            )
            last_year = Date.today() - YEAR + DAY  # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY

            # IDS OF SUMMARIES CREATED AFTER last_year WHERE THE SUMMARY, OR ONE OF
            # ITS ALERTS, WAS UPDATED AFTER last_modified; AT MOST chunk_size IDS
            # NOTE(review): alert_id IS LOGGED AND STORED, BUT NOT USED IN THIS FILTER
            get_ids = SQL(
                "SELECT s.id " +
                "\nFROM treeherder.performance_alert_summary s" +
                "\nLEFT JOIN treeherder.performance_alert a ON s.id=a.summary_id" +
                "\nWHERE s.created>" + quote_value(last_year).sql +
                " AND (s.last_updated > " + quote_value(last_modified).sql +
                "\nOR a.last_updated > " + quote_value(last_modified).sql + ")" +
                "\nGROUP BY s.id" +
                "\nORDER BY s.id" +
                "\nLIMIT " + quote_value(settings.extractor.chunk_size).sql)
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break
            destination.extend(acc)

            # RECORD THE STATE
            # NOTE(review): THE MARKER IS TAKEN FROM `created` OF THE LAST DOCUMENT,
            # WHILE THE FILTER ABOVE COMPARES `last_updated` - CONFIRM THIS IS INTENDED
            last_doc = acc[-1]
            last_modified, alert_id = last_doc.created, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, alert_id)).encode("utf8"),
            )

            # A SHORT CHUNK MEANS THERE IS NOTHING MORE TO PULL
            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done alert extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done alert merge")
    Log.stop()
def test_end_of_month(self):
    """
    `today|month+month` must parse to the start of this month plus one month.
    """
    parsed = parse("today|month+month").unix
    month_start = Date.today().floor(MONTH)
    reference = month_start.add(MONTH).unix
    self.assertAlmostEqual(parsed, reference)
def test_beginning_of_month(self):
    """
    `today|month` must parse to today floored to the month.
    """
    parsed = parse("today|month").unix
    reference = Date.today().floor(MONTH).unix
    self.assertAlmostEqual(parsed, reference)
def test_last_year(self):
    """
    `today-12month` must parse to the same instant as today minus twelve months.
    """
    parsed = parse("today-12month").unix
    reference = (Date.today() - MONTH * 12).unix
    self.assertAlmostEqual(parsed, reference)
def test_week_before(self):
    """
    `today-2week` must parse to the same instant as today minus two weeks.
    """
    parsed = parse("today-2week").unix
    reference = (Date.today() - WEEK * 2).unix
    self.assertAlmostEqual(parsed, reference)
def test_yesterday(self):
    """
    `today-day` must parse to the same instant as today minus one day.
    """
    parsed = parse("today-day").unix
    reference = (Date.today() - DAY).unix
    self.assertAlmostEqual(parsed, reference)
def test_today(self):
    """
    `today` must parse to the same instant as `Date.today()`.
    """
    parsed = parse("today").unix
    reference = Date.today().unix
    self.assertAlmostEqual(parsed, reference)
def run(self, force=False, restart=False, merge=False):
    """
    Read the settings from `CONFIG_FILE`, start logging, then copy performance
    alert summaries from the MySQL source to the BigQuery destination one
    chunk at a time, keeping the progress marker in Redis.

    Errors during extraction, and during the final shard merge, are logged as
    warnings rather than propagated. `Log.stop()` is called at the end.

    :param force: when truthy, a changed extraction SQL is logged as a warning
                  instead of an error
    :param restart: when truthy, ignore the saved state and start from (0, 0)
    :param merge: when truthy, merge destination shards before extracting
                  (shards are always merged again after extraction)
    """
    # SETUP LOGGING
    settings = startup.read_settings(filename=CONFIG_FILE)
    constants.set(settings.constants)
    Log.start(settings.debug)

    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis()
        state = redis.get(settings.extractor.key)

        if restart or not state:
            # NO USABLE STATE: START OVER AND PERSIST THE FRESH MARKER
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        last_modified, alert_id = state
        last_modified = parse(last_modified)

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                last_modified=last_modified,
                alert_id=alert_id,
            )
            last_year = (
                Date.today() - YEAR + DAY
            )  # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY

            # THE ORM EXPRESSION BELOW IS MEANT TO RENDER AS:
            # SELECT
            #     s.id
            # FROM
            #     treeherder.performance_alert_summary s
            # LEFT JOIN
            #     treeherder.performance_alert a ON s.id=a.summary_id
            # WHERE
            #     s.created>{last_year} AND (s.last_updated>{last_modified} OR a.last_updated>{last_modified})
            # GROUP BY
            #     s.id
            # ORDER BY
            #     s.id
            # LIMIT
            #     {settings.extractor.chunk_size}
            # NOTE(review): alert_id IS LOGGED AND STORED, BUT NOT USED IN THIS FILTER
            get_ids = SQL(
                str(
                    (
                        PerformanceAlertSummary.objects.filter(
                            Q(created__gt=last_year.datetime)
                            & (
                                Q(last_updated__gt=last_modified.datetime)
                                | Q(alerts__last_updated__gt=last_modified.datetime)
                            )
                        )
                        .annotate()
                        .values("id")
                        .order_by("id")[: settings.extractor.chunk_size]
                    ).query
                )
            )

            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break
            destination.extend(acc)

            # RECORD THE STATE
            # NOTE(review): last_modified IS REPLACED BY last_doc.created, AND THE NEXT
            # PASS READS last_modified.datetime - CONFIRM THAT VALUE HAS A `datetime`
            # ATTRIBUTE, AND THAT `created` (NOT `last_updated`) IS THE INTENDED MARKER
            last_doc = acc[-1]
            last_modified, alert_id = last_doc.created, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, alert_id)).encode("utf8"),
            )

            # A SHORT CHUNK MEANS THERE IS NOTHING MORE TO PULL
            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done alert extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done alert merge")
    Log.stop()
def test_next_week(self):
    """
    `today+7day` must parse to the same instant as today plus seven days.
    """
    parsed = parse("today+7day").unix
    reference = (Date.today() + DAY * 7).unix
    self.assertAlmostEqual(parsed, reference)
threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True) with self.locker: queue = self.known_queues[rounded_timestamp.unix] = threaded_queue return queue def _delete_old_indexes(self, candidates): for c in candidates: timestamp = unicode2Date(c.index[-15:], "%Y%m%d_%H%M%S") if timestamp + self.rollover_interval < Date.today() - self.rollover_max: # Log.warning("Will delete {{index}}", index=c.index) try: self.cluster.delete_index(c.index) except Exception, e: Log.warning("could not delete index {{index}}", index=c.index, cause=e) for t, q in list(self.known_queues.items()): if unix2Date(t) + self.rollover_interval < Date.today() - self.rollover_max: with self.locker: del self.known_queues[t] pass # ADD keys() SO ETL LOOP CAN FIND WHAT'S GETTING REPLACED def keys(self, prefix=None): path = jx.reverse(etl2path(key2etl(prefix))) result = self.es.search({ "fields": ["_id"], "query": { "filtered": { "query": {"match_all": {}}, "filter": {"and": [{"term": {"etl" + (".source" * i) + ".id": v}} for i, v in enumerate(path)]}