Example #1
 def test_big_result_works(self):
     result = http.post_json(global_settings.testing.query,
                             data={
                                 "from": "unittest",
                                 "where": {
                                     "and": [{
                                         "gte": {
                                             "run.timestamp":
                                             Date.today() - DAY
                                         }
                                     }, {
                                         "lt": {
                                             "run.timestamp": Date.today()
                                         }
                                     }, {
                                         "eq": {
                                             "result.ok": False
                                         }
                                     }]
                                 },
                                 "format": "list",
                                 "limit": 10000
                             })
     if result.template:
         result = Except.new_instance(result)
         Log.error("problem with call", cause=result)
     Log.note("Got {{num}} test failures", num=len(result.data))
Example #2
    def _delete_old_indexes(self, candidates):
        for c in candidates:
            timestamp = unicode2Date(c.index[-15:], "%Y%m%d_%H%M%S")
            if timestamp + self.rollover_interval < Date.today() - self.rollover_max:
                # Log.warning("Will delete {{index}}", index=c.index)
                try:
                    self.cluster.delete_index(c.index)
                except Exception as e:
                    Log.warning("could not delete index {{index}}", index=c.index, cause=e)
        for t, q in list(self.known_queues.items()):
            if unix2Date(t) + self.rollover_interval < Date.today() - self.rollover_max:
                with self.locker:
                    del self.known_queues[t]

        pass
Example #3
    def _delete_old_indexes(self, candidates):
        for c in candidates:
            timestamp = unicode2Date(c.index[-15:], "%Y%m%d_%H%M%S")
            if timestamp + self.rollover_interval < Date.today() - self.rollover_max:
                # Log.warning("Will delete {{index}}", index=c.index)
                try:
                    self.cluster.delete_index(c.index)
                except Exception as e:
                    Log.warning("could not delete index {{index}}", index=c.index, cause=e)
        for t, q in items(self.known_queues):
            if unix2Date(t) + self.rollover_interval < Date.today() - self.rollover_max:
                with self.locker:
                    del self.known_queues[t]

        pass
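
The cutoff used above deletes an index once its creation timestamp plus the rollover interval falls before `Date.today() - rollover_max`. A minimal sketch of that test with hypothetical settings:

from mo_times.dates import Date
from mo_times.durations import WEEK

rollover_interval = WEEK               # hypothetical setting
rollover_max = WEEK * 4                # hypothetical setting

timestamp = Date.today() - WEEK * 6    # pretend creation time parsed from the index name
should_delete = timestamp + rollover_interval < Date.today() - rollover_max
print(should_delete)                   # True: the index is past its retention window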
Example #4
    def test_two_simple(self):
        today = Date.today()

        result = replace_vars('"{{today|week}}" "{{today}}d"')
        expect = '"' + unicode(today.floor(WEEK).unix) + '" "' + unicode(
            today.unix) + 'd"'
        self.assertEqual(result, expect)
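
For reference, a minimal sketch of the string the test above expects: `{{today|week}}` expands to the unix timestamp of the current week's start, and `{{today}}` to today's unix timestamp:

from mo_times.dates import Date
from mo_times.durations import WEEK

today = Date.today()
# what '"{{today|week}}" "{{today}}d"' should expand to
expected = '"' + str(today.floor(WEEK).unix) + '" "' + str(today.unix) + 'd"'
print(expected)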
Example #5
    def _get_queue(self, row):
        row = wrap(row)
        if row.json:
            row.value, row.json = json2value(row.json), None
        timestamp = Date(self.rollover_field(row.value))
        if timestamp == None:
            return Null
        elif timestamp < Date.today() - self.rollover_max:
            return DATA_TOO_OLD

        rounded_timestamp = timestamp.floor(self.rollover_interval)
        with self.locker:
            queue = self.known_queues.get(rounded_timestamp.unix)
        if queue == None:
            candidates = sort_using_key(
                filter(
                    lambda r: re.match(
                        re.escape(self.settings.index) + r"\d\d\d\d\d\d\d\d_\d\d\d\d\d\d$",
                        r['index']
                    ),
                    self.cluster.get_aliases()
                ),
                key=lambda r: r['index']
            )
            best = None
            for c in candidates:
                c = wrap(c)
                c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
                if timestamp > c.date:
                    best = c
            if not best or rounded_timestamp > best.date:
                if rounded_timestamp < wrap(candidates[-1]).date:
                    es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
                else:
                    try:
                        es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                        es.add_alias(self.settings.index)
                    except Exception as e:
                        e = Except.wrap(e)
                        if "IndexAlreadyExistsException" not in e:
                            Log.error("Problem creating index", cause=e)
                        return self._get_queue(row)  # TRY AGAIN
            else:
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

            def refresh(please_stop):
                try:
                    es.set_refresh_interval(seconds=60 * 10, timeout=5)
                except Exception:
                    Log.note("Could not set refresh interval for {{index}}", index=es.settings.index)

            Thread.run("refresh", refresh)

            self._delete_old_indexes(candidates)
            threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
            with self.locker:
                queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
        return queue
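
The lookup above keys `known_queues` by the record's timestamp floored to the rollover interval. A minimal sketch of that bucketing, assuming a weekly interval:

from mo_times.dates import Date
from mo_times.durations import WEEK

rollover_interval = WEEK                 # hypothetical setting
timestamp = Date.today()                 # pretend this came from rollover_field(row.value)
rounded_timestamp = timestamp.floor(rollover_interval)
queue_key = rounded_timestamp.unix       # the key used into known_queues
print(queue_key)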
Example #6
 def _delete_old_indexes(self, candidates):
     for c in candidates:
         timestamp = unicode2Date(c.index[-15:], "%Y%m%d_%H%M%S")
         if timestamp + self.rollover_interval < Date.today() - self.rollover_max:
             # Log.warning("Will delete {{index}}", index=c.index)
             try:
                 self.cluster.delete_index(c.index)
             except Exception as e:
                 Log.warning("could not delete index {{index}}", index=c.index, cause=e)
Example #7
    def test_overload(self):
        today = Date.today()

        result = replace_vars('"{{today|week}}" "{{var}}"', {
            "today": 1000,
            "var": 20
        })
        expect = '"' + unicode(today.floor(WEEK).unix) + '" "20"'
        self.assertEqual(result, expect)
Example #8
    def _get_queue(self, row):
        row = wrap(row)
        if row.json:
            row.value, row.json = json2value(row.json), None
        timestamp = Date(self.rollover_field(row.value))
        if timestamp == None:
            return Null
        elif timestamp < Date.today() - self.rollover_max:
            return DATA_TOO_OLD

        rounded_timestamp = timestamp.floor(self.rollover_interval)
        with self.locker:
            queue = self.known_queues.get(rounded_timestamp.unix)
        if queue == None:
            candidates = jx.run({
                "from": ListContainer(".", self.cluster.get_aliases()),
                "where": {"regex": {"index": self.settings.index + "\d\d\d\d\d\d\d\d_\d\d\d\d\d\d"}},
                "sort": "index"
            })
            best = None
            for c in candidates:
                c = wrap(c)
                c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
                if timestamp > c.date:
                    best = c
            if not best or rounded_timestamp > best.date:
                if rounded_timestamp < wrap(candidates[-1]).date:
                    es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
                else:
                    try:
                        es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                        es.add_alias(self.settings.index)
                    except Exception as e:
                        e = Except.wrap(e)
                        if "IndexAlreadyExistsException" not in e:
                            Log.error("Problem creating index", cause=e)
                        return self._get_queue(row)  # TRY AGAIN
            else:
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

            with suppress_exception:
                es.set_refresh_interval(seconds=60 * 5, timeout=5)

            self._delete_old_indexes(candidates)
            threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
            with self.locker:
                queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
        return queue
Example #9
 def test_last_week(self):
     self.assertAlmostEqual(
         parse("today-7day").unix, (Date.today() - DAY * 7).unix)
Example #10
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski ([email protected])
#

from __future__ import absolute_import, division, unicode_literals

from jx_base.expressions import NULL
from jx_base.query import DEFAULT_LIMIT
from mo_dots import wrap
from mo_logs import Log
from mo_times.dates import Date
from mo_times.durations import DAY, WEEK
from tests.test_jx import BaseTestCase, TEST_TABLE

TODAY = Date.today()

test_data_1 = [{
    "a": "x",
    "t": Date("today").unix,
    "v": 2
}, {
    "a": "x",
    "t": Date("today-day").unix,
    "v": 2
}, {
    "a": "x",
    "t": Date("today-2day").unix,
    "v": 3
}, {
    "a": "x",
Example #11
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski ([email protected])
#

from __future__ import division
from __future__ import unicode_literals

from mo_times.dates import Date
from mo_times.durations import DAY

from tests.test_jx import BaseTestCase, TEST_TABLE

FROM_DATE = Date.today() - 7 * DAY
TO_DATE = Date.today()

simple_test_data = [{
    "run": {
        "timestamp": Date("now-4day"),
        "value": 1
    }
}, {
    "run": {
        "timestamp": Date("now-4day"),
        "value": 2
    }
}, {
    "run": {
        "timestamp": Date("now-4day"),
Example #12
    def extract(self, settings, force, restart, merge):
        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name,
            kwargs=settings.destination).get_or_create_table(
                settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis.from_url(REDIS_URL)
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key,
                          value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, alert_id = state
            last_modified = Date(last_modified)

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                    last_modified=last_modified,
                    alert_id=alert_id,
                )
                last_year = Date.today() - YEAR + DAY  # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY

                get_ids = SQL(
                    "SELECT s.id " +
                    "\nFROM treeherder.performance_alert_summary s" +
                    "\nLEFT JOIN treeherder.performance_alert a ON s.id=a.summary_id"
                    + "\nWHERE s.created>" + quote_value(last_year).sql +
                    " AND (s.last_updated > " +
                    quote_value(last_modified).sql + "\nOR a.last_updated > " +
                    quote_value(last_modified).sql + ")" + "\nGROUP BY s.id" +
                    "\nORDER BY s.id" + "\nLIMIT " +
                    quote_value(settings.extractor.chunk_size).sql)
                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, alert_id = last_doc.created, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, alert_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done alert extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done alert merge")
        Log.stop()
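
The extractor above checkpoints its progress as a small JSON pair in Redis so a later run can resume. A minimal sketch of that round-trip, using the same value2json/json2value helpers and redis-py client the snippet uses (the key name here is hypothetical):

from mo_json import json2value, value2json
from redis import Redis

redis = Redis()                          # or Redis.from_url(REDIS_URL), as above
STATE_KEY = "extractor-state"            # hypothetical key name

# store the initial state: (last_modified, alert_id)
redis.set(STATE_KEY, value2json((0, 0)).encode("utf8"))

# a later run recovers it the same way the extractor does
last_modified, alert_id = json2value(redis.get(STATE_KEY).decode("utf8"))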
Example #13
 def test_end_of_month(self):
     self.assertAlmostEqual(
         parse("today|month+month").unix,
         Date.today().floor(MONTH).add(MONTH).unix)
Example #14
 def test_beginning_of_month(self):
     self.assertAlmostEqual(
         parse("today|month").unix,
         Date.today().floor(MONTH).unix)
Example #15
 def test_last_year(self):
     self.assertAlmostEqual(
         parse("today-12month").unix, (Date.today() - MONTH * 12).unix)
Example #16
 def test_week_before(self):
     self.assertAlmostEqual(
         parse("today-2week").unix, (Date.today() - WEEK * 2).unix)
Example #17
 def test_yesterday(self):
     self.assertAlmostEqual(
         parse("today-day").unix, (Date.today() - DAY).unix)
Example #18
 def test_today(self):
     self.assertAlmostEqual(parse("today").unix, Date.today().unix)
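
Taken together, the parse() tests above pair each relative-date expression with plain mo_times arithmetic. A minimal reference sketch of those right-hand sides (only operations already used in the tests):

from mo_times.dates import Date
from mo_times.durations import DAY, WEEK, MONTH

today = Date.today()
print((today - DAY * 7).unix)               # "today-7day"
print((today - WEEK * 2).unix)              # "today-2week"
print((today - DAY).unix)                   # "today-day"
print(today.floor(MONTH).unix)              # "today|month"
print(today.floor(MONTH).add(MONTH).unix)   # "today|month+month"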
Example #19
    def run(self, force=False, restart=False, merge=False):
        # SETUP LOGGING
        settings = startup.read_settings(filename=CONFIG_FILE)
        constants.set(settings.constants)
        Log.start(settings.debug)

        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name, kwargs=settings.destination
        ).get_or_create_table(settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis()
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key, value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, alert_id = state
            last_modified = parse(last_modified)

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                    last_modified=last_modified,
                    alert_id=alert_id,
                )
                last_year = (
                    Date.today() - YEAR + DAY
                )  # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY

                # SELECT
                #     s.id
                # FROM
                #     treeherder.performance_alert_summary s
                # LEFT JOIN
                #     treeherder.performance_alert a ON s.id=a.summary_id
                # WHERE
                #     s.created>{last_year} AND (s.last_updated>{last_modified} OR a.last_updated>{last_modified})
                # GROUP BY
                #     s.id
                # ORDER BY
                #     s.id
                # LIMIT
                #     {settings.extractor.chunk_size}
                get_ids = SQL(
                    str(
                        (
                            PerformanceAlertSummary.objects.filter(
                                Q(created__gt=last_year.datetime)
                                & (
                                    Q(last_updated__gt=last_modified.datetime)
                                    | Q(alerts__last_updated__gt=last_modified.datetime)
                                )
                            )
                            .annotate()
                            .values("id")
                            .order_by("id")[: settings.extractor.chunk_size]
                        ).query
                    )
                )

                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, alert_id = last_doc.created, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, alert_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done alert extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done alert merge")
        Log.stop()
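
Where the earlier version built the id query by string concatenation, the run() above lets the Django ORM build it and then extracts the SQL text with str(queryset.query). A minimal sketch of that technique (the model import path and filter values are assumptions, and it needs a configured Django project to run):

from django.db.models import Q
from treeherder.perf.models import PerformanceAlertSummary  # assumed import path

qs = (
    PerformanceAlertSummary.objects
    .filter(Q(created__gt="2020-01-01"))   # hypothetical cutoff
    .values("id")
    .order_by("id")[:1000]                 # hypothetical chunk size
)
raw_sql = str(qs.query)                    # SQL text, handed to the extractor via SQL(...)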
Example #20
 def test_next_week(self):
     self.assertAlmostEqual(
         parse("today+7day").unix, (Date.today() + DAY * 7).unix)
Example #21
            threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
            with self.locker:
                queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
        return queue

    def _delete_old_indexes(self, candidates):
        for c in candidates:
            timestamp = unicode2Date(c.index[-15:], "%Y%m%d_%H%M%S")
            if timestamp + self.rollover_interval < Date.today() - self.rollover_max:
                # Log.warning("Will delete {{index}}", index=c.index)
                try:
                    self.cluster.delete_index(c.index)
                except Exception as e:
                    Log.warning("could not delete index {{index}}", index=c.index, cause=e)
        for t, q in list(self.known_queues.items()):
            if unix2Date(t) + self.rollover_interval < Date.today() - self.rollover_max:
                with self.locker:
                    del self.known_queues[t]

        pass

    # ADD keys() SO ETL LOOP CAN FIND WHAT'S GETTING REPLACED
    def keys(self, prefix=None):
        path = jx.reverse(etl2path(key2etl(prefix)))

        result = self.es.search({
            "fields": ["_id"],
            "query": {
                "filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [{"term": {"etl" + (".source" * i) + ".id": v}} for i, v in enumerate(path)]}