Example No. 1
def extract_alert_settings(env_setup):
    settings = startup.read_settings(filename=extract_alerts.CONFIG_FILE,
                                     complain=False)
    settings.source.database.ssl = None  # NOT REQUIRED FOR TEST DATABASE
    constants.set(settings.constants)
    Log.start(settings.debug)
    return settings
Example No. 2
    def setUpClass(cls):
        try:
            cls.config = startup.read_settings(filename="tests/config.json")
            constants.set(cls.config.constants)
            Log.start(cls.config.debug)
        except Exception as e:
            Log.error("Problem with etl", e)
Example No. 3
def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)
        with SingleInstance(flavor_id=settings.args.filename):
            settings.run_interval = Duration(settings.run_interval)
            for u in settings.utility:
                u.discount = coalesce(u.discount, 0)
                # MARKUP drives WITH EXPECTED device MAPPING
                num_ephemeral_volumes = ephemeral_storage[
                    u.instance_type]["num"]
                for i, d in enumerate(d for d in u.drives if not d.device):
                    letter = convert.ascii2char(98 + num_ephemeral_volumes + i)
                    d.device = "/dev/xvd" + letter

            settings.utility = UniqueIndex(["instance_type"],
                                           data=settings.utility)
            instance_manager = new_instance(settings.instance)
            m = SpotManager(instance_manager, kwargs=settings)

            if ENABLE_SIDE_EFFECTS:
                m.update_spot_requests()

            if m.watcher:
                m.watcher.join()
    except Exception as e:
        Log.warning("Problem with spot manager", cause=e)
    finally:
        Log.stop()
        MAIN_THREAD.stop()
Example No. 4
def main():
    try:
        settings = startup.read_settings(
            defs=[{
                "name": ["--all", "-a"],
                "action": 'store_true',
                "help": 'process all mo-* subdirectories',
                "dest": "all",
                "required": False
            }, {
                "name": ["--dir", "--directory", "-d"],
                "help": 'directory to deploy',
                "type": str,
                "dest": "directory",
                "required": True,
                "default": "."
            }])
        constants.set(settings.constants)
        Log.start(settings.debug)

        if settings.args.all:
            deploy_all(File(settings.args.directory), settings.prefix,
                       settings)
        else:
            Deploy(File(settings.args.directory), kwargs=settings).deploy()
    except Exception as e:
        Log.warning("Problem with etl", cause=e)
Example No. 5
def main():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        with SingleInstance(flavor_id=settings.args.filename):
            constants.set(settings.constants)
            settings.run_interval = Duration(settings.run_interval)
            for u in settings.utility:
                u.discount = coalesce(u.discount, 0)
                # MARKUP drives WITH EXPECTED device MAPPING
                num_ephemeral_volumes = ephemeral_storage[u.instance_type]["num"]
                for i, d in enumerate(d for d in u.drives if not d.device):
                    letter = convert.ascii2char(98 + num_ephemeral_volumes + i)
                    d.device = "/dev/xvd" + letter

            settings.utility = UniqueIndex(["instance_type"], data=settings.utility)
            instance_manager = new_instance(settings.instance)
            m = SpotManager(instance_manager, kwargs=settings)

            if ENABLE_SIDE_EFFECTS:
                m.update_spot_requests(instance_manager.required_utility())

            if m.watcher:
                m.watcher.join()
    except Exception as e:
        Log.warning("Problem with spot manager", cause=e)
    finally:
        Log.stop()
        MAIN_THREAD.stop()
Example No. 6
def main():
    try:
        settings = startup.read_settings()
        with startup.SingleInstance(settings.args.filename):
            constants.set(settings.constants)
            Log.start(settings.debug)

            extractor = Extract(settings)

            def extract(please_stop):
                with MySQL(**settings.snowflake.database) as db:
                    with db.transaction():
                        for kwargs in extractor.queue:
                            if please_stop:
                                break
                            try:
                                extractor.extract(db=db,
                                                  please_stop=please_stop,
                                                  **kwargs)
                            except Exception as e:
                                Log.warning("Could not extract", cause=e)
                                extractor.queue.add(kwargs)

            for i in range(settings.extract.threads):
                Thread.run("extract #" + text_type(i), extract)

            please_stop = Signal()
            Thread.wait_for_shutdown_signal(please_stop=please_stop,
                                            allow_exit=True,
                                            wait_forever=False)
    except Exception as e:
        Log.warning("Problem with data extraction", e)
    finally:
        Log.stop()
Example No. 7
def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)
        please_stop = Signal("main stop signal")
        Thread.wait_for_shutdown_signal(please_stop)
    except Exception as e:
        Log.error("Problem with etl", cause=e)
Example No. 8
def setup(settings=None):
    global config

    try:
        config = startup.read_settings(defs={
            "name": ["--process_num", "--process"],
            "help": "Additional port offset (for multiple Flask processes)",
            "type": int,
            "dest": "process_num",
            "default": 0,
            "required": False
        },
                                       filename=settings)
        constants.set(config.constants)
        Log.start(config.debug)

        if config.args.process_num and config.flask.port:
            config.flask.port += config.args.process_num

        # PIPE REQUEST LOGS TO ES DEBUG
        if config.request_logs:
            request_logger = elasticsearch.Cluster(
                config.request_logs).get_or_create_index(config.request_logs)
            active_data.request_log_queue = request_logger.threaded_queue(
                max_size=2000)

        # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
        containers.config.default = {
            "type": "elasticsearch",
            "settings": config.elasticsearch.copy()
        }

        # TURN ON /exit FOR WINDOWS DEBUGGING
        if config.flask.debug or config.flask.allow_exit:
            config.flask.allow_exit = None
            Log.warning("ActiveData is in debug mode")
            app.add_url_rule('/exit', 'exit', _exit)

        # TRIGGER FIRST INSTANCE
        FromESMetadata(config.elasticsearch)
        if config.saved_queries:
            setattr(save_query, "query_finder",
                    SaveQueries(config.saved_queries))
        HeaderRewriterFix(app, remove_headers=['Date', 'Server'])

        if config.flask.ssl_context:
            if config.args.process_num:
                Log.error(
                    "can not serve ssl and multiple Flask instances at once")
            setup_ssl()

        return app
    except Exception as e:
        Log.error(
            "Serious problem with ActiveData service construction!  Shutdown!",
            cause=e)
Example No. 9
def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)
        ETL(settings).setup(settings.instance, settings.utility)
    except Exception as e:
        Log.warning("Problem with setup of ETL", cause=e)
    finally:
        Log.stop()
Example No. 10
def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)
        ETL(settings).setup(settings.instance, settings.utility)
    except Exception as e:
        Log.warning("Problem with setup of ETL", cause=e)
    finally:
        Log.stop()
Example No. 11
def main():
    settings = startup.read_settings()
    Log.start(settings.debug)
    constants.set(settings.constants)

    try:
        _synch(settings)
    except Exception as e:
        Log.error("Problem with synch", e)
    finally:
        Log.stop()
Example No. 12
def main():
    global config
    global hg
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        hg = HgMozillaOrg(config)
        Log.start(config.debug)

    except Exception as e:
        Log.error("Problem with etl", e)
Example No. 13
    def setUpClass(cls):
        global config, broker
        try:
            config = startup.read_settings(filename="tests/config/file.json")
            constants.set(config.constants)
            Log.start(config.debug)

            File(config.broker.backing.directory).delete()
            broker = Broker(kwargs=config.broker)
        except Exception as e:
            Log.error("could not setup for testing", cause=e)
Example No. 14
    def run(self, force=False, restart=False, start=None, merge=False):
        try:
            # SETUP LOGGING
            settings = startup.read_settings(filename=CONFIG_FILE)
            constants.set(settings.constants)
            Log.start(settings.debug)

            self.extract(settings, force, restart, start, merge)
        except Exception as e:
            Log.error("could not extract jobs", cause=e)
        finally:
            Log.stop()
Example No. 15
def main():
    global config
    global hg
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)
        hg = HgMozillaOrg(config)
        random = _parse_diff(
            Data(changeset={"id": "2d9d0bebb5c6"},
                 branch={"url": "https://hg.mozilla.org/mozilla-central"}))
    except Exception as e:
        Log.error("Problem with etl", e)
Example No. 16
def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        inject_secrets(config)

        with Timer("PATCH ADR: add update() method to Configuration class"):

            def update(self, config):
                """
                Update the configuration object with new parameters
                :param config: dict of configuration
                """
                for k, v in config.items():
                    if v != None:
                        self._config[k] = v

                self._config["sources"] = sorted(
                    map(os.path.expanduser, set(self._config["sources"]))
                )

                # Use the NullStore by default. This allows us to control whether
                # caching is enabled or not at runtime.
                self._config["cache"].setdefault("stores", {"null": {"driver": "null"}})
                object.__setattr__(self, "cache", CacheManager(self._config["cache"]))
                self.cache.extend("null", lambda driver: NullStore())

            setattr(Configuration, "update", update)

        # UPDATE ADR CONFIGURATION
        adr.config.update(config.adr)

        Log.start(config.debug)

        # SHUNT ADR LOGGING TO MAIN LOGGING
        # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.add
        loguru.logger.remove()
        loguru.logger.add(
            _logging, level="DEBUG", format="{message}", filter=lambda r: True,
        )

        Schedulers(config).process()
    except Exception as e:
        Log.warning("Problem with etl! Shutting down.", cause=e)
    finally:
        Log.stop()
Example No. 17
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = _get_branches_from_hg(settings.hg)

        es = elasticsearch.Cluster(kwargs=settings.hg.branches).get_or_create_index(kwargs=settings.hg.branches)
        es.add_alias()
        es.extend({"id": b.name + " " + b.locale, "value": b} for b in branches)
        Log.alert("DONE!")
    except Exception as e:
        Log.error("Problem with etl", e)
    finally:
        Log.stop()
Example No. 18
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = _get_branches_from_hg(settings.hg)

        es = elasticsearch.Cluster(kwargs=settings.hg.branches).get_or_create_index(kwargs=settings.hg.branches)
        es.add_alias()
        es.extend({"id": b.name + " " + b.locale, "value": b} for b in branches)
        Log.alert("DONE!")
    except Exception as e:
        Log.error("Problem with etl", e)
    finally:
        Log.stop()
Example No. 19
def extract_job_settings():
    # These values are not directly accessed during testing, but the code requires that they be present.
    os.environ["NEW_RELIC_APP_NAME"] = "testing"
    os.environ["BIGQUERY_PRIVATE_KEY_ID"] = "1"
    os.environ["BIGQUERY_PRIVATE_KEY"] = "1"

    # USE THE TEST SCHEMA
    db_url = os.environ["DATABASE_URL"]
    db_url = db_url.replace(strings.between(db_url, "/", None),
                            DATABASES["default"]["TEST"]["NAME"])
    os.environ["DATABASE_URL"] = db_url

    settings = startup.read_settings(filename=extract_jobs.CONFIG_FILE,
                                     complain=False)
    settings.source.database.ssl = None  # NOT REQUIRED FOR TEST DATABASE
    constants.set(settings.constants)
    Log.start(settings.debug)
    return settings
Example No. 20
def setup():
    global config

    config = startup.read_settings(
        filename=os.environ.get('ACTIVEDATA_CONFIG'),
        defs=[
            {
                "name": ["--process_num", "--process"],
                "help": "Additional port offset (for multiple Flask processes)",
                "type": int,
                "dest": "process_num",
                "default": 0,
                "required": False
            }
        ]
    )

    constants.set(config.constants)
    Log.start(config.debug)

    # PIPE REQUEST LOGS TO ES DEBUG
    if config.request_logs:
        cluster = elasticsearch.Cluster(config.request_logs)
        request_logger = cluster.get_or_create_index(config.request_logs)
        active_data.request_log_queue = request_logger.threaded_queue(max_size=2000)

    if config.dockerflow:
        def backend_check():
            http.get_json(config.elasticsearch.host + ":" + text_type(config.elasticsearch.port))
        dockerflow(flask_app, backend_check)


    # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
    container.config.default = {
        "type": "elasticsearch",
        "settings": config.elasticsearch.copy()
    }

    # TRIGGER FIRST INSTANCE
    if config.saved_queries:
        setattr(save_query, "query_finder", SaveQueries(config.saved_queries))

    HeaderRewriterFix(flask_app, remove_headers=['Date', 'Server'])
Example No. 21
def main():
    try:
        config = startup.read_settings(defs=[{
            "name": ["--file"],
            "help": "file to save backup",
            "type": str,
            "dest": "file",
            "required": True
        }])
        constants.set(config.constants)
        Log.start(config.debug)

        sq = elasticsearch.Index(kwargs=config.saved_queries)
        result = sq.search({"query": {"match_all": {}}, "size": 200000})

        File(config.args.file).write("".join(
            map(convert.value2json, result.hits.hits)))

    except Exception as e:
        Log.error("Problem with etl", e)
Example No. 22
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        hg = HgMozillaOrg(settings)
        todo = Queue()
        todo.add("97160a734959")
        least = 100000

        while todo:
            next_ = todo.pop()
            curr = hg.get_revision(
                wrap({
                    "changeset": {
                        "id": next_
                    },
                    "branch": {
                        "name": BRANCH
                    }
                }))
            if len(curr.changeset.files) > MIN_FILES:
                diff = hg._get_json_diff_from_hg(curr)
                num_changes = sum(len(d.changes) for d in diff)
                score = num_changes / len(diff)
                if score < least:
                    least = score
                    Log.note(
                        "smallest = {{rev}}, num_lines={{num}}, num_files={{files}}",
                        rev=curr.changeset.id,
                        num=num_changes,
                        files=len(diff))
            todo.extend(listwrap(curr.parents))

    except Exception as e:
        Log.error("Problem with scan", e)
    finally:
        Log.stop()
Example No. 23
 config = startup.read_settings([
     {
         "name": ["--id", "--key", "--ids", "--keys"],
         "dest": "id",
         "nargs": "*",
         "type": int,
         "help": "show specific signatures",
     },
     {
         "name": "--download",
         "dest": "download",
         "help": "download deviance to CSV local file",
         "nargs": "?",
         "const": "deviant_stats.csv",
         "type": str,
         "action": "store",
     },
     {
         "name": ["--dev", "--deviant", "--deviance"],
         "dest": "deviant",
         "nargs": "?",
         "const": 10,
         "type": int,
         "help": "show number of top deviant series",
         "action": "store",
     },
     {
         "name": ["--modal"],
         "dest": "modal",
         "nargs": "?",
         "const": 10,
         "type": int,
         "help": "show number of top modal series",
         "action": "store",
     },
     {
         "name": ["--outliers"],
         "dest": "outliers",
         "nargs": "?",
         "const": 10,
         "type": int,
         "help": "show number of top outliers series",
         "action": "store",
     },
     {
         "name": ["--skewed", "--skew"],
         "dest": "skewed",
         "nargs": "?",
         "const": 10,
         "type": int,
         "help": "show number of top skewed series",
         "action": "store",
     },
     {
         "name": ["--ok"],
         "dest": "ok",
         "nargs": "?",
         "const": 10,
         "type": int,
         "help": "show number of top worst OK series",
         "action": "store",
     },
     {
         "name": ["--noise", "--noisy"],
         "dest": "noise",
         "nargs": "?",
         "const": 10,
         "type": int,
         "help": "show number of top noisiest series",
         "action": "store",
     },
     {
         "name": ["--extra", "-e"],
         "dest": "extra",
         "nargs": "?",
         "const": 10,
         "type": int,
         "help": "show number of series that are missing perfherder alerts",
         "action": "store",
     },
     {
         "name": ["--missing", "--miss", "-m"],
         "dest": "missing",
         "nargs": "?",
         "const": 10,
         "type": int,
         "help":
         "show number of series which are missing alerts over perfherder",
         "action": "store",
     },
     {
         "name": ["--pathological", "--pathology", "-p"],
         "dest": "pathological",
         "nargs": "?",
         "const": 3,
         "type": int,
         "help": "show number of series that have most edges",
         "action": "store",
     },
 ])
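Each entry in the defs list above is handed to the argument parser, and the parsed value shows up on config.args under its "dest" name, the same config.args.* access used in the other examples. Below is a minimal sketch of consuming one of these flags; the invocation in the comment and the single-entry defs list are illustrative only, and it assumes the configuration file is located the same way as in the examples above:

from mo_logs import Log, startup

# Hypothetical invocation:  python report.py --deviant 5
config = startup.read_settings([
    {
        "name": ["--dev", "--deviant", "--deviance"],
        "dest": "deviant",
        "nargs": "?",
        "const": 10,
        "type": int,
        "help": "show number of top deviant series",
        "action": "store",
    },
])
if config.args.deviant:
    # number of top deviant series requested (const=10 when the flag is given with no value)
    Log.note("showing top {{num}} deviant series", num=config.args.deviant)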
Example No. 24
    def run(self, force=False, restart=False, merge=False):
        # SETUP LOGGING
        settings = startup.read_settings(filename=CONFIG_FILE)
        constants.set(settings.constants)
        Log.start(settings.debug)

        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name, kwargs=settings.destination
        ).get_or_create_table(settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis()
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key, value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, job_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                    last_modified=last_modified,
                    job_id=job_id,
                )

                # Example: job.id ==283890114
                # get_ids = ConcatSQL(
                #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
                # )
                # get_ids = sql_query(
                #     {
                #         "from": "job",
                #         "select": ["id"],
                #         "where": {
                #             "or": [
                #                 {"gt": {"last_modified": parse(last_modified)}},
                #                 {
                #                     "and": [
                #                         {"eq": {"last_modified": parse(last_modified)}},
                #                         {"gt": {"id": job_id}},
                #                     ]
                #                 },
                #             ]
                #         },
                #         "sort": ["last_modified", "id"],
                #         "limit": settings.extractor.chunk_size,
                #     }
                # )

                get_ids = SQL(str(
                    (
                        Job.objects.filter(
                            Q(last_modified__gt=parse(last_modified).datetime)
                            | (
                                Q(last_modified=parse(last_modified).datetime)
                                & Q(id__gt=job_id)
                            )
                        )
                        .annotate()
                        .values("id")
                        .order_by("last_modified", "id")[
                            : settings.extractor.chunk_size
                        ]
                    ).query
                ))

                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, job_id = last_doc.last_modified, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, job_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done job extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done job merge")
Example No. 25
if __name__ in ("__main__", ):
    Log.note("Starting TUID Service App...")
    flask_app = TUIDApp(__name__)
    flask_app.add_url_rule(str('/'),
                           None,
                           tuid_endpoint,
                           defaults={'path': ''},
                           methods=[str('GET'), str('POST')])
    flask_app.add_url_rule(str('/<path:path>'),
                           None,
                           tuid_endpoint,
                           methods=[str('GET'), str('POST')])

    try:
        config = startup.read_settings(filename=os.environ.get('TUID_CONFIG'))
        constants.set(config.constants)
        Log.start(config.debug)

        service = TUIDService(config.tuid)
        Log.note("Started TUID Service")
    except BaseException as e:  # MUST CATCH BaseException BECAUSE argparse LIKES TO EXIT THAT WAY, AND gunicorn WILL NOT REPORT
        try:
            Log.error(
                "Serious problem with TUID service construction!  Shutdown!",
                cause=e)
        finally:
            Log.stop()

    if config.flask:
        if config.flask.port and config.args.process_num:
Example No. 26
File: app.py Project: rv404674/TUID
        headers={
            "Content-Type": "text/html"
        }
    )


if __name__ in ("__main__",):
    Log.note("Starting TUID Service App...")
    flask_app = TUIDApp(__name__)
    flask_app.add_url_rule(str('/'), None, tuid_endpoint, defaults={'path': ''}, methods=[str('GET'), str('POST')])
    flask_app.add_url_rule(str('/<path:path>'), None, tuid_endpoint, methods=[str('GET'), str('POST')])


    try:
        config = startup.read_settings(
            filename=os.environ.get('TUID_CONFIG')
        )
        constants.set(config.constants)
        Log.start(config.debug)

        service = TUIDService(config.tuid)

        # Log memory info while running
        initial_growth = {}
        objgraph.growth(peak_stats={})
        objgraph.growth(peak_stats=initial_growth)
        service.statsdaemon.initial_growth = initial_growth

        Log.note("Started TUID Service")
        Log.note("Current free memory: {{mem}} Mb", mem=service.statsdaemon.get_free_memory())
    except BaseException as e:  # MUST CATCH BaseException BECAUSE argparse LIKES TO EXIT THAT WAY, AND gunicorn WILL NOT REPORT
Example No. 27
    def __enter__(self):
        self.config = config = startup.read_settings()
        from mo_logs import constants
        constants.set(config.constants)
        Log.start(config.debug)
        return config
Example No. 28
def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        # SHUNT PYTHON LOGGING TO MAIN LOGGING
        capture_logging()
        # SHUNT ADR LOGGING TO MAIN LOGGING
        # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.add
        capture_loguru()

        if config.taskcluster:
            inject_secrets(config)

        @extend(Configuration)
        def update(self, config):
            """
            Update the configuration object with new parameters
            :param config: dict of configuration
            """
            for k, v in config.items():
                if v != None:
                    self._config[k] = v

            self._config["sources"] = sorted(
                map(os.path.expanduser, set(self._config["sources"])))

            # Use the NullStore by default. This allows us to control whether
            # caching is enabled or not at runtime.
            self._config["cache"].setdefault("stores",
                                             {"null": {
                                                 "driver": "null"
                                             }})
            object.__setattr__(self, "cache", CustomCacheManager(self._config))
            for _, store in self._config["cache"]["stores"].items():
                if store.path and not store.path.endswith("/"):
                    # REQUIRED, OTHERWISE FileStore._create_cache_directory() WILL LOOK AT PARENT DIRECTORY
                    store.path = store.path + "/"

        if SHOW_S3_CACHE_HIT:
            s3_get = S3Store._get

            @extend(S3Store)
            def _get(self, key):
                with Timer("get {{key}} from S3", {"key": key},
                           verbose=False) as timer:
                    output = s3_get(self, key)
                    if output is not None:
                        timer.verbose = True
                    return output

        # UPDATE ADR CONFIGURATION
        with Repeat("waiting for ADR", every="10second"):
            adr.config.update(config.adr)
            # DUMMY TO TRIGGER CACHE
            make_push_objects(from_date=Date.today().format(),
                              to_date=Date.now().format(),
                              branch="autoland")

        outatime = Till(seconds=Duration(MAX_RUNTIME).total_seconds())
        outatime.then(lambda: Log.alert("Out of time, exit early"))
        Schedulers(config).process(outatime)
    except Exception as e:
        Log.warning("Problem with etl! Shutting down.", cause=e)
    finally:
        Log.stop()
Example No. 29
    def run(self, force=False, restart=False, merge=False):
        # SETUP LOGGING
        settings = startup.read_settings(filename=CONFIG_FILE)
        constants.set(settings.constants)
        Log.start(settings.debug)

        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name,
            kwargs=settings.destination).get_or_create_table(
                settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis()
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = 916850000
                redis.set(settings.extractor.key,
                          value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            perf_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note("Extracting perfs for perf.id={{perf_id}}",
                         perf_id=perf_id)

                # get_ids = sql_query(
                #     {
                #         "from": "performance_datum",
                #         "select": ["id"],
                #         "where": {"gt": {"id": perf_id}},
                #         "sort": ["id"],
                #         "limit": settings.extractor.chunk_size,
                #     }
                # )
                get_ids = SQL(
                    str((PerformanceDatum.objects.filter(
                        id__gt=perf_id).values("id").order_by("id")
                         [:settings.extractor.chunk_size]).query))

                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break

                # TODO: Remove me July 2021
                # OLD PERF RECORDS HAVE NO CORRESPONDING JOB
                # ADD job.submit_time FOR PARTITIONING
                for a in acc:
                    if not a.job.submit_time:
                        a.job.submit_time = a.push_timestamp
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                perf_id = last_doc.id
                redis.set(settings.extractor.key,
                          value2json(perf_id).encode("utf8"))

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done perf extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done perf merge")
Example No. 30
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

from mo_files import File
from mo_logs import Log, startup, constants
from mo_testing.fuzzytestcase import FuzzyTestCase
from mo_times.timer import Timer

from mo_dots import set_default, wrap, Null
from mysql_to_s3.extract import Extract
from pyLibrary.sql.mysql import MySQL

settings = startup.read_settings(filename="tests/resources/config/test.json")
constants.set(settings.constants)


class TestExtract(FuzzyTestCase):
    @classmethod
    def setUpClass(cls):
        Log.start(settings.debug)
        with Timer("setup database"):
            try:
                with MySQL(schema=None, kwargs=settings.database) as db:
                    db.query("drop database testing")
            except Exception as e:
                if "Can't drop database " in e:
                    pass
                else:
Example No. 31
def config():
    config = startup.read_settings(filename=os.environ.get('TUID_CONFIG'))
    constants.set(config.constants)
    Log.start(config.debug)
    return config
Example No. 32
def add(any_flask_app):
    global cache

    cache = Cache(config.cache)
    any_flask_app.add_url_rule(str("/<path:path>"), None, relay_get, methods=[str("GET")])
    any_flask_app.add_url_rule(str("/<path:path>"), None, relay_post, methods=[str("POST")])
    any_flask_app.add_url_rule(str("/"), None, relay_get, methods=[str("GET")])
    any_flask_app.add_url_rule(str("/"), None, relay_post, methods=[str("POST")])


if __name__ in ("__main__",):
    Log.note("Starting " + APP_NAME + " Service App...")
    flask_app = RelayApp(__name__)

    try:
        config = startup.read_settings(filename=os.environ.get("HG_RELAY_CONFIG"))
        constants.set(config.constants)
        Log.start(config.debug)

        add(flask_app)
        Log.note("Started " + APP_NAME + " Service")
    except BaseException as e:  # MUST CATCH BaseException BECAUSE argparse LIKES TO EXIT THAT WAY, AND gunicorn WILL NOT REPORT
        try:
            Log.error(
                "Serious problem with " + APP_NAME + " service construction!  Shutdown!", cause=e
            )
        finally:
            Log.stop()

    if config.flask:
        if config.flask.port and config.args.process_num:
Example No. 33
            Log.note("Skipping try revision.")
            queue.commit()
            continue

        now = Date.now().unix
        if time_offset is None:
            time_offset = now - request.meta.request_time

        next_request = request.meta.request_time + time_offset
        if next_request > now:
            Log.note("Next request in {{wait_time}}", wait_time=Duration(seconds=next_request - now))
            Till(till=next_request).wait()

        Thread.run("request "+text_type(request_count), one_request, request)
        request_count += 1
        queue.commit()


if __name__ == '__main__':
    try:
        tmp_signal = Signal()
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        queue_consumer(kwargs=config, please_stop=tmp_signal)
        worker = Thread.run("sqs consumer", queue_consumer, kwargs=config)
        MAIN_THREAD.wait_for_shutdown_signal(allow_exit=True, please_stop=worker.stopped)
    except BaseException as e:
        Log.error("Serious problem with consumer construction! Shutdown!", cause=e)
Example No. 34
def setup():
    global config

    config = startup.read_settings(
        default_filename=os.environ.get('ACTIVEDATA_CONFIG'),
        defs=[{
            "name": ["--process_num", "--process"],
            "help": "Additional port offset (for multiple Flask processes)",
            "type": int,
            "dest": "process_num",
            "default": 0,
            "required": False
        }])

    constants.set(config.constants)
    Log.start(config.debug)

    agg_bulk.S3_CONFIG = config.bulk.s3

    File.new_instance("activedata.pid").write(text(machine_metadata.pid))

    # PIPE REQUEST LOGS TO ES DEBUG
    if config.request_logs:
        cluster = elasticsearch.Cluster(config.request_logs)
        request_logger = cluster.get_or_create_index(config.request_logs)
        active_data.request_log_queue = request_logger.threaded_queue(
            max_size=2000, period=1)

    if config.dockerflow:

        def backend_check():
            http.get_json(config.elasticsearch.host + ":" +
                          text(config.elasticsearch.port))

        dockerflow(flask_app, backend_check)
    else:
        # IF NOT USING DOCKERFLOW, THEN RESPOND WITH A SIMPLER __version__
        add_version(flask_app)

    # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
    container.config.default = {
        "type": "elasticsearch",
        "settings": config.elasticsearch.copy()
    }

    # TRIGGER FIRST INSTANCE
    if config.saved_queries:
        setattr(save_query, "query_finder", SaveQueries(config.saved_queries))

    # STARTUP QUERY STATS
    QueryStats(elasticsearch.Cluster(config.elasticsearch))

    if config.flask.port and config.args.process_num:
        config.flask.port += config.args.process_num

    # TURN ON /exit FOR WINDOWS DEBUGGING
    if config.flask.debug or config.flask.allow_exit:
        config.flask.allow_exit = None
        Log.warning("ActiveData is in debug mode")
        flask_app.add_url_rule('/exit', 'exit', _exit)

    if config.flask.ssl_context:
        if config.args.process_num:
            Log.error("can not serve ssl and multiple Flask instances at once")
        setup_flask_ssl()

    # ENSURE MAIN THREAD SHUTDOWN TRIGGERS Flask SHUTDOWN
    MAIN_THREAD.stopped.then(exit)
Example No. 35
        now = Date.now().unix
        if time_offset is None:
            time_offset = now - request.meta.request_time

        next_request = request.meta.request_time + time_offset
        if next_request > now:
            Log.note("Next request in {{wait_time}}",
                     wait_time=Duration(seconds=next_request - now))
            Till(till=next_request).wait()

        Thread.run("request " + text_type(request_count), one_request, request)
        request_count += 1
        queue.commit()


if __name__ == '__main__':
    try:
        tmp_signal = Signal()
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        queue_consumer(kwargs=config, please_stop=tmp_signal)
        worker = Thread.run("sqs consumer", queue_consumer, kwargs=config)
        MAIN_THREAD.wait_for_shutdown_signal(allow_exit=True,
                                             please_stop=worker.stopped)
    except BaseException as e:
        Log.error("Serious problem with consumer construction! Shutdown!",
                  cause=e)
Example No. 36
    global cache

    cache = Cache(config.cache)
    any_flask_app.add_url_rule(str('/<path:path>'), None, relay_get, methods=[str('GET')])
    any_flask_app.add_url_rule(str('/<path:path>'), None, relay_post, methods=[str('POST')])
    any_flask_app.add_url_rule(str('/'), None, relay_get, methods=[str('GET')])
    any_flask_app.add_url_rule(str('/'), None, relay_post, methods=[str('POST')])


if __name__ in ("__main__",):
    Log.note("Starting " + APP_NAME + " Service App...")
    flask_app = RelayApp(__name__)

    try:
        config = startup.read_settings(
            filename=os.environ.get('HG_RELAY_CONFIG')
        )
        constants.set(config.constants)
        Log.start(config.debug)

        add(flask_app)
        Log.note("Started " + APP_NAME + " Service")
    except BaseException as e:  # MUST CATCH BaseException BECAUSE argparse LIKES TO EXIT THAT WAY, AND gunicorn WILL NOT REPORT
        try:
            Log.error("Serious problem with " + APP_NAME + " service construction!  Shutdown!", cause=e)
        finally:
            Log.stop()

    if config.flask:
        if config.flask.port and config.args.process_num:
            config.flask.port += config.args.process_num
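Taken together, nearly every example above repeats the same startup skeleton: read the configuration with startup.read_settings, apply constants.set(settings.constants), start logging with Log.start(settings.debug), do the real work inside try/except, and shut logging down in finally. Below is a minimal, self-contained sketch of that recurring pattern; the config filename and the warning text are placeholders, not taken from any specific project above:

from mo_logs import Log, startup, constants


def main():
    try:
        # "config.json" is a placeholder filename for this sketch
        settings = startup.read_settings(filename="config.json")
        constants.set(settings.constants)  # apply constant overrides from the config
        Log.start(settings.debug)          # configure logging from the debug section
        # ... application work goes here ...
    except Exception as e:
        Log.warning("Problem with startup", cause=e)
    finally:
        Log.stop()


if __name__ == "__main__":
    main()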