Example #1
    def __enter__(self):
        self.config = config = startup_read_settings()
        from mo_logs import constants

        constants.set(config.constants)
        Log.start(config.debug)
        return config
Example #2
def main():
    try:
        settings = startup.read_settings()
        with startup.SingleInstance(settings.args.filename):
            constants.set(settings.constants)
            Log.start(settings.debug)

            extractor = Extract(settings)

            def extract(please_stop):
                with MySQL(**settings.snowflake.database) as db:
                    with db.transaction():
                        for kwargs in extractor.queue:
                            if please_stop:
                                break
                            try:
                                extractor.extract(db=db,
                                                  please_stop=please_stop,
                                                  **kwargs)
                            except Exception as e:
                                Log.warning("Could not extract", cause=e)
                                extractor.queue.add(kwargs)

            for i in range(settings.extract.threads):
                Thread.run("extract #" + text_type(i), extract)

            please_stop = Signal()
            Thread.wait_for_shutdown_signal(please_stop=please_stop,
                                            allow_exit=True,
                                            wait_forever=False)
    except Exception as e:
        Log.warning("Problem with data extraction", e)
    finally:
        Log.stop()
Example #3
def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)
        with SingleInstance(flavor_id=settings.args.filename):
            settings.run_interval = Duration(settings.run_interval)
            for u in settings.utility:
                u.discount = coalesce(u.discount, 0)
                # MARKUP drives WITH EXPECTED device MAPPING
                num_ephemeral_volumes = ephemeral_storage[
                    u.instance_type]["num"]
                for i, d in enumerate(d for d in u.drives if not d.device):
                    letter = convert.ascii2char(98 + num_ephemeral_volumes + i)
                    d.device = "/dev/xvd" + letter

            settings.utility = UniqueIndex(["instance_type"],
                                           data=settings.utility)
            instance_manager = new_instance(settings.instance)
            m = SpotManager(instance_manager, kwargs=settings)

            if ENABLE_SIDE_EFFECTS:
                m.update_spot_requests()

            if m.watcher:
                m.watcher.join()
    except Exception as e:
        Log.warning("Problem with spot manager", cause=e)
    finally:
        Log.stop()
        MAIN_THREAD.stop()
Example #4
    def setUpClass(cls):
        try:
            cls.config = startup.read_settings(filename="tests/config.json")
            constants.set(cls.config.constants)
            Log.start(cls.config.debug)
        except Exception as e:
            Log.error("Problem with etl", e)
Example #5
def main():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        with SingleInstance(flavor_id=settings.args.filename):
            constants.set(settings.constants)
            settings.run_interval = Duration(settings.run_interval)
            for u in settings.utility:
                u.discount = coalesce(u.discount, 0)
                # MARKUP drives WITH EXPECTED device MAPPING
                num_ephemeral_volumes = ephemeral_storage[u.instance_type]["num"]
                for i, d in enumerate(d for d in u.drives if not d.device):
                    letter = convert.ascii2char(98 + num_ephemeral_volumes + i)
                    d.device = "/dev/xvd" + letter

            settings.utility = UniqueIndex(["instance_type"], data=settings.utility)
            instance_manager = new_instance(settings.instance)
            m = SpotManager(instance_manager, kwargs=settings)

            if ENABLE_SIDE_EFFECTS:
                m.update_spot_requests(instance_manager.required_utility())

            if m.watcher:
                m.watcher.join()
    except Exception as e:
        Log.warning("Problem with spot manager", cause=e)
    finally:
        Log.stop()
        MAIN_THREAD.stop()
Example #6
def main():
    try:
        settings = startup.read_settings(
            defs=[{
                "name": ["--all", "-a"],
                "action": 'store_true',
                "help": 'process all mo-* subdirectories',
                "dest": "all",
                "required": False
            }, {
                "name": ["--dir", "--directory", "-d"],
                "help": 'directory to deploy',
                "type": str,
                "dest": "directory",
                "required": True,
                "default": "."
            }])
        constants.set(settings.constants)
        Log.start(settings.debug)

        if settings.args.all:
            deploy_all(File(settings.args.directory), settings.prefix,
                       settings)
        else:
            Deploy(File(settings.args.directory), kwargs=settings).deploy()
    except Exception as e:
        Log.warning("Problem with etl", cause=e)
Example #7
def extract_alert_settings(env_setup):
    settings = startup.read_settings(filename=extract_alerts.CONFIG_FILE,
                                     complain=False)
    settings.source.database.ssl = None  # NOT REQUIRED FOR TEST DATABASE
    constants.set(settings.constants)
    Log.start(settings.debug)
    return settings
Example #8
    def start(cls, settings=None):
        """
        RUN ME FIRST TO SETUP THE THREADED LOGGING
        http://victorlin.me/2012/08/good-logging-practice-in-python/

        log       - LIST OF PARAMETERS FOR LOGGER(S)
        trace     - SHOW MORE DETAILS IN EVERY LOG LINE (default False)
        cprofile  - True==ENABLE THE C-PROFILER THAT COMES WITH PYTHON (default False)
                    USE THE LONG FORM TO SET THE FILENAME {"enabled": True, "filename": "cprofile.tab"}
        profile   - True==ENABLE pyLibrary SIMPLE PROFILING (default False) (eg with Profiler("some description"):)
                    USE THE LONG FORM TO SET FILENAME {"enabled": True, "filename": "profile.tab"}
        constants - UPDATE MODULE CONSTANTS AT STARTUP (PRIMARILY INTENDED TO CHANGE DEBUG STATE)
        """
        global _Thread
        if not settings:
            return
        settings = wrap(settings)

        Log.stop()

        cls.settings = settings
        cls.trace = coalesce(settings.trace, False)
        if cls.trace:
            from mo_threads import Thread as _Thread
            _ = _Thread

        # ENABLE CPROFILE
        if settings.cprofile is False:
            settings.cprofile = {"enabled": False}
        elif settings.cprofile is True:
            if isinstance(settings.cprofile, bool):
                settings.cprofile = {"enabled": True, "filename": "cprofile.tab"}
        if settings.cprofile.enabled:
            from mo_threads import profiles
            profiles.enable_profilers(settings.cprofile.filename)

        if settings.profile is True or (isinstance(settings.profile, Mapping) and settings.profile.enabled):
            Log.error("REMOVED 2018-09-02, Activedata revision 3f30ff46f5971776f8ba18")
            # from mo_logs import profiles
            #
            # if isinstance(settings.profile, bool):
            #     profiles.ON = True
            #     settings.profile = {"enabled": True, "filename": "profile.tab"}
            #
            # if settings.profile.enabled:
            #     profiles.ON = True

        if settings.constants:
            constants.set(settings.constants)

        if settings.log:
            cls.logging_multi = StructuredLogger_usingMulti()
            for log in listwrap(settings.log):
                Log.add_log(Log.new_instance(log))

            from mo_logs.log_usingThread import StructuredLogger_usingThread
            cls.main_log = StructuredLogger_usingThread(cls.logging_multi)
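
For orientation, here is a minimal sketch of the kind of settings dict this start() method consumes, assembled from the parameters listed in its docstring. The nested constants path mirrors Example #25 below; the "log_type": "console" logger entry is an assumption about mo_logs logger configuration keys, not something shown in these examples.

# Hypothetical settings for Log.start(); field names follow the docstring above.
debug_settings = {
    "trace": True,                                               # show extra detail on every log line
    "cprofile": False,                                           # or {"enabled": True, "filename": "cprofile.tab"}
    "constants": {"mo_logs": {"constants": {"DEBUG": False}}},   # module constants to override at startup
    "log": [{"log_type": "console"}],                            # list of logger configurations (key name assumed)
}
Log.start(debug_settings)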
Example #9
def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)
        please_stop = Signal("main stop signal")
        Thread.wait_for_shutdown_signal(please_stop)
    except Exception as e:
        Log.error("Problem with etl", cause=e)
Example #10
def setup(settings=None):
    global config

    try:
        config = startup.read_settings(defs={
            "name": ["--process_num", "--process"],
            "help": "Additional port offset (for multiple Flask processes",
            "type": int,
            "dest": "process_num",
            "default": 0,
            "required": False
        },
                                       filename=settings)
        constants.set(config.constants)
        Log.start(config.debug)

        if config.args.process_num and config.flask.port:
            config.flask.port += config.args.process_num

        # PIPE REQUEST LOGS TO ES DEBUG
        if config.request_logs:
            request_logger = elasticsearch.Cluster(
                config.request_logs).get_or_create_index(config.request_logs)
            active_data.request_log_queue = request_logger.threaded_queue(
                max_size=2000)

        # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
        containers.config.default = {
            "type": "elasticsearch",
            "settings": config.elasticsearch.copy()
        }

        # TURN ON /exit FOR WINDOWS DEBUGGING
        if config.flask.debug or config.flask.allow_exit:
            config.flask.allow_exit = None
            Log.warning("ActiveData is in debug mode")
            app.add_url_rule('/exit', 'exit', _exit)

        # TRIGGER FIRST INSTANCE
        FromESMetadata(config.elasticsearch)
        if config.saved_queries:
            setattr(save_query, "query_finder",
                    SaveQueries(config.saved_queries))
        HeaderRewriterFix(app, remove_headers=['Date', 'Server'])

        if config.flask.ssl_context:
            if config.args.process_num:
                Log.error(
                    "can not serve ssl and multiple Flask instances at once")
            setup_ssl()

        return app
    except Exception as e:
        Log.error(
            "Serious problem with ActiveData service construction!  Shutdown!",
            cause=e)
Example #11
def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)
        ETL(settings).setup(settings.instance, settings.utility)
    except Exception as e:
        Log.warning("Problem with setup of ETL", cause=e)
    finally:
        Log.stop()
Example #12
def start():
    try:
        config = json2value(STDIN.readline().decode('utf8'))
        constants.set(config.constants)
        Log.start(set_default(config.debug, {"logs": [{"type": "raw"}]}))
        command_loop({"config": config})
    except Exception as e:
        Log.error("problem staring worker", cause=e)
    finally:
        Log.stop()
Example #13
def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)
        ETL(settings).setup(settings.instance, settings.utility)
    except Exception as e:
        Log.warning("Problem with setup of ETL", cause=e)
    finally:
        Log.stop()
Example #14
    def start(cls,
              trace=False,
              cprofile=False,
              constants=None,
              logs=None,
              app_name=None,
              settings=None):
        """
        RUN ME FIRST TO SETUP THE THREADED LOGGING
        https://fangpenlin.com/posts/2012/08/26/good-logging-practice-in-python/

        :param trace: SHOW MORE DETAILS IN EVERY LOG LINE (default False)
        :param cprofile: True==ENABLE THE C-PROFILER THAT COMES WITH PYTHON (default False)
                         USE THE LONG FORM TO SET THE FILENAME {"enabled": True, "filename": "cprofile.tab"}
        :param constants: UPDATE MODULE CONSTANTS AT STARTUP (PRIMARILY INTENDED TO CHANGE DEBUG STATE)
        :param logs: LIST OF PARAMETERS FOR LOGGER(S)
        :param app_name: GIVE THIS APP A NAME, AND RETURN A CONTEXT MANAGER
        :param settings: ALL THE ABOVE PARAMETERS
        :return:
        """
        global _Thread  # REQUIRED FOR trace
        if app_name:
            return LoggingContext(app_name)

        Log.stop()

        cls.settings = settings
        cls.trace = trace
        if trace:
            from mo_threads import Thread as _Thread
            _ = _Thread

        # ENABLE CPROFILE
        if cprofile is False:
            cprofile = settings.cprofile = Data(enabled=False)
        elif cprofile is True:
            cprofile = settings.cprofile = Data(enabled=True,
                                                filename="cprofile.tab")
        if is_data(cprofile) and cprofile.enabled:
            from mo_threads import profiles
            profiles.enable_profilers(settings.cprofile.filename)

        if constants:
            _constants.set(constants)

        logs = coalesce(settings.log, logs)
        if logs:
            cls.logging_multi = StructuredLogger_usingMulti()
            for log in listwrap(logs):
                Log._add_log(Log.new_instance(log))

            from mo_logs.log_usingThread import StructuredLogger_usingThread
            old_log, cls.main_log = cls.main_log, StructuredLogger_usingThread(
                cls.logging_multi)
            old_log.stop()
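
This newer variant returns a LoggingContext when app_name is given instead of configuring logging directly. A hedged usage sketch follows (the application name is illustrative); the context-manager behavior is inferred from Example #1, whose __enter__ reads the settings file, applies constants.set(), starts logging, and returns the config.

# Hypothetical usage of the app_name form; the returned LoggingContext is expected
# to read settings, call constants.set(), and start/stop logging around the block
# (see Example #1 for a matching __enter__).
with Log.start(app_name="my-etl") as config:
    Log.note("application is configured; config came from the context manager")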
Example #15
    def setUpClass(cls):
        global config, broker
        try:
            config = startup.read_settings(filename="tests/config/file.json")
            constants.set(config.constants)
            Log.start(config.debug)

            File(config.broker.backing.directory).delete()
            broker = Broker(kwargs=config.broker)
        except Exception as e:
            Log.error("could not setup for testing", cause=e)
Example #16
def main():
    settings = startup.read_settings()
    Log.start(settings.debug)
    constants.set(settings.constants)

    try:
        _synch(settings)
    except Exception as e:
        Log.error("Problem with synch", e)
    finally:
        Log.stop()
Example #17
def main():
    global config
    global hg
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        hg = HgMozillaOrg(config)
        Log.start(config.debug)

    except Exception as e:
        Log.error("Problem with etl", e)
Example #18
def start():
    try:
        line = STDIN.readline().decode("utf8")
        config = json2value(line)
        constants.set(config.constants)
        Log.start(config.debug)
        Log.set_logger(RawLogger())
        command_loop({"config": config})
    except Exception as e:
        Log.error("problem staring worker", cause=e)
    finally:
        Log.stop()
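
The worker above expects a single line of JSON configuration on its standard input. A minimal sketch of the parent side that could feed it, assuming a hypothetical worker.py script and an illustrative config payload:

import json
import subprocess

# Spawn the hypothetical worker and hand it its config as one JSON line on stdin.
proc = subprocess.Popen(["python", "worker.py"], stdin=subprocess.PIPE)
config = {"constants": {}, "debug": {"log": [{"log_type": "console"}]}}  # illustrative only
proc.stdin.write((json.dumps(config) + "\n").encode("utf8"))
proc.stdin.flush()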
Example #19
    def run(self, force=False, restart=False, start=None, merge=False):
        try:
            # SETUP LOGGING
            settings = startup.read_settings(filename=CONFIG_FILE)
            constants.set(settings.constants)
            Log.start(settings.debug)

            self.extract(settings, force, restart, start, merge)
        except Exception as e:
            Log.error("could not extract jobs", cause=e)
        finally:
            Log.stop()
Example #20
def main():
    global config
    global hg
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)
        hg = HgMozillaOrg(config)
        random = _parse_diff(
            Data(changeset={"id": "2d9d0bebb5c6"},
                 branch={"url": "https://hg.mozilla.org/mozilla-central"}))
    except Exception as e:
        Log.error("Problem with etl", e)
Example #21
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = _get_branches_from_hg(settings.hg)

        es = elasticsearch.Cluster(kwargs=settings.hg.branches).get_or_create_index(kwargs=settings.hg.branches)
        es.add_alias()
        es.extend({"id": b.name + " " + b.locale, "value": b} for b in branches)
        Log.alert("DONE!")
    except Exception as e:
        Log.error("Problem with etl", e)
    finally:
        Log.stop()
Example #22
def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        inject_secrets(config)

        with Timer("PATCH ADR: dd update() method to Configuration class"):

            def update(self, config):
                """
                Update the configuration object with new parameters
                :param config: dict of configuration
                """
                for k, v in config.items():
                    if v != None:
                        self._config[k] = v

                self._config["sources"] = sorted(
                    map(os.path.expanduser, set(self._config["sources"]))
                )

                # Use the NullStore by default. This allows us to control whether
                # caching is enabled or not at runtime.
                self._config["cache"].setdefault("stores", {"null": {"driver": "null"}})
                object.__setattr__(self, "cache", CacheManager(self._config["cache"]))
                self.cache.extend("null", lambda driver: NullStore())

            setattr(Configuration, "update", update)

        # UPDATE ADR CONFIGURATION
        adr.config.update(config.adr)

        Log.start(config.debug)

        # SHUNT ADR LOGGING TO MAIN LOGGING
        # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.add
        loguru.logger.remove()
        loguru.logger.add(
            _logging, level="DEBUG", format="{message}", filter=lambda r: True,
        )

        Schedulers(config).process()
    except Exception as e:
        Log.warning("Problem with etl! Shutting down.", cause=e)
    finally:
        Log.stop()
Example #23
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = _get_branches_from_hg(settings.hg)

        es = elasticsearch.Cluster(kwargs=settings.hg.branches).get_or_create_index(kwargs=settings.hg.branches)
        es.add_alias()
        es.extend({"id": b.name + " " + b.locale, "value": b} for b in branches)
        Log.alert("DONE!")
    except Exception as e:
        Log.error("Problem with etl", e)
    finally:
        Log.stop()
Example #24
def extract_job_settings():
    # These values are not directly accessed during testing, but the code requires that they be present.
    os.environ["NEW_RELIC_APP_NAME"] = "testing"
    os.environ["BIGQUERY_PRIVATE_KEY_ID"] = "1"
    os.environ["BIGQUERY_PRIVATE_KEY"] = "1"

    # USE THE TEST SCHEMA
    db_url = os.environ["DATABASE_URL"]
    db_url = db_url.replace(strings.between(db_url, "/", None),
                            DATABASES["default"]["TEST"]["NAME"])
    os.environ["DATABASE_URL"] = db_url

    settings = startup.read_settings(filename=extract_jobs.CONFIG_FILE,
                                     complain=False)
    settings.source.database.ssl = None  # NOT REQUIRED FOR TEST DATABASE
    constants.set(settings.constants)
    Log.start(settings.debug)
    return settings
Example #25
    def test_set_self(self):
        constants.set({"tests": {"test_constants": {"CONSTANT": False}}})
        self.assertEqual(CONSTANT, False, "expecting change")

        constants.set({"tests": {"test_constants": {"CONSTANT": True}}})
        self.assertEqual(CONSTANT, True, "expecting change")

        constants.set({"mo_logs": {"constants": {"DEBUG": 42}}})
        self.assertEqual(constants.DEBUG, 42, "expecting change")

        constants.set({"tests": {"test_constants": {"CONSTANT": "true"}}})
        self.assertEqual(CONSTANT, "true", "expecting change")
Example #26
def setup():
    global config

    config = startup.read_settings(
        filename=os.environ.get('ACTIVEDATA_CONFIG'),
        defs=[
            {
                "name": ["--process_num", "--process"],
                "help": "Additional port offset (for multiple Flask processes",
                "type": int,
                "dest": "process_num",
                "default": 0,
                "required": False
            }
        ]
    )

    constants.set(config.constants)
    Log.start(config.debug)

    # PIPE REQUEST LOGS TO ES DEBUG
    if config.request_logs:
        cluster = elasticsearch.Cluster(config.request_logs)
        request_logger = cluster.get_or_create_index(config.request_logs)
        active_data.request_log_queue = request_logger.threaded_queue(max_size=2000)

    if config.dockerflow:
        def backend_check():
            http.get_json(config.elasticsearch.host + ":" + text_type(config.elasticsearch.port))
        dockerflow(flask_app, backend_check)


    # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
    container.config.default = {
        "type": "elasticsearch",
        "settings": config.elasticsearch.copy()
    }

    # TRIGGER FIRST INSTANCE
    if config.saved_queries:
        setattr(save_query, "query_finder", SaveQueries(config.saved_queries))

    HeaderRewriterFix(flask_app, remove_headers=['Date', 'Server'])
Example #27
def main():
    try:
        config = startup.read_settings(defs=[{
            "name": ["--file"],
            "help": "file to save backup",
            "type": str,
            "dest": "file",
            "required": True
        }])
        constants.set(config.constants)
        Log.start(config.debug)

        sq = elasticsearch.Index(kwargs=config.saved_queries)
        result = sq.search({"query": {"match_all": {}}, "size": 200000})

        File(config.args.file).write("".join(
            map(convert.json2value, result.hits.hits)))

    except Exception as e:
        Log.error("Problem with etl", e)
Example #28
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        hg = HgMozillaOrg(settings)
        todo = Queue()
        todo.add("97160a734959")
        least = 100000

        while todo:
            next_ = todo.pop()
            curr = hg.get_revision(
                wrap({
                    "changeset": {
                        "id": next_
                    },
                    "branch": {
                        "name": BRANCH
                    }
                }))
            if len(curr.changeset.files) > MIN_FILES:
                diff = hg._get_json_diff_from_hg(curr)
                num_changes = sum(len(d.changes) for d in diff)
                score = num_changes / len(diff)
                if score < least:
                    least = score
                    Log.note(
                        "smallest = {{rev}}, num_lines={{num}}, num_files={{files}}",
                        rev=curr.changeset.id,
                        num=num_changes,
                        files=len(diff))
            todo.extend(listwrap(curr.parents))

    except Exception as e:
        Log.error("Problem with scna", e)
    finally:
        Log.stop()
Example #29
        now = Date.now().unix
        if time_offset is None:
            time_offset = now - request.meta.request_time

        next_request = request.meta.request_time + time_offset
        if next_request > now:
            Log.note("Next request in {{wait_time}}",
                     wait_time=Duration(seconds=next_request - now))
            Till(till=next_request).wait()

        Thread.run("request " + text_type(request_count), one_request, request)
        request_count += 1
        queue.commit()


if __name__ == '__main__':
    try:
        tmp_signal = Signal()
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        queue_consumer(kwargs=config, please_stop=tmp_signal)
        worker = Thread.run("sqs consumer", queue_consumer, kwargs=config)
        MAIN_THREAD.wait_for_shutdown_signal(allow_exit=True,
                                             please_stop=worker.stopped)
    except BaseException as e:
        Log.error("Serious problem with consumer construction! Shutdown!",
                  cause=e)
Example #30
            Log.note("Skipping try revision.")
            queue.commit()
            continue

        now = Date.now().unix
        if time_offset is None:
            time_offset = now - request.meta.request_time

        next_request = request.meta.request_time + time_offset
        if next_request > now:
            Log.note("Next request in {{wait_time}}", wait_time=Duration(seconds=next_request - now))
            Till(till=next_request).wait()

        Thread.run("request "+text_type(request_count), one_request, request)
        request_count += 1
        queue.commit()


if __name__ == '__main__':
    try:
        tmp_signal = Signal()
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        queue_consumer(kwargs=config, please_stop=tmp_signal)
        worker = Thread.run("sqs consumer", queue_consumer, kwargs=config)
        MAIN_THREAD.wait_for_shutdown_signal(allow_exit=True, please_stop=worker.stopped)
    except BaseException as e:
        Log.error("Serious problem with consumer construction! Shutdown!", cause=e)
Example #31
    def run(self, force=False, restart=False, merge=False):
        # SETUP LOGGING
        settings = startup.read_settings(filename=CONFIG_FILE)
        constants.set(settings.constants)
        Log.start(settings.debug)

        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name, kwargs=settings.destination
        ).get_or_create_table(settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis()
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key, value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, job_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                    last_modified=last_modified,
                    job_id=job_id,
                )

                # Example: job.id ==283890114
                # get_ids = ConcatSQL(
                #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
                # )
                # get_ids = sql_query(
                #     {
                #         "from": "job",
                #         "select": ["id"],
                #         "where": {
                #             "or": [
                #                 {"gt": {"last_modified": parse(last_modified)}},
                #                 {
                #                     "and": [
                #                         {"eq": {"last_modified": parse(last_modified)}},
                #                         {"gt": {"id": job_id}},
                #                     ]
                #                 },
                #             ]
                #         },
                #         "sort": ["last_modified", "id"],
                #         "limit": settings.extractor.chunk_size,
                #     }
                # )

                get_ids = SQL(str(
                    (
                        Job.objects.filter(
                            Q(last_modified__gt=parse(last_modified).datetime)
                            | (
                                Q(last_modified=parse(last_modified).datetime)
                                & Q(id__gt=job_id)
                            )
                        )
                        .annotate()
                        .values("id")
                        .order_by("last_modified", "id")[
                            : settings.extractor.chunk_size
                        ]
                    ).query
                ))

                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, job_id = last_doc.last_modified, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, job_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done job extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done job merge")
Example #32

container_types = Data(elasticsearch=ESUtils, )

try:
    # read_alternate_settings
    filename = os.environ.get("TEST_CONFIG")
    if filename:
        test_jx.global_settings = mo_json_config.get("file://" + filename)
    else:
        Log.alert(
            "No TEST_CONFIG environment variable to point to config file.  Using "
            + DEFAULT_TEST_CONFIG)
        test_jx.global_settings = mo_json_config.get("file://" +
                                                     DEFAULT_TEST_CONFIG)
    constants.set(test_jx.global_settings.constants)
    Log.start(test_jx.global_settings.debug)

    if not test_jx.global_settings.use:
        Log.error('Must have a {"use": type} set in the config file')

    test_jx.global_settings.elasticsearch.version = Cluster(
        test_jx.global_settings.elasticsearch).version
    test_jx.utils = container_types[test_jx.global_settings.use](
        test_jx.global_settings)
except Exception as e:
    Log.warning("problem", cause=e)

Log.alert("Resetting test count")
NEXT = 0
Example #33
def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        # SHUNT PYTHON LOGGING TO MAIN LOGGING
        capture_logging()
        # SHUNT ADR LOGGING TO MAIN LOGGING
        # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.add
        capture_loguru()

        if config.taskcluster:
            inject_secrets(config)

        @extend(Configuration)
        def update(self, config):
            """
            Update the configuration object with new parameters
            :param config: dict of configuration
            """
            for k, v in config.items():
                if v != None:
                    self._config[k] = v

            self._config["sources"] = sorted(
                map(os.path.expanduser, set(self._config["sources"])))

            # Use the NullStore by default. This allows us to control whether
            # caching is enabled or not at runtime.
            self._config["cache"].setdefault("stores",
                                             {"null": {
                                                 "driver": "null"
                                             }})
            object.__setattr__(self, "cache", CustomCacheManager(self._config))
            for _, store in self._config["cache"]["stores"].items():
                if store.path and not store.path.endswith("/"):
                    # REQUIRED, OTHERWISE FileStore._create_cache_directory() WILL LOOK AT PARENT DIRECTORY
                    store.path = store.path + "/"

        if SHOW_S3_CACHE_HIT:
            s3_get = S3Store._get

            @extend(S3Store)
            def _get(self, key):
                with Timer("get {{key}} from S3", {"key": key},
                           verbose=False) as timer:
                    output = s3_get(self, key)
                    if output is not None:
                        timer.verbose = True
                    return output

        # UPDATE ADR CONFIGURATION
        with Repeat("waiting for ADR", every="10second"):
            adr.config.update(config.adr)
            # DUMMY TO TRIGGER CACHE
            make_push_objects(from_date=Date.today().format(),
                              to_date=Date.now().format(),
                              branch="autoland")

        outatime = Till(seconds=Duration(MAX_RUNTIME).total_seconds())
        outatime.then(lambda: Log.alert("Out of time, exit early"))
        Schedulers(config).process(outatime)
    except Exception as e:
        Log.warning("Problem with etl! Shutting down.", cause=e)
    finally:
        Log.stop()
Example #34
def config():
    config = startup.read_settings(filename=os.environ.get('TUID_CONFIG'))
    constants.set(config.constants)
    Log.start(config.debug)
    return config