def __enter__(self):
    self.config = config = startup_read_settings()
    from mo_logs import constants
    constants.set(config.constants)
    Log.start(config.debug)
    return config

def main():
    try:
        settings = startup.read_settings()
        with startup.SingleInstance(settings.args.filename):
            constants.set(settings.constants)
            Log.start(settings.debug)

            extractor = Extract(settings)

            def extract(please_stop):
                with MySQL(**settings.snowflake.database) as db:
                    with db.transaction():
                        for kwargs in extractor.queue:
                            if please_stop:
                                break
                            try:
                                extractor.extract(db=db, please_stop=please_stop, **kwargs)
                            except Exception as e:
                                Log.warning("Could not extract", cause=e)
                                extractor.queue.add(kwargs)

            for i in range(settings.extract.threads):
                Thread.run("extract #" + text_type(i), extract)

            please_stop = Signal()
            Thread.wait_for_shutdown_signal(please_stop=please_stop, allow_exit=True, wait_forever=False)
    except Exception as e:
        Log.warning("Problem with data extraction", e)
    finally:
        Log.stop()

def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)
        with SingleInstance(flavor_id=settings.args.filename):
            settings.run_interval = Duration(settings.run_interval)
            for u in settings.utility:
                u.discount = coalesce(u.discount, 0)
                # MARKUP drives WITH EXPECTED device MAPPING
                num_ephemeral_volumes = ephemeral_storage[u.instance_type]["num"]
                for i, d in enumerate(d for d in u.drives if not d.device):
                    letter = convert.ascii2char(98 + num_ephemeral_volumes + i)
                    d.device = "/dev/xvd" + letter

            settings.utility = UniqueIndex(["instance_type"], data=settings.utility)
            instance_manager = new_instance(settings.instance)
            m = SpotManager(instance_manager, kwargs=settings)

            if ENABLE_SIDE_EFFECTS:
                m.update_spot_requests()

            if m.watcher:
                m.watcher.join()
    except Exception as e:
        Log.warning("Problem with spot manager", cause=e)
    finally:
        Log.stop()
        MAIN_THREAD.stop()

def setUpClass(cls):
    try:
        cls.config = startup.read_settings(filename="tests/config.json")
        constants.set(cls.config.constants)
        Log.start(cls.config.debug)
    except Exception as e:
        Log.error("Problem with etl", e)

def main():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        with SingleInstance(flavor_id=settings.args.filename):
            constants.set(settings.constants)
            settings.run_interval = Duration(settings.run_interval)
            for u in settings.utility:
                u.discount = coalesce(u.discount, 0)
                # MARKUP drives WITH EXPECTED device MAPPING
                num_ephemeral_volumes = ephemeral_storage[u.instance_type]["num"]
                for i, d in enumerate(d for d in u.drives if not d.device):
                    letter = convert.ascii2char(98 + num_ephemeral_volumes + i)
                    d.device = "/dev/xvd" + letter

            settings.utility = UniqueIndex(["instance_type"], data=settings.utility)
            instance_manager = new_instance(settings.instance)
            m = SpotManager(instance_manager, kwargs=settings)

            if ENABLE_SIDE_EFFECTS:
                m.update_spot_requests(instance_manager.required_utility())

            if m.watcher:
                m.watcher.join()
    except Exception as e:
        Log.warning("Problem with spot manager", cause=e)
    finally:
        Log.stop()
        MAIN_THREAD.stop()

def main():
    try:
        settings = startup.read_settings(defs=[
            {
                "name": ["--all", "-a"],
                "action": 'store_true',
                "help": 'process all mo-* subdirectories',
                "dest": "all",
                "required": False
            },
            {
                "name": ["--dir", "--directory", "-d"],
                "help": 'directory to deploy',
                "type": str,
                "dest": "directory",
                "required": True,
                "default": "."
            }
        ])
        constants.set(settings.constants)
        Log.start(settings.debug)

        if settings.args.all:
            deploy_all(File(settings.args.directory), settings.prefix, settings)
        else:
            Deploy(File(settings.args.directory), kwargs=settings).deploy()
    except Exception as e:
        Log.warning("Problem with etl", cause=e)

def extract_alert_settings(env_setup):
    settings = startup.read_settings(filename=extract_alerts.CONFIG_FILE, complain=False)
    settings.source.database.ssl = None  # NOT REQUIRED FOR TEST DATABASE
    constants.set(settings.constants)
    Log.start(settings.debug)
    return settings

def start(cls, settings=None):
    """
    RUN ME FIRST TO SETUP THE THREADED LOGGING
    http://victorlin.me/2012/08/good-logging-practice-in-python/

    log       - LIST OF PARAMETERS FOR LOGGER(S)
    trace     - SHOW MORE DETAILS IN EVERY LOG LINE (default False)
    cprofile  - True==ENABLE THE C-PROFILER THAT COMES WITH PYTHON (default False)
                USE THE LONG FORM TO SET THE FILENAME {"enabled": True, "filename": "cprofile.tab"}
    profile   - True==ENABLE pyLibrary SIMPLE PROFILING (default False) (eg with Profiler("some description"):)
                USE THE LONG FORM TO SET FILENAME {"enabled": True, "filename": "profile.tab"}
    constants - UPDATE MODULE CONSTANTS AT STARTUP (PRIMARILY INTENDED TO CHANGE DEBUG STATE)
    """
    global _Thread

    if not settings:
        return
    settings = wrap(settings)

    Log.stop()

    cls.settings = settings
    cls.trace = coalesce(settings.trace, False)
    if cls.trace:
        from mo_threads import Thread as _Thread
        _ = _Thread

    # ENABLE CPROFILE
    if settings.cprofile is False:
        settings.cprofile = {"enabled": False}
    elif settings.cprofile is True:
        if isinstance(settings.cprofile, bool):
            settings.cprofile = {"enabled": True, "filename": "cprofile.tab"}
    if settings.cprofile.enabled:
        from mo_threads import profiles
        profiles.enable_profilers(settings.cprofile.filename)

    if settings.profile is True or (isinstance(settings.profile, Mapping) and settings.profile.enabled):
        Log.error("REMOVED 2018-09-02, Activedata revision 3f30ff46f5971776f8ba18")
        # from mo_logs import profiles
        #
        # if isinstance(settings.profile, bool):
        #     profiles.ON = True
        #     settings.profile = {"enabled": True, "filename": "profile.tab"}
        #
        # if settings.profile.enabled:
        #     profiles.ON = True

    if settings.constants:
        constants.set(settings.constants)

    if settings.log:
        cls.logging_multi = StructuredLogger_usingMulti()
        for log in listwrap(settings.log):
            Log.add_log(Log.new_instance(log))

        from mo_logs.log_usingThread import StructuredLogger_usingThread
        cls.main_log = StructuredLogger_usingThread(cls.logging_multi)

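# A minimal sketch of how the settings-dict form of start() above might be invoked.
# The {"type": "raw"} logger shape and the nested constants path are borrowed from
# other snippets in this section; treat the exact keys as illustrative assumptions,
# not a definitive configuration.
example_settings = {
    "trace": True,                                              # MORE DETAIL IN EVERY LOG LINE
    "constants": {"mo_logs": {"constants": {"DEBUG": True}}},   # SAME PATH STYLE USED BY constants.set()
    "log": [{"type": "raw"}],                                   # LIST OF PARAMETERS FOR LOGGER(S)
}
Log.start(example_settings)
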
def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        please_stop = Signal("main stop signal")
        Thread.wait_for_shutdown_signal(please_stop)
    except Exception as e:
        Log.error("Problem with etl", cause=e)

def setup(settings=None):
    global config

    try:
        config = startup.read_settings(
            defs={
                "name": ["--process_num", "--process"],
                "help": "Additional port offset (for multiple Flask processes)",
                "type": int,
                "dest": "process_num",
                "default": 0,
                "required": False
            },
            filename=settings
        )
        constants.set(config.constants)
        Log.start(config.debug)

        if config.args.process_num and config.flask.port:
            config.flask.port += config.args.process_num

        # PIPE REQUEST LOGS TO ES DEBUG
        if config.request_logs:
            request_logger = elasticsearch.Cluster(config.request_logs).get_or_create_index(config.request_logs)
            active_data.request_log_queue = request_logger.threaded_queue(max_size=2000)

        # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
        containers.config.default = {
            "type": "elasticsearch",
            "settings": config.elasticsearch.copy()
        }

        # TURN ON /exit FOR WINDOWS DEBUGGING
        if config.flask.debug or config.flask.allow_exit:
            config.flask.allow_exit = None
            Log.warning("ActiveData is in debug mode")
            app.add_url_rule('/exit', 'exit', _exit)

        # TRIGGER FIRST INSTANCE
        FromESMetadata(config.elasticsearch)

        if config.saved_queries:
            setattr(save_query, "query_finder", SaveQueries(config.saved_queries))

        HeaderRewriterFix(app, remove_headers=['Date', 'Server'])

        if config.flask.ssl_context:
            if config.args.process_num:
                Log.error("can not serve ssl and multiple Flask instances at once")
            setup_ssl()

        return app
    except Exception as e:
        Log.error("Serious problem with ActiveData service construction! Shutdown!", cause=e)

def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)
        ETL(settings).setup(settings.instance, settings.utility)
    except Exception as e:
        Log.warning("Problem with setup of ETL", cause=e)
    finally:
        Log.stop()

def start():
    try:
        config = json2value(STDIN.readline().decode('utf8'))
        constants.set(config.constants)
        Log.start(set_default(config.debug, {"logs": [{"type": "raw"}]}))
        command_loop({"config": config})
    except Exception as e:
        Log.error("problem starting worker", cause=e)
    finally:
        Log.stop()

def start(cls, trace=False, cprofile=False, constants=None, logs=None, app_name=None, settings=None):
    """
    RUN ME FIRST TO SETUP THE THREADED LOGGING
    https://fangpenlin.com/posts/2012/08/26/good-logging-practice-in-python/

    :param trace: SHOW MORE DETAILS IN EVERY LOG LINE (default False)
    :param cprofile: True==ENABLE THE C-PROFILER THAT COMES WITH PYTHON (default False)
                     USE THE LONG FORM TO SET THE FILENAME {"enabled": True, "filename": "cprofile.tab"}
    :param constants: UPDATE MODULE CONSTANTS AT STARTUP (PRIMARILY INTENDED TO CHANGE DEBUG STATE)
    :param logs: LIST OF PARAMETERS FOR LOGGER(S)
    :param app_name: GIVE THIS APP A NAME, AND RETURN A CONTEXT MANAGER
    :param settings: ALL THE ABOVE PARAMETERS
    :return:
    """
    global _Thread  # REQUIRED FOR trace

    if app_name:
        return LoggingContext(app_name)

    Log.stop()

    cls.settings = settings
    cls.trace = trace
    if trace:
        from mo_threads import Thread as _Thread
        _ = _Thread

    # ENABLE CPROFILE
    if cprofile is False:
        cprofile = settings.cprofile = Data(enabled=False)
    elif cprofile is True:
        cprofile = settings.cprofile = Data(enabled=True, filename="cprofile.tab")
    if is_data(cprofile) and cprofile.enabled:
        from mo_threads import profiles
        profiles.enable_profilers(settings.cprofile.filename)

    if constants:
        _constants.set(constants)

    logs = coalesce(settings.log, logs)
    if logs:
        cls.logging_multi = StructuredLogger_usingMulti()
        for log in listwrap(logs):
            Log._add_log(Log.new_instance(log))

        from mo_logs.log_usingThread import StructuredLogger_usingThread
        old_log, cls.main_log = cls.main_log, StructuredLogger_usingThread(cls.logging_multi)
        old_log.stop()

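# A minimal sketch of the app_name form documented above: start(app_name=...) returns a
# LoggingContext (see the __enter__ near the top of this section), which reads settings,
# applies constants, and starts logging on entry. The "etl" name and the config.threads
# key are hypothetical, used only for illustration.
with Log.start(app_name="etl") as config:
    Log.note("running with {{num}} threads", num=config.threads)
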
def setUpClass(cls):
    global config, broker

    try:
        config = startup.read_settings(filename="tests/config/file.json")
        constants.set(config.constants)
        Log.start(config.debug)
        File(config.broker.backing.directory).delete()
        broker = Broker(kwargs=config.broker)
    except Exception as e:
        Log.error("could not setup for testing", cause=e)

def main():
    settings = startup.read_settings()
    Log.start(settings.debug)
    constants.set(settings.constants)
    try:
        _synch(settings)
    except Exception as e:
        Log.error("Problem with synch", e)
    finally:
        Log.stop()

def main():
    global config
    global hg

    try:
        config = startup.read_settings()
        constants.set(config.constants)
        hg = HgMozillaOrg(config)
        Log.start(config.debug)
    except Exception as e:
        Log.error("Problem with etl", e)

def start():
    try:
        line = STDIN.readline().decode("utf8")
        config = json2value(line)
        constants.set(config.constants)
        Log.start(config.debug)
        Log.set_logger(RawLogger())
        command_loop({"config": config})
    except Exception as e:
        Log.error("problem starting worker", cause=e)
    finally:
        Log.stop()

def run(self, force=False, restart=False, start=None, merge=False):
    try:
        # SETUP LOGGING
        settings = startup.read_settings(filename=CONFIG_FILE)
        constants.set(settings.constants)
        Log.start(settings.debug)
        self.extract(settings, force, restart, start, merge)
    except Exception as e:
        Log.error("could not extract jobs", cause=e)
    finally:
        Log.stop()

def main():
    global config
    global hg

    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)
        hg = HgMozillaOrg(config)
        random = _parse_diff(Data(
            changeset={"id": "2d9d0bebb5c6"},
            branch={"url": "https://hg.mozilla.org/mozilla-central"}
        ))
    except Exception as e:
        Log.error("Problem with etl", e)

def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = _get_branches_from_hg(settings.hg)

        es = elasticsearch.Cluster(kwargs=settings.hg.branches).get_or_create_index(kwargs=settings.hg.branches)
        es.add_alias()
        es.extend({"id": b.name + " " + b.locale, "value": b} for b in branches)
        Log.alert("DONE!")
    except Exception as e:
        Log.error("Problem with etl", e)
    finally:
        Log.stop()

def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        inject_secrets(config)

        with Timer("PATCH ADR: add update() method to Configuration class"):
            def update(self, config):
                """
                Update the configuration object with new parameters
                :param config: dict of configuration
                """
                for k, v in config.items():
                    if v != None:
                        self._config[k] = v

                self._config["sources"] = sorted(
                    map(os.path.expanduser, set(self._config["sources"]))
                )

                # Use the NullStore by default. This allows us to control whether
                # caching is enabled or not at runtime.
                self._config["cache"].setdefault("stores", {"null": {"driver": "null"}})
                object.__setattr__(self, "cache", CacheManager(self._config["cache"]))
                self.cache.extend("null", lambda driver: NullStore())

            setattr(Configuration, "update", update)

        # UPDATE ADR CONFIGURATION
        adr.config.update(config.adr)

        Log.start(config.debug)

        # SHUNT ADR LOGGING TO MAIN LOGGING
        # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.add
        loguru.logger.remove()
        loguru.logger.add(
            _logging,
            level="DEBUG",
            format="{message}",
            filter=lambda r: True,
        )

        Schedulers(config).process()
    except Exception as e:
        Log.warning("Problem with etl! Shutting down.", cause=e)
    finally:
        Log.stop()

def extract_job_settings():
    # These values are not directly accessed during testing, but the code requires that they be present.
    os.environ["NEW_RELIC_APP_NAME"] = "testing"
    os.environ["BIGQUERY_PRIVATE_KEY_ID"] = "1"
    os.environ["BIGQUERY_PRIVATE_KEY"] = "1"

    # USE THE TEST SCHEMA
    db_url = os.environ["DATABASE_URL"]
    db_url = db_url.replace(strings.between(db_url, "/", None), DATABASES["default"]["TEST"]["NAME"])
    os.environ["DATABASE_URL"] = db_url

    settings = startup.read_settings(filename=extract_jobs.CONFIG_FILE, complain=False)
    settings.source.database.ssl = None  # NOT REQUIRED FOR TEST DATABASE
    constants.set(settings.constants)
    Log.start(settings.debug)
    return settings

def test_set_self(self):
    constants.set({"tests": {"test_constants": {"CONSTANT": False}}})
    self.assertEqual(CONSTANT, False, "expecting change")
    constants.set({"tests": {"test_constants": {"CONSTANT": True}}})
    self.assertEqual(CONSTANT, True, "expecting change")
    constants.set({"mo_logs": {"constants": {"DEBUG": 42}}})
    self.assertEqual(constants.DEBUG, 42, "expecting change")
    constants.set({"tests": {"test_constants": {"CONSTANT": "true"}}})
    self.assertEqual(CONSTANT, "true", "expecting change")

def setup():
    global config

    config = startup.read_settings(
        filename=os.environ.get('ACTIVEDATA_CONFIG'),
        defs=[
            {
                "name": ["--process_num", "--process"],
                "help": "Additional port offset (for multiple Flask processes)",
                "type": int,
                "dest": "process_num",
                "default": 0,
                "required": False
            }
        ]
    )

    constants.set(config.constants)
    Log.start(config.debug)

    # PIPE REQUEST LOGS TO ES DEBUG
    if config.request_logs:
        cluster = elasticsearch.Cluster(config.request_logs)
        request_logger = cluster.get_or_create_index(config.request_logs)
        active_data.request_log_queue = request_logger.threaded_queue(max_size=2000)

    if config.dockerflow:
        def backend_check():
            http.get_json(config.elasticsearch.host + ":" + text_type(config.elasticsearch.port))
        dockerflow(flask_app, backend_check)

    # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
    container.config.default = {
        "type": "elasticsearch",
        "settings": config.elasticsearch.copy()
    }

    # TRIGGER FIRST INSTANCE
    if config.saved_queries:
        setattr(save_query, "query_finder", SaveQueries(config.saved_queries))

    HeaderRewriterFix(flask_app, remove_headers=['Date', 'Server'])

def main():
    try:
        config = startup.read_settings(defs=[{
            "name": ["--file"],
            "help": "file to save backup",
            "type": str,
            "dest": "file",
            "required": True
        }])
        constants.set(config.constants)
        Log.start(config.debug)

        sq = elasticsearch.Index(kwargs=config.saved_queries)
        result = sq.search({"query": {"match_all": {}}, "size": 200000})

        File(config.args.file).write("".join(map(convert.json2value, result.hits.hits)))
    except Exception as e:
        Log.error("Problem with etl", e)

def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        hg = HgMozillaOrg(settings)
        todo = Queue()
        todo.add("97160a734959")
        least = 100000

        while todo:
            next_ = todo.pop()
            curr = hg.get_revision(wrap({
                "changeset": {"id": next_},
                "branch": {"name": BRANCH}
            }))
            if len(curr.changeset.files) > MIN_FILES:
                diff = hg._get_json_diff_from_hg(curr)
                num_changes = sum(len(d.changes) for d in diff)
                score = num_changes / len(diff)
                if score < least:
                    least = score
                    Log.note(
                        "smallest = {{rev}}, num_lines={{num}}, num_files={{files}}",
                        rev=curr.changeset.id,
                        num=num_changes,
                        files=len(diff)
                    )
            todo.extend(listwrap(curr.parents))
    except Exception as e:
        Log.error("Problem with scan", e)
    finally:
        Log.stop()

        now = Date.now().unix
        if time_offset is None:
            time_offset = now - request.meta.request_time

        next_request = request.meta.request_time + time_offset
        if next_request > now:
            Log.note("Next request in {{wait_time}}", wait_time=Duration(seconds=next_request - now))
            Till(till=next_request).wait()

        Thread.run("request " + text_type(request_count), one_request, request)
        request_count += 1
        queue.commit()


if __name__ == '__main__':
    try:
        tmp_signal = Signal()
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        queue_consumer(kwargs=config, please_stop=tmp_signal)
        worker = Thread.run("sqs consumer", queue_consumer, kwargs=config)
        MAIN_THREAD.wait_for_shutdown_signal(allow_exit=True, please_stop=worker.stopped)
    except BaseException as e:
        Log.error("Serious problem with consumer construction! Shutdown!", cause=e)

Log.note("Skipping try revision.") queue.commit() continue now = Date.now().unix if time_offset is None: time_offset = now - request.meta.request_time next_request = request.meta.request_time + time_offset if next_request > now: Log.note("Next request in {{wait_time}}", wait_time=Duration(seconds=next_request - now)) Till(till=next_request).wait() Thread.run("request "+text_type(request_count), one_request, request) request_count += 1 queue.commit() if __name__ == '__main__': try: tmp_signal = Signal() config = startup.read_settings() constants.set(config.constants) Log.start(config.debug) queue_consumer(kwargs=config, please_stop=tmp_signal) worker = Thread.run("sqs consumer", queue_consumer, kwargs=config) MAIN_THREAD.wait_for_shutdown_signal(allow_exit=True, please_stop=worker.stopped) except BaseException as e: Log.error("Serious problem with consumer construction! Shutdown!", cause=e)
def run(self, force=False, restart=False, merge=False):
    # SETUP LOGGING
    settings = startup.read_settings(filename=CONFIG_FILE)
    constants.set(settings.constants)
    Log.start(settings.debug)

    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis()
        state = redis.get(settings.extractor.key)

        if restart or not state:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        last_modified, job_id = state

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                last_modified=last_modified,
                job_id=job_id,
            )

            # Example: job.id == 283890114
            # get_ids = ConcatSQL(
            #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
            # )
            # get_ids = sql_query(
            #     {
            #         "from": "job",
            #         "select": ["id"],
            #         "where": {
            #             "or": [
            #                 {"gt": {"last_modified": parse(last_modified)}},
            #                 {
            #                     "and": [
            #                         {"eq": {"last_modified": parse(last_modified)}},
            #                         {"gt": {"id": job_id}},
            #                     ]
            #                 },
            #             ]
            #         },
            #         "sort": ["last_modified", "id"],
            #         "limit": settings.extractor.chunk_size,
            #     }
            # )
            get_ids = SQL(str(
                (
                    Job.objects.filter(
                        Q(last_modified__gt=parse(last_modified).datetime)
                        | (
                            Q(last_modified=parse(last_modified).datetime)
                            & Q(id__gt=job_id)
                        )
                    )
                    .annotate()
                    .values("id")
                    .order_by("last_modified", "id")[: settings.extractor.chunk_size]
                ).query
            ))
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            last_modified, job_id = last_doc.last_modified, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, job_id)).encode("utf8"),
            )

            if len(acc) < settings.extractor.chunk_size:
                break
    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done job extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done job merge")

container_types = Data(elasticsearch=ESUtils)

try:
    # read_alternate_settings
    filename = os.environ.get("TEST_CONFIG")
    if filename:
        test_jx.global_settings = mo_json_config.get("file://" + filename)
    else:
        Log.alert("No TEST_CONFIG environment variable to point to config file. Using " + DEFAULT_TEST_CONFIG)
        test_jx.global_settings = mo_json_config.get("file://" + DEFAULT_TEST_CONFIG)
    constants.set(test_jx.global_settings.constants)
    Log.start(test_jx.global_settings.debug)

    if not test_jx.global_settings.use:
        Log.error('Must have a {"use": type} set in the config file')

    test_jx.global_settings.elasticsearch.version = Cluster(test_jx.global_settings.elasticsearch).version
    test_jx.utils = container_types[test_jx.global_settings.use](test_jx.global_settings)
except Exception as e:
    Log.warning("problem", cause=e)

Log.alert("Resetting test count")
NEXT = 0

def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        # SHUNT PYTHON LOGGING TO MAIN LOGGING
        capture_logging()
        # SHUNT ADR LOGGING TO MAIN LOGGING
        # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.add
        capture_loguru()

        if config.taskcluster:
            inject_secrets(config)

        @extend(Configuration)
        def update(self, config):
            """
            Update the configuration object with new parameters
            :param config: dict of configuration
            """
            for k, v in config.items():
                if v != None:
                    self._config[k] = v

            self._config["sources"] = sorted(
                map(os.path.expanduser, set(self._config["sources"])))

            # Use the NullStore by default. This allows us to control whether
            # caching is enabled or not at runtime.
            self._config["cache"].setdefault("stores", {"null": {"driver": "null"}})
            object.__setattr__(self, "cache", CustomCacheManager(self._config))

            for _, store in self._config["cache"]["stores"].items():
                if store.path and not store.path.endswith("/"):
                    # REQUIRED, OTHERWISE FileStore._create_cache_directory() WILL LOOK AT PARENT DIRECTORY
                    store.path = store.path + "/"

        if SHOW_S3_CACHE_HIT:
            s3_get = S3Store._get

            @extend(S3Store)
            def _get(self, key):
                with Timer("get {{key}} from S3", {"key": key}, verbose=False) as timer:
                    output = s3_get(self, key)
                    if output is not None:
                        timer.verbose = True
                    return output

        # UPDATE ADR CONFIGURATION
        with Repeat("waiting for ADR", every="10second"):
            adr.config.update(config.adr)

        # DUMMY TO TRIGGER CACHE
        make_push_objects(
            from_date=Date.today().format(),
            to_date=Date.now().format(),
            branch="autoland"
        )

        outatime = Till(seconds=Duration(MAX_RUNTIME).total_seconds())
        outatime.then(lambda: Log.alert("Out of time, exit early"))
        Schedulers(config).process(outatime)
    except Exception as e:
        Log.warning("Problem with etl! Shutting down.", cause=e)
    finally:
        Log.stop()

def config():
    config = startup.read_settings(filename=os.environ.get('TUID_CONFIG'))
    constants.set(config.constants)
    Log.start(config.debug)
    return config