def get_and_configure_logger(name: str) -> logging.Logger:
    """Build a daiquiri logger called `name` with the project's handlers attached.

    The logger's level is taken from the module-level LOG_LEVEL, which is
    filled in on first use from get_env_variable_gracefully("LOG_LEVEL", "INFO").
    A colourised stdout handler is always attached. A Sentry handler, limited
    to WARNING and above, is attached only when a DSN lookup returns a truthy
    value.

    Args:
        name: Logger name. Names containing "data_refinery_api" look up
            "RAVEN_DSN_API"; all other names look up "RAVEN_DSN".

    Returns:
        The object returned by daiquiri.getLogger(name). Handlers are attached
        to its underlying `.logger` attribute.
    """
    global LOG_LEVEL

    unconfigure_root_logger()

    # Resolve the level lazily and cache it at module level for later calls.
    if LOG_LEVEL is None:
        LOG_LEVEL = get_env_variable_gracefully("LOG_LEVEL", "INFO")

    configured = daiquiri.getLogger(name)
    configured.setLevel(logging.getLevelName(LOG_LEVEL))

    # Local handler: everything goes to stdout.
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(
        daiquiri.formatter.ColorExtrasFormatter(fmt=FORMAT_STRING, keywords=[])
    )
    configured.logger.addHandler(stdout_handler)

    # Sentry handler: the API uses its own DSN setting.
    dsn_setting = "RAVEN_DSN_API" if "data_refinery_api" in name else "RAVEN_DSN"
    sentry_dsn = get_env_variable_gracefully(dsn_setting, False)
    if not sentry_dsn:
        return configured

    # Imported here so the handler class is only loaded when a DSN is set.
    from raven.contrib.django.handlers import SentryHandler

    sentry_handler = SentryHandler()
    sentry_handler.setFormatter(
        daiquiri.formatter.ColorExtrasFormatter(fmt=FORMAT_STRING, keywords=[])
    )
    sentry_handler.setLevel(logging.WARNING)
    configured.logger.addHandler(sentry_handler)

    return configured
def process_response(self, request, response):
    """Add the source-revision header to `response` and expose it via CORS.

    The header value is whatever get_env_variable_gracefully returns for
    "SYSTEM_VERSION". `request` is not used.

    Returns:
        The same `response` object, with both headers set.
    """
    revision_header = "X-Source-Revision"
    response[revision_header] = get_env_variable_gracefully("SYSTEM_VERSION")

    # Allow browsers to use the API directly and access the version:
    # https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS#Access-Control-Expose-Headers
    response["Access-Control-Expose-Headers"] = revision_header

    return response
def handle_survey_jobs(jobs: List[SurveyJob]) -> bool:
    """For each job in jobs, either retry it or log it.

    Jobs whose `num_retries` is below MAX_NUM_RETRIES are requeued with
    requeue_survey_job; all others are passed to handle_repeated_failure.
    Handling stops as soon as the total-jobs ceiling is reached.

    Args:
        jobs: The survey jobs to handle, processed in list order.

    Returns:
        True if every job in `jobs` was handled. False if handling never
        started or stopped early because the job count reported by Nomad
        (plus the jobs dispatched by this call) reached the ceiling.
    """
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)

    # Maximum number of total jobs running at a time.
    # We do this now rather than import time for testing purposes.
    max_total_jobs = int(get_env_variable_gracefully("MAX_TOTAL_JOBS", DEFAULT_MAX_JOBS))

    len_all_jobs = len(nomad_client.jobs.get_jobs())
    if len_all_jobs >= max_total_jobs:
        logger.info("Not requeuing job until we're running fewer jobs.")
        return False

    jobs_dispatched = 0
    for count, job in enumerate(jobs):
        if job.num_retries < MAX_NUM_RETRIES:
            requeue_survey_job(job)
            jobs_dispatched += 1
        else:
            handle_repeated_failure(job)

        # Only re-query Nomad on every 100th job (including the first);
        # in between, the last known count is reused.
        if count % 100 == 0:
            len_all_jobs = len(nomad_client.jobs.get_jobs())

        if (jobs_dispatched + len_all_jobs) >= max_total_jobs:
            logger.info(
                "We hit the maximum total jobs ceiling, so we're not handling any more survey jobs now."
            )
            return False

    return True
def handle_processor_jobs(jobs: List[ProcessorJob]) -> bool:
    """For each job in jobs, either retry it or log it.

    Jobs whose `num_retries` is below MAX_NUM_RETRIES are requeued with
    requeue_processor_job; all others are passed to handle_repeated_failure.
    Handling stops as soon as the total-jobs ceiling is reached.

    Args:
        jobs: The processor jobs to handle, processed in list order.

    Returns:
        True if every job in `jobs` was handled. False if handling never
        started or stopped early because the job count reported by Nomad
        (plus the jobs dispatched by this call) reached the ceiling.
    """
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)

    # Maximum number of total jobs running at a time.
    # We do this now rather than import time for testing purposes.
    max_total_jobs = int(get_env_variable_gracefully("MAX_TOTAL_JOBS", DEFAULT_MAX_JOBS))

    len_all_jobs = len(nomad_client.jobs.get_jobs())
    if len_all_jobs >= max_total_jobs:
        logger.info("Not requeuing job until we're running fewer jobs.")
        return False

    # We want zebrafish data first, then hgu133plus2, then data
    # related to pediatric cancer, then to finish salmon experiments
    # that are close to completion.
    # Each function moves the jobs it prioritizes to the front of the
    # list, so apply them in backwards order.
    # NOTE: the prioritization calls below are currently disabled, so jobs
    # are handled in the order they were passed in.
    # jobs = prioritize_salmon_jobs(jobs)
    # jobs = prioritize_jobs_by_accession(jobs, PEDIATRIC_ACCESSION_LIST)
    # jobs = prioritize_jobs_by_accession(jobs, HGU133PLUS2_ACCESSION_LIST)
    # jobs = prioritize_zebrafish_jobs(jobs)

    jobs_dispatched = 0
    for count, job in enumerate(jobs):
        if job.num_retries < MAX_NUM_RETRIES:
            requeue_processor_job(job)
            jobs_dispatched += 1
        else:
            handle_repeated_failure(job)

        # Only re-query Nomad on every 100th job (including the first);
        # in between, the last known count is reused.
        if count % 100 == 0:
            len_all_jobs = len(nomad_client.jobs.get_jobs())

        if (jobs_dispatched + len_all_jobs) >= max_total_jobs:
            logger.info(
                "We hit the maximum total jobs ceiling, so we're not handling any more processor jobs now."
            )
            return False

    return True
Organism, OrganismIndex, OriginalFile, Pipeline, Processor, ProcessorJob, ) from data_refinery_common.utils import get_env_variable, get_env_variable_gracefully from data_refinery_workers.processors import utils logger = get_and_configure_logger(__name__) JOB_DIR_PREFIX = "processor_job_" GENE_TO_TRANSCRIPT_TEMPLATE = "{gene_id}\t{transcript_id}\n" GENE_TYPE_COLUMN = 2 S3_TRANSCRIPTOME_INDEX_BUCKET_NAME = get_env_variable_gracefully("S3_TRANSCRIPTOME_INDEX_BUCKET_NAME", False) LOCAL_ROOT_DIR = get_env_variable("LOCAL_ROOT_DIR", "/home/user/data_store") # Removes each occurrance of ; and " IDS_CLEANUP_TABLE = str.maketrans({";": None, "\"": None}) def _compute_paths(job_context: Dict) -> str: """Computes the paths for all the directories used/created by this processor. Also computes a couple other path-based properties and adds them to the job_context. """ # All files for the job are in the same directory. first_file_path = job_context["original_files"][0].absolute_file_path job_context["base_file_path"] = '/'.join(first_file_path.split('/')[:-1]) job_context["work_dir"] = job_context["base_file_path"] + '/' + job_context["length"].upper() + '/' + \ JOB_DIR_PREFIX + str(job_context["job_id"]) + "/"
# https://docs.djangoproject.com/en/2.2/topics/cache/ CACHES = { "default": { "BACKEND": "django.core.cache.backends.db.DatabaseCache", "LOCATION": "cache_table", } } # Setting the RAVEN_CONFIG when RAVEN_DSN isn't set will cause the # following warning: # /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91: # UserWarning: Transport selection via DSN is deprecated. You should # explicitly pass the transport class to Client() instead. raven_dsn = get_env_variable_gracefully("RAVEN_DSN", False) if raven_dsn: RAVEN_CONFIG = {"dsn": raven_dsn} else: # Preven raven from logging about how it's not configured... import logging raven_logger = logging.getLogger("raven.contrib.django.client.DjangoClient") raven_logger.setLevel(logging.CRITICAL) RUNNING_IN_CLOUD = get_env_variable("RUNNING_IN_CLOUD") == "True" ELASTICSEARCH_DSL = { "default": { "hosts": get_env_variable("ELASTICSEARCH_HOST") + ":"
"rest_framework.pagination.LimitOffsetPagination", "PAGE_SIZE": 25, "DEFAULT_VERSIONING_CLASS": "rest_framework.versioning.URLPathVersioning", } SWAGGER_SETTINGS = { "USE_SESSION_AUTH": False, "SECURITY_DEFINITIONS": {}, } # Setting the RAVEN_CONFIG when RAVEN_DSN isn't set will cause the # following warning: # /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91: # UserWarning: Transport selection via DSN is deprecated. You should # explicitly pass the transport class to Client() instead. raven_dsn = get_env_variable_gracefully("RAVEN_DSN_API", False) if raven_dsn != "not set": RAVEN_CONFIG = { "dsn": raven_dsn, # Only send 5% of errors for the API, since we aren't going to # be interested in any single one. "sampleRate": 0.25, } else: # Preven raven from logging about how it's not configured... import logging raven_logger = logging.getLogger( "raven.contrib.django.client.DjangoClient") raven_logger.setLevel(logging.CRITICAL)
"DEFAULT_VERSIONING_CLASS": "rest_framework.versioning.URLPathVersioning", "EXCEPTION_HANDLER": "data_refinery_api.exceptions.custom_exception_handler", } SWAGGER_SETTINGS = { "USE_SESSION_AUTH": False, "SECURITY_DEFINITIONS": {}, } # Setting the RAVEN_CONFIG when RAVEN_DSN isn't set will cause the # following warning: # /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91: # UserWarning: Transport selection via DSN is deprecated. You should # explicitly pass the transport class to Client() instead. raven_dsn = get_env_variable_gracefully("RAVEN_DSN_API", False) if raven_dsn != "not set": RAVEN_CONFIG = { "dsn": raven_dsn, # Only send 5% of errors for the API, since we aren't going to # be interested in any single one. "sampleRate": 0.25, } else: # Preven raven from logging about how it's not configured... import logging raven_logger = logging.getLogger( "raven.contrib.django.client.DjangoClient") raven_logger.setLevel(logging.CRITICAL)
def process_response(self, request, response):
    """Add the source-revision header to `response`.

    The header value is whatever get_env_variable_gracefully returns for
    "SYSTEM_VERSION". `request` is not used.

    Returns:
        The same `response` object, with the header set.
    """
    source_revision = get_env_variable_gracefully("SYSTEM_VERSION")
    response['X-Source-Revision'] = source_revision
    return response
USE_L10N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.10/howto/static-files/

STATIC_URL = "/static/"

# Sentry (raven) reporting is optional: RAVEN_CONFIG is only defined when a
# DSN is available.
# Setting the RAVEN_CONFIG when RAVEN_DSN isn't set will cause the
# following warning:
# /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91:
# UserWarning: Transport selection via DSN is deprecated. You should
# explicitly pass the transport class to Client() instead.
RAVEN_DSN = get_env_variable_gracefully("RAVEN_DSN", None)

# AWS Secrets manager will not let us store an empty string, so the
# literal string "None" is treated the same as an unset DSN.
if RAVEN_DSN == "None":
    RAVEN_DSN = None

if RAVEN_DSN:
    RAVEN_CONFIG = {"dsn": RAVEN_DSN}
else:
    # Prevent raven from logging about how it's not configured by only
    # letting CRITICAL records through its Django client logger.
    import logging

    raven_logger = logging.getLogger(
        "raven.contrib.django.client.DjangoClient")
    raven_logger.setLevel(logging.CRITICAL)
USE_L10N = True USE_TZ = True # Static files (CSS, JavaScript, Images) # https://docs.djangoproject.com/en/1.10/howto/static-files/ STATIC_URL = '/static/' # Setting the RAVEN_CONFIG when RAVEN_DSN isn't set will cause the # following warning: # /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91: # UserWarning: Transport selection via DSN is deprecated. You should # explicitly pass the transport class to Client() instead. raven_dsn = get_env_variable_gracefully('RAVEN_DSN', False) if raven_dsn: RAVEN_CONFIG = {'dsn': raven_dsn} else: # Preven raven from logging about how it's not configured... import logging raven_logger = logging.getLogger( 'raven.contrib.django.client.DjangoClient') raven_logger.setLevel(logging.CRITICAL) RUNNING_IN_CLOUD = get_env_variable('RUNNING_IN_CLOUD') == "True" ELASTICSEARCH_DSL = { 'default': { 'hosts': get_env_variable('ELASTICSEARCH_HOST') + ":" +
USE_L10N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.10/howto/static-files/

STATIC_URL = "/static/"

# Sentry (raven) reporting is optional: RAVEN_CONFIG is only defined when a
# DSN is available.
# Setting the RAVEN_CONFIG when RAVEN_DSN isn't set will cause the
# following warning:
# /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91:
# UserWarning: Transport selection via DSN is deprecated. You should
# explicitly pass the transport class to Client() instead.
raven_dsn = get_env_variable_gracefully("RAVEN_DSN", False)
if raven_dsn:
    RAVEN_CONFIG = {"dsn": raven_dsn}
else:
    # Prevent raven from logging about how it's not configured by only
    # letting CRITICAL records through its Django client logger.
    import logging

    raven_logger = logging.getLogger(
        "raven.contrib.django.client.DjangoClient")
    raven_logger.setLevel(logging.CRITICAL)

# True only when the variable is exactly the string "True".
RUNNING_IN_CLOUD = get_env_variable("RUNNING_IN_CLOUD") == "True"

# EngagementBot
# No explicit default is passed here, so the value when the variable is
# missing is whatever get_env_variable_gracefully falls back to.
ENGAGEMENTBOT_WEBHOOK = get_env_variable_gracefully("ENGAGEMENTBOT_WEBHOOK")
"rest_framework.throttling.UserRateThrottle", ], "DEFAULT_THROTTLE_RATES": {"anon": "10/second", "user": "******"}, } SWAGGER_SETTINGS = { "USE_SESSION_AUTH": False, "SECURITY_DEFINITIONS": {}, } # Setting the RAVEN_CONFIG when RAVEN_DSN isn't set will cause the # following warning: # /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91: # UserWarning: Transport selection via DSN is deprecated. You should # explicitly pass the transport class to Client() instead. RAVEN_DSN = get_env_variable_gracefully("RAVEN_DSN_API", None) # AWS Secrets manager will not let us store an empty string. if RAVEN_DSN == "None": RAVEN_DSN = None if RAVEN_DSN != "not set": RAVEN_CONFIG = { "dsn": RAVEN_DSN, # Only send 5% of errors for the API, since we aren't going to # be interested in any single one. "sampleRate": 0.25, } else: # Preven raven from logging about how it's not configured... import logging
# https://docs.djangoproject.com/en/2.2/topics/cache/ CACHES = { "default": { "BACKEND": "django.core.cache.backends.db.DatabaseCache", "LOCATION": "cache_table", } } # Setting the RAVEN_CONFIG when RAVEN_DSN isn't set will cause the # following warning: # /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91: # UserWarning: Transport selection via DSN is deprecated. You should # explicitly pass the transport class to Client() instead. RAVEN_DSN = get_env_variable_gracefully("RAVEN_DSN", None) # AWS Secrets manager will not let us store an empty string. if RAVEN_DSN == "None": RAVEN_DSN = None if RAVEN_DSN: RAVEN_CONFIG = {"dsn": RAVEN_DSN} else: # Preven raven from logging about how it's not configured... import logging raven_logger = logging.getLogger("raven.contrib.django.client.DjangoClient") raven_logger.setLevel(logging.CRITICAL) RUNNING_IN_CLOUD = get_env_variable("RUNNING_IN_CLOUD") == "True"