示例#1
0
def get_and_configure_logger(name: str) -> logging.Logger:
    """Build a daiquiri logger for `name` with stdout and optional Sentry output.

    The level is taken from the module-level LOG_LEVEL. When that is still
    None it is first filled in from the LOG_LEVEL environment variable,
    with "INFO" passed as the fallback. Names containing "data_refinery_api"
    look up their Sentry DSN in RAVEN_DSN_API; every other name uses
    RAVEN_DSN. The Sentry handler is only attached when that lookup yields
    a truthy value.
    """
    global LOG_LEVEL

    unconfigure_root_logger()

    # Resolve the level once and keep it in the module-level global.
    if LOG_LEVEL is None:
        LOG_LEVEL = get_env_variable_gracefully("LOG_LEVEL", "INFO")

    configured = daiquiri.getLogger(name)
    configured.setLevel(logging.getLevelName(LOG_LEVEL))

    # Local handler: write to stdout.
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_formatter = daiquiri.formatter.ColorExtrasFormatter(
        fmt=FORMAT_STRING, keywords=[])
    stdout_handler.setFormatter(stdout_formatter)
    configured.logger.addHandler(stdout_handler)

    # Sentry handler: pick the DSN variable from the logger name.
    if "data_refinery_api" in name:
        dsn_variable = "RAVEN_DSN_API"
    else:
        dsn_variable = "RAVEN_DSN"
    if not get_env_variable_gracefully(dsn_variable, False):
        return configured

    from raven.contrib.django.handlers import SentryHandler

    sentry_handler = SentryHandler()
    sentry_formatter = daiquiri.formatter.ColorExtrasFormatter(
        fmt=FORMAT_STRING, keywords=[])
    sentry_handler.setFormatter(sentry_formatter)
    # Only WARNING and above go through this handler.
    sentry_handler.setLevel(logging.WARNING)
    configured.logger.addHandler(sentry_handler)

    return configured
示例#2
0
 def process_response(self, request, response):
     """Add the SYSTEM_VERSION value as X-Source-Revision and expose it via CORS."""
     revision_header = "X-Source-Revision"
     response[revision_header] = get_env_variable_gracefully("SYSTEM_VERSION")
     # Let browsers that call the API directly read the version header:
     # https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS#Access-Control-Expose-Headers
     response["Access-Control-Expose-Headers"] = revision_header
     return response
示例#3
0
def handle_survey_jobs(jobs: List[SurveyJob]) -> bool:
    """Requeue or give up on each survey job while Nomad has capacity.

    Jobs with fewer than MAX_NUM_RETRIES retries are requeued; all others
    are passed to handle_repeated_failure.

    Args:
        jobs: The survey jobs to handle, in the order they should be tried.

    Returns:
        False if handling stopped because the Nomad job count reached
        MAX_TOTAL_JOBS (before the first job or after any later one),
        True if the whole list was handled without hitting that ceiling.
    """

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    # Maximum number of total jobs running at a time.
    # We do this now rather than import time for testing purposes.
    MAX_TOTAL_JOBS = int(
        get_env_variable_gracefully("MAX_TOTAL_JOBS", DEFAULT_MAX_JOBS))
    len_all_jobs = len(nomad_client.jobs.get_jobs())
    if len_all_jobs >= MAX_TOTAL_JOBS:
        logger.info("Not requeuing job until we're running fewer jobs.")
        return False

    jobs_dispatched = 0
    for count, job in enumerate(jobs):
        if job.num_retries < MAX_NUM_RETRIES:
            requeue_survey_job(job)
            jobs_dispatched += 1
        else:
            handle_repeated_failure(job)

        # Only re-query Nomad every 100 jobs; in between, the running
        # total is estimated from the last count plus jobs_dispatched.
        if (count % 100) == 0:
            len_all_jobs = len(nomad_client.jobs.get_jobs())

        if (jobs_dispatched + len_all_jobs) >= MAX_TOTAL_JOBS:
            logger.info(
                "We hit the maximum total jobs ceiling, so we're not handling any more survey jobs now."
            )
            return False

    return True
示例#4
0
def handle_processor_jobs(jobs: List[ProcessorJob]) -> bool:
    """Requeue or give up on each processor job while Nomad has capacity.

    Jobs with fewer than MAX_NUM_RETRIES retries are requeued; all others
    are passed to handle_repeated_failure.

    Args:
        jobs: The processor jobs to handle, in the order they should be tried.

    Returns:
        False if handling stopped because the Nomad job count reached
        MAX_TOTAL_JOBS (before the first job or after any later one),
        True if the whole list was handled without hitting that ceiling.
    """

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    # Maximum number of total jobs running at a time.
    # We do this now rather than import time for testing purposes.
    MAX_TOTAL_JOBS = int(
        get_env_variable_gracefully("MAX_TOTAL_JOBS", DEFAULT_MAX_JOBS))
    len_all_jobs = len(nomad_client.jobs.get_jobs())
    if len_all_jobs >= MAX_TOTAL_JOBS:
        logger.info("Not requeuing job until we're running fewer jobs.")
        return False

    # NOTE: the prioritization below is currently commented out, so jobs
    # are handled in the order they were passed in.
    # We want zebrafish data first, then hgu133plus2, then data
    # related to pediatric cancer, then to finish salmon experiments
    # that are close to completion.
    # Each function moves the jobs it prioritizes to the front of the
    # list, so apply them in backwards order.
    # jobs = prioritize_salmon_jobs(jobs)
    # jobs = prioritize_jobs_by_accession(jobs, PEDIATRIC_ACCESSION_LIST)
    # jobs = prioritize_jobs_by_accession(jobs, HGU133PLUS2_ACCESSION_LIST)
    # jobs = prioritize_zebrafish_jobs(jobs)

    jobs_dispatched = 0
    for count, job in enumerate(jobs):
        if job.num_retries < MAX_NUM_RETRIES:
            requeue_processor_job(job)
            jobs_dispatched += 1
        else:
            handle_repeated_failure(job)

        # Only re-query Nomad every 100 jobs; in between, the running
        # total is estimated from the last count plus jobs_dispatched.
        if (count % 100) == 0:
            len_all_jobs = len(nomad_client.jobs.get_jobs())

        if (jobs_dispatched + len_all_jobs) >= MAX_TOTAL_JOBS:
            logger.info(
                "We hit the maximum total jobs ceiling, so we're not handling any more processor jobs now."
            )
            return False

    return True
示例#5
0
    Organism,
    OrganismIndex,
    OriginalFile,
    Pipeline,
    Processor,
    ProcessorJob,
)
from data_refinery_common.utils import get_env_variable, get_env_variable_gracefully
from data_refinery_workers.processors import utils


# Module-level logger, named after this module.
logger = get_and_configure_logger(__name__)
# Prefix that is followed by the job id when naming a job's work directory.
JOB_DIR_PREFIX = "processor_job_"
# One output line: gene id and transcript id separated by a tab.
GENE_TO_TRANSCRIPT_TEMPLATE = "{gene_id}\t{transcript_id}\n"
GENE_TYPE_COLUMN = 2
# False is passed as the fallback value for a missing environment variable.
S3_TRANSCRIPTOME_INDEX_BUCKET_NAME = get_env_variable_gracefully("S3_TRANSCRIPTOME_INDEX_BUCKET_NAME", False)
LOCAL_ROOT_DIR = get_env_variable("LOCAL_ROOT_DIR", "/home/user/data_store")
# Translation table that removes each occurrence of ; and "
IDS_CLEANUP_TABLE = str.maketrans({";": None, "\"": None})


def _compute_paths(job_context: Dict) -> str:
    """Computes the paths for all the directories used/created by this processor.

    Also computes a couple other path-based properties and adds them to the job_context.
    """
    # All files for the job are in the same directory.
    first_file_path = job_context["original_files"][0].absolute_file_path
    job_context["base_file_path"] = '/'.join(first_file_path.split('/')[:-1])
    job_context["work_dir"] = job_context["base_file_path"] + '/' + job_context["length"].upper() + '/' + \
                              JOB_DIR_PREFIX + str(job_context["job_id"]) + "/"
示例#6
0
# https://docs.djangoproject.com/en/2.2/topics/cache/

# Use Django's database cache backend with the "cache_table" location.
CACHES = {
    "default": {
        "BACKEND": "django.core.cache.backends.db.DatabaseCache",
        "LOCATION": "cache_table",
    },
}


# RAVEN_CONFIG is only defined when a DSN is available. Setting it
# while RAVEN_DSN isn't set causes the following warning:
# /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91:
# UserWarning: Transport selection via DSN is deprecated. You should
# explicitly pass the transport class to Client() instead.
raven_dsn = get_env_variable_gracefully("RAVEN_DSN", False)
if not raven_dsn:
    # Prevent raven from logging about how it's not configured...
    import logging

    raven_logger = logging.getLogger("raven.contrib.django.client.DjangoClient")
    raven_logger.setLevel(logging.CRITICAL)
else:
    RAVEN_CONFIG = {"dsn": raven_dsn}

# True only when the variable holds exactly the string "True".
RUNNING_IN_CLOUD = get_env_variable("RUNNING_IN_CLOUD") == "True"

ELASTICSEARCH_DSL = {
    "default": {
        "hosts": get_env_variable("ELASTICSEARCH_HOST")
        + ":"
示例#7
0
    "rest_framework.pagination.LimitOffsetPagination",
    "PAGE_SIZE": 25,
    "DEFAULT_VERSIONING_CLASS": "rest_framework.versioning.URLPathVersioning",
}

SWAGGER_SETTINGS = {
    "USE_SESSION_AUTH": False,
    "SECURITY_DEFINITIONS": {},
}

# RAVEN_CONFIG is only defined when a usable DSN is available. Setting
# it while the DSN isn't set causes the following warning:
# /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91:
# UserWarning: Transport selection via DSN is deprecated. You should
# explicitly pass the transport class to Client() instead.
raven_dsn = get_env_variable_gracefully("RAVEN_DSN_API", False)
# Neither a falsy value (such as the False passed as the fallback above)
# nor the "not set" placeholder is a usable DSN.
if raven_dsn and raven_dsn != "not set":
    RAVEN_CONFIG = {
        "dsn": raven_dsn,
        # Only send a sample of errors for the API, since we aren't going to
        # be interested in any single one.
        # NOTE(review): this comment used to say 5%, but the value is 0.25.
        # raven-python also appears to name this option `sample_rate` --
        # confirm the intended rate and that this key is honored.
        "sampleRate": 0.25,
    }
else:
    # Prevent raven from logging about how it's not configured...
    import logging

    raven_logger = logging.getLogger(
        "raven.contrib.django.client.DjangoClient")
    raven_logger.setLevel(logging.CRITICAL)
示例#8
0
    "DEFAULT_VERSIONING_CLASS": "rest_framework.versioning.URLPathVersioning",
    "EXCEPTION_HANDLER":
    "data_refinery_api.exceptions.custom_exception_handler",
}

SWAGGER_SETTINGS = {
    "USE_SESSION_AUTH": False,
    "SECURITY_DEFINITIONS": {},
}

# RAVEN_CONFIG is only defined when a usable DSN is available. Setting
# it while the DSN isn't set causes the following warning:
# /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91:
# UserWarning: Transport selection via DSN is deprecated. You should
# explicitly pass the transport class to Client() instead.
raven_dsn = get_env_variable_gracefully("RAVEN_DSN_API", False)
# Neither a falsy value (such as the False passed as the fallback above)
# nor the "not set" placeholder is a usable DSN.
if raven_dsn and raven_dsn != "not set":
    RAVEN_CONFIG = {
        "dsn": raven_dsn,
        # Only send a sample of errors for the API, since we aren't going to
        # be interested in any single one.
        # NOTE(review): this comment used to say 5%, but the value is 0.25.
        # raven-python also appears to name this option `sample_rate` --
        # confirm the intended rate and that this key is honored.
        "sampleRate": 0.25,
    }
else:
    # Prevent raven from logging about how it's not configured...
    import logging

    raven_logger = logging.getLogger(
        "raven.contrib.django.client.DjangoClient")
    raven_logger.setLevel(logging.CRITICAL)
示例#9
0
 def process_response(self, request, response):
     """Set the X-Source-Revision header to the SYSTEM_VERSION value."""
     system_version = get_env_variable_gracefully("SYSTEM_VERSION")
     response['X-Source-Revision'] = system_version
     return response
示例#10
0
USE_L10N = True

USE_TZ = True

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.10/howto/static-files/

STATIC_URL = "/static/"

# RAVEN_CONFIG is only defined when a DSN is available. Setting it
# while RAVEN_DSN isn't set causes the following warning:
# /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91:
# UserWarning: Transport selection via DSN is deprecated. You should
# explicitly pass the transport class to Client() instead.
RAVEN_DSN = get_env_variable_gracefully("RAVEN_DSN", None)

# AWS Secrets manager will not let us store an empty string, so the
# literal string "None" is treated as "no DSN".
RAVEN_DSN = None if RAVEN_DSN == "None" else RAVEN_DSN

if not RAVEN_DSN:
    # Prevent raven from logging about how it's not configured...
    import logging

    raven_logger = logging.getLogger(
        "raven.contrib.django.client.DjangoClient")
    raven_logger.setLevel(logging.CRITICAL)
else:
    RAVEN_CONFIG = {"dsn": RAVEN_DSN}
示例#11
0
USE_L10N = True

USE_TZ = True

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.10/howto/static-files/

STATIC_URL = '/static/'

# RAVEN_CONFIG is only defined when a DSN is available. Setting it
# while RAVEN_DSN isn't set causes the following warning:
# /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91:
# UserWarning: Transport selection via DSN is deprecated. You should
# explicitly pass the transport class to Client() instead.
raven_dsn = get_env_variable_gracefully('RAVEN_DSN', False)
if not raven_dsn:
    # Prevent raven from logging about how it's not configured...
    import logging
    raven_logger = logging.getLogger(
        'raven.contrib.django.client.DjangoClient')
    raven_logger.setLevel(logging.CRITICAL)
else:
    RAVEN_CONFIG = {'dsn': raven_dsn}

# True only when the variable holds exactly the string "True".
RUNNING_IN_CLOUD = get_env_variable('RUNNING_IN_CLOUD') == "True"

ELASTICSEARCH_DSL = {
    'default': {
        'hosts':
        get_env_variable('ELASTICSEARCH_HOST') + ":" +
示例#12
0
USE_L10N = True

USE_TZ = True

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.10/howto/static-files/

STATIC_URL = "/static/"

# RAVEN_CONFIG is only defined when a DSN is available. Setting it
# while RAVEN_DSN isn't set causes the following warning:
# /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91:
# UserWarning: Transport selection via DSN is deprecated. You should
# explicitly pass the transport class to Client() instead.
raven_dsn = get_env_variable_gracefully("RAVEN_DSN", False)
if not raven_dsn:
    # Prevent raven from logging about how it's not configured...
    import logging

    raven_logger = logging.getLogger(
        "raven.contrib.django.client.DjangoClient")
    raven_logger.setLevel(logging.CRITICAL)
else:
    RAVEN_CONFIG = {"dsn": raven_dsn}

# True only when the variable holds exactly the string "True".
RUNNING_IN_CLOUD = get_env_variable("RUNNING_IN_CLOUD") == "True"

# EngagementBot webhook, read with the graceful lookup and no explicit
# fallback value.
ENGAGEMENTBOT_WEBHOOK = get_env_variable_gracefully("ENGAGEMENTBOT_WEBHOOK")
示例#13
0
        "rest_framework.throttling.UserRateThrottle",
    ],
    "DEFAULT_THROTTLE_RATES": {"anon": "10/second", "user": "******"},
}

SWAGGER_SETTINGS = {
    "USE_SESSION_AUTH": False,
    "SECURITY_DEFINITIONS": {},
}

# RAVEN_CONFIG is only defined when a usable DSN is available. Setting
# it while the DSN isn't set causes the following warning:
# /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91:
# UserWarning: Transport selection via DSN is deprecated. You should
# explicitly pass the transport class to Client() instead.
RAVEN_DSN = get_env_variable_gracefully("RAVEN_DSN_API", None)

# AWS Secrets manager will not let us store an empty string.
if RAVEN_DSN == "None":
    RAVEN_DSN = None

# A None DSN (passed as the fallback above, or normalized from "None") and
# the "not set" placeholder both mean there is nothing to configure.
if RAVEN_DSN and RAVEN_DSN != "not set":
    RAVEN_CONFIG = {
        "dsn": RAVEN_DSN,
        # Only send a sample of errors for the API, since we aren't going to
        # be interested in any single one.
        # NOTE(review): this comment used to say 5%, but the value is 0.25.
        # raven-python also appears to name this option `sample_rate` --
        # confirm the intended rate and that this key is honored.
        "sampleRate": 0.25,
    }
else:
    # Prevent raven from logging about how it's not configured...
    import logging
示例#14
0
# https://docs.djangoproject.com/en/2.2/topics/cache/

# Use Django's database cache backend with the "cache_table" location.
CACHES = {
    "default": {
        "BACKEND": "django.core.cache.backends.db.DatabaseCache",
        "LOCATION": "cache_table",
    },
}


# RAVEN_CONFIG is only defined when a DSN is available. Setting it
# while RAVEN_DSN isn't set causes the following warning:
# /usr/local/lib/python3.6/site-packages/raven/conf/remote.py:91:
# UserWarning: Transport selection via DSN is deprecated. You should
# explicitly pass the transport class to Client() instead.
RAVEN_DSN = get_env_variable_gracefully("RAVEN_DSN", None)

# AWS Secrets manager will not let us store an empty string, so the
# literal string "None" is treated as "no DSN".
RAVEN_DSN = None if RAVEN_DSN == "None" else RAVEN_DSN

if not RAVEN_DSN:
    # Prevent raven from logging about how it's not configured...
    import logging

    raven_logger = logging.getLogger("raven.contrib.django.client.DjangoClient")
    raven_logger.setLevel(logging.CRITICAL)
else:
    RAVEN_CONFIG = {"dsn": RAVEN_DSN}

# True only when the variable holds exactly the string "True".
RUNNING_IN_CLOUD = get_env_variable("RUNNING_IN_CLOUD") == "True"