Example #1
def run_manual_code(study_id):
    """
    Create an AWS Batch job for the Study specified
    :param study_id: Primary key of a Study
    """
    # we assume that the cluster is configured only in one region.
    pipeline_region = get_current_region()

    # Get the object ID of the study, used in the pipeline
    query = Study.objects.filter(pk=study_id)
    if not query.exists():
        return abort(404)
    object_id = query.get().object_id

    error_sentry = make_error_sentry("data",
                                     tags={"pipeline_frequency": "manually"})
    # Get new data access credentials for the manual user, submit a manual job, display message
    # Report all errors to sentry including DataPipelineNotConfigured errors.
    with error_sentry:
        ssm_client = get_boto_client('ssm', pipeline_region)
        refresh_data_access_credentials('manually',
                                        ssm_client=ssm_client,
                                        webserver=True)
        batch_client = get_boto_client('batch', pipeline_region)
        create_one_job('manually', object_id, batch_client, webserver=True)
        flash('Data pipeline code successfully initiated!', 'success')

    if error_sentry.errors:
        flash('An unknown error occurred when trying to run this task.',
              category='danger')
        print(error_sentry)

    return redirect('/data-pipeline/{:s}'.format(study_id))
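Example #2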
def terminate_pipeline(study_id):
    """
    Terminate an AWS Batch job for the Study specified
    :param study_id: Primary key of a Study
    """

    username = session["admin_username"]

    pipeline_id = request.values['pipeline_id']
    flash('terminating pipeline {0}'.format(pipeline_id))

    error_sentry = make_error_sentry(
        "data", tags={"pipeline_frequency": "terminate_job manually"})
    # Terminate the Batch job and display a message on redirect.
    # Report all errors to sentry including DataPipelineNotConfigured errors.
    with error_sentry:
        # as in run_manual_code above, the cluster is assumed to be configured in a single region
        pipeline_region = get_current_region()
        batch_client = get_boto_client('batch', pipeline_region)
        terminate_job(pipeline_id, username, batch_client)

    if error_sentry.errors:
        flash(
            'An error occurred when trying to terminate the pipeline {0}: {1}'.
            format(pipeline_id, error_sentry),
            category='danger')
        print(error_sentry)
    else:
        flash('Pipeline {0} terminated.'.format(pipeline_id), 'success')

    return redirect('/data-pipeline/{:s}'.format(study_id))
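Example #3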
def run_manual_code(study_id):
    """
    Create an AWS Batch job for the Study specified
    :param study_id: Primary key of a Study
    """

    # Get the object ID of the study, used in the pipeline
    object_id = Study.objects.get(pk=study_id).object_id
    error_sentry = make_error_sentry("data",
                                     tags={"pipeline_frequency": "manually"})

    with error_sentry:
        # Get new data access credentials for the manual user
        aws_object_names = get_aws_object_names()
        refresh_data_access_credentials('manually', aws_object_names)

        # Submit a manual job
        create_one_job('manually', object_id)

        # The success message gets displayed to the user upon redirect
        flash('Data pipeline code successfully initiated!', 'success')

    if error_sentry.errors:
        flash('An unknown error occurred when trying to run this task.',
              'danger')

    return redirect('/data-pipeline/{:s}'.format(study_id))
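
Examples #1 through #3 all share the same shape: make_error_sentry returns a context manager that (as the examples above use it) reports any exception raised inside the with block to Sentry and swallows it, and its errors attribute can be checked afterwards to decide what to show the user. A minimal sketch of that shared pattern, assuming only the behaviour visible in the examples above; do_risky_work is a hypothetical placeholder, and the imports mirror those used elsewhere on this page:

from flask import flash

from libs.sentry import make_error_sentry  # same import as in Example #14 below


def do_risky_work():
    # hypothetical placeholder for the real pipeline call
    raise RuntimeError("simulated failure")


def run_with_error_reporting():
    error_sentry = make_error_sentry("data", tags={"pipeline_frequency": "manually"})

    with error_sentry:
        # anything raised in here is reported and swallowed, so execution continues below
        do_risky_work()

    if error_sentry.errors:
        # the captured errors remain available for inspection after the block exits
        flash('An unknown error occurred when trying to run this task.', 'danger')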
Example #4
def create_push_notification_tasks():
    # we reuse the high level strategy from data processing celery tasks, see that documentation.
    expiry = (datetime.utcnow() + timedelta(minutes=5)).replace(second=30, microsecond=0)
    now = timezone.now()
    surveys, schedules, patient_ids = get_surveys_and_schedules(now)
    print(surveys)
    print(schedules)
    print(patient_ids)
    with make_error_sentry(sentry_type=SentryTypes.data_processing):
        if not check_firebase_instance():
            print("Firebase is not configured, cannot queue notifications.")
            return

        # surveys and schedules are guaranteed to have the same keys, assembling the data structures
        # is a pain, so it is factored out. sorry, but not sorry. it was a mess.
        for fcm_token in surveys.keys():
            print(f"Queueing up push notification for user {patient_ids[fcm_token]} for {surveys[fcm_token]}")
            safe_queue_push(
                args=[fcm_token, surveys[fcm_token], schedules[fcm_token]],
                max_retries=0,
                expires=expiry,
                task_track_started=True,
                task_publish_retry=False,
                retry=False,
            )
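
The expiry in Example #4 is the current time plus five minutes with the seconds pinned to :30, so every task queued by one scheduler run expires at the same well-known instant. A standalone sketch of just that calculation; the function name is made up for illustration:

from datetime import datetime, timedelta


def task_expiry(now: datetime) -> datetime:
    # e.g. 12:02:17 -> 12:07:30, and 12:02:45 -> 12:07:30 as well
    return (now + timedelta(minutes=5)).replace(second=30, microsecond=0)


print(task_expiry(datetime(2021, 1, 1, 12, 2, 17)))  # 2021-01-01 12:07:30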
Example #5
def celery_process_file_chunks(participant_id):
    """
    This is the function that is called from celery.  It runs through all new files that have
    been uploaded and 'chunks' them. Handles logic for skipping bad files, raising errors
    appropriately.
    This runs automatically and periodically as a Celery task.
    """
    participant = Participant.objects.get(id=participant_id)
    log = LogList()
    number_bad_files = 0
    tags = {'user_id': participant.patient_id}
    error_sentry = make_error_sentry('data', tags=tags)
    log.append("processing files for %s" % participant.patient_id)

    while True:
        previous_number_bad_files = number_bad_files
        starting_length = participant.files_to_process.exclude(
            deleted=True).count()

        log.append("%s processing %s, %s files remaining" %
                   (datetime.now(), participant.patient_id, starting_length))
        number_bad_files += do_process_user_file_chunks(
            count=FILE_PROCESS_PAGE_SIZE,
            error_handler=error_sentry,
            skip_count=number_bad_files,
            participant=participant,
        )

        # If no files were processed, quit processing
        if participant.files_to_process.exclude(
                deleted=True).count() == starting_length:
            if previous_number_bad_files == number_bad_files:
                # Cases:
                #   every file broke, blow up. (would cause infinite loop otherwise)
                #   no new files.
                break
            else:
                continue

    with make_error_sentry('data', tags=tags):
        error_sentry.raise_errors()
Example #6
def celery_process_file_chunks(participant_id):
    """ This is the function is queued up, it runs through all new uploads from a specific user and
    'chunks' them. Handles logic for skipping bad files, raising errors. """

    # celery doesn't clean up after itself very well, either memory or open network connections.
    # this probably has something to do with the fact that celery forks, so possibly picking
    # a different mode would impact this.  Or we can just exit the python process.
    try:
        time_start = datetime.now()
        participant = Participant.objects.get(id=participant_id)

        number_bad_files = 0
        tags = {'user_id': participant.patient_id}
        error_sentry = make_error_sentry(
            sentry_type=SentryTypes.data_processing)
        print("processing files for %s" % participant.patient_id)

        while True:
            previous_number_bad_files = number_bad_files
            starting_length = participant.files_to_process.exclude(
                deleted=True).count()

            print("%s processing %s, %s files remaining" %
                  (datetime.now(), participant.patient_id, starting_length))
            number_bad_files += do_process_user_file_chunks(
                page_size=FILE_PROCESS_PAGE_SIZE,
                error_handler=error_sentry,
                position=number_bad_files,
                participant=participant,
            )
            # If no files were processed, quit processing
            if participant.files_to_process.exclude(
                    deleted=True).count() == starting_length:
                if previous_number_bad_files == number_bad_files:
                    # 2 Cases:
                    #   1) every file broke, blow up. (would cause infinite loop otherwise).
                    #   2) no new files.
                    break
                else:
                    continue

            # enforce a maximum processing time of 3 hours per user
            if (datetime.now() - time_start).total_seconds() > 60 * 60 * 3:
                break

    finally:
        print(
            "IGNORE 'ConnectionResetError: [Errno 104] Connection reset by peer'\n"
            "WE EXIT IN ORDER TO FIX A MEMORY LEAK THAT SO FAR DEFIES ANALYSIS. CELERY COMPLAINS."
        )
        exit(0)
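Example #7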
def send_android_error_report(user_id, error_report):
    # Encountered a corrupted (write error) error report upload on Apr 30 2017; adding an error sentry
    # so that we get *some* report of the error occurring, but also delete that file from the device.
    with make_error_sentry('android'):
        # get all non-empty lines in the error report
        contents = [line for line in error_report.splitlines() if line.strip()]
        
        if not contents:
            # just short circuit.  We don't know why this happens, but it happens.
            return
        
        # the first line contains a unix millisecond timestamp, construct a datetime
        # The printed value in the crash report is in UTC
        try:  # Beiwe version greater than 4
            timestamp = datetime.fromtimestamp(float(contents[0]) / 1000)
            contents.pop(0)  # remove timestamp from message text
        except ValueError:  # Beiwe version 4
            timestamp = datetime.fromtimestamp(float(request.values['file_name'].split("_")[1]) / 1000)
        
        device_identifiers = contents[0].split(',')
        contents.pop(0)  # remove device identifiers from message text
    
        # Insert the actual error message as the first line
        report_title = contents[0].split(":", 1)[1].strip()
        if "}" in report_title:  #cut title at end of file name
            report_title = report_title.split("}", 1)[0] + "}"
        contents.insert(0, "Android Error: %s" % report_title)
        
        # the second line contains all the identifiers. Clean it up and parse into a dictionary.
        device_identifiers = {ID.strip().split(":", 1)[0]: ID.strip().split(":", 1)[1]
                              for ID in device_identifiers}
    
        # get a useful timestamp...
        eastern_time = timestamp.replace(tzinfo=tz.gettz('UTC')).astimezone(tz.gettz('America/New_York'))
        
        # construct some useful tags for this error report, add all identifiers as tags.
        tags = {"Android_Error": "android error",
                "user_id": user_id,
                "date": str(timestamp.date()),
                "time": str(timestamp).split(" ")[1].split(".")[0],
                "eastern_date": str(eastern_time.date()),
                "eastern_time": str(eastern_time).split(" ")[1].split(".")[0]
                }
        tags.update(device_identifiers)
        
        sentry_client = make_sentry_client('android', tags)
    
        sentry_client.captureMessage("\n".join(contents))
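
The device-identifiers line in Example #7 is split on commas, and each field is split once on ':' to build a tag dictionary. A small sketch with a made-up identifiers line (the field names are illustrative, not the actual Beiwe report format); note that the values keep the space that follows each colon:

identifiers_line = "version: 4.2, device: Pixel 2, os: Android 9"  # hypothetical example

device_identifiers = {
    field.strip().split(":", 1)[0]: field.strip().split(":", 1)[1]
    for field in identifiers_line.split(",")
}
print(device_identifiers)  # {'version': ' 4.2', 'device': ' Pixel 2', 'os': ' Android 9'}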
Example #8
def celery_send_push_notification(fcm_token: str, survey_obj_ids: List[str], schedule_pks: List[int]):
    ''' Celery task that sends push notifications.   Note that this list of pks may contain duplicates.'''
    patient_id = ParticipantFCMHistory.objects.filter(token=fcm_token) \
        .values_list("participant__patient_id", flat=True).get()

    with make_error_sentry(sentry_type=SentryTypes.data_processing):
        if not check_firebase_instance():
            print("Firebase credentials are not configured.")
            return

        # use the earliest timed schedule as our reference for the sent_time parameter.  (why?)
        participant = Participant.objects.get(patient_id=patient_id)
        schedules = ScheduledEvent.objects.filter(pk__in=schedule_pks)
        reference_schedule = schedules.order_by("scheduled_time").first()
        survey_obj_ids = list(set(survey_obj_ids))  # already deduped; whatever.

        print(f"Sending push notification to {patient_id} for {survey_obj_ids}...")
        try:
            send_push_notification(participant, reference_schedule, survey_obj_ids, fcm_token)
        # error types are documented at firebase.google.com/docs/reference/fcm/rest/v1/ErrorCode
        except UnregisteredError as e:
            # is an internal 404 http response, it means the token used was wrong.
            # mark the fcm history as out of date.
            return

        except QuotaExceededError:
            # limits are very high, this is effectively impossible, but it is possible, so we catch it.
            raise

        except ThirdPartyAuthError as e:
            failed_send_handler(participant, fcm_token, str(e), schedules)
            # This means the credentials used were wrong for the target app instance.  This can occur
            # both with bad server credentials, and with bad device credentials.
            # We have only seen this error statement, error name is generic so there may be others.
            if str(e) != "Auth error from APNS or Web Push Service":
                raise
            return

        except ValueError as e:
            # Does this case ever occur? It is tested for in check_firebase_instance... weird race condition?
            # Error should be transient, and like all other cases we enqueue the next weekly surveys regardless.
            if "The default Firebase app does not exist" in str(e):
                enqueue_weekly_surveys(participant, schedules)
                return
            else:
                raise

        success_send_handler(participant, fcm_token, schedules)
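Example #9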
def create_file_processing_tasks():
    """ Generates tasks to enqueue.  This is called every 6 minutes, and tasks have a lifetime
    of 6 minutes.  Note that tasks are not removed from the queue by RabbitMQ, but by Celery.
    inspecting the queue will continue to display the tasks that have not been sent to Celery
    until the most recent job is finished.

    Also, for some reason 5 minutes is the smallest value that .... works.  At all.
    No clue why.
    """

    # set the tasks to expire at the 5 minutes and thirty seconds mark after the most recent
    # 6 minutely cron task. This way all tasks will be revoked at the same, and well-known, instant.
    # 30 seconds grace period is 30 seconds out of
    expiry = (datetime.now() + timedelta(minutes=5)).replace(second=30, microsecond=0)

    with make_error_sentry('data'):
        participant_set = set(
            Participant.objects.filter(files_to_process__isnull=False)
                .distinct()
                # .order_by("id")  # For debugging, forces overlap conflicts.
                .order_by("?")     # don't want a single user blocking everyone because they are at the front.
                .values_list("id", flat=True)
        )
        
        # sometimes celery just fails to exist.
        active_set = set(celery_try_20_times(get_active_job_ids))
        
        participants_to_process = participant_set - active_set
        print("Queueing these participants:", ",".join(str(p) for p in participants_to_process))

        for participant_id in participants_to_process:
            # Queue all users' file processing, and generate a list of currently running jobs
            # to use to detect when all jobs are finished running.
            safe_queue_user(
                args=[participant_id],
                max_retries=0,
                expires=expiry,
                task_track_started=True,
                task_publish_retry=False,
                retry=False
            )
        print(f"{len(participants_to_process)} users queued for processing")
def batch_upload(upload: Tuple[Union[ChunkRegistry, dict], str, bytes, str]):
    """ Used for mapping an s3_upload step across a pool: the single tuple argument is unpacked
    inside, because a pool's map() can only pass one parameter per call. """

    ret = {'exception': None, 'traceback': None}
    with make_error_sentry(sentry_type=SentryTypes.data_processing):
        try:
            chunk, chunk_path, new_contents, study_object_id = upload
            del upload
            new_contents = decompress(new_contents)

            if "b'" in chunk_path:
                raise Exception(chunk_path)

            # for use with test script to avoid network uploads
            # with open("processing_tests/" + GLOBAL_TIMESTAMP, 'ba') as f:
            #     f.write(b"\n\n")
            #     f.write(new_contents)
            #     return ret

            s3_upload(chunk_path, new_contents, study_object_id, raw_path=True)

            # if the chunk object is a chunk registry then we are updating an old one,
            # otherwise we are creating a new one.
            if isinstance(chunk, ChunkRegistry):
                # If the contents are being appended to an existing ChunkRegistry object
                chunk.file_size = len(new_contents)
                chunk.update_chunk(new_contents)
            else:
                ChunkRegistry.register_chunked_data(**chunk,
                                                    file_contents=new_contents)

        # it broke. print stacktrace for debugging
        except Exception as e:
            traceback.print_exc()
            ret['traceback'] = sys.exc_info()
            ret['exception'] = e

            # using an error sentry we can easily report a real error with a real stack trace! :D
            raise

    return ret
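
batch_upload in Example #10 takes a single tuple precisely so it can be handed to a pool's map(), which passes exactly one argument per call. A minimal sketch of that calling convention; the ThreadPool usage and the pending_uploads list are assumptions for illustration, since the examples on this page do not show how batch_upload is actually driven:

from multiprocessing.pool import ThreadPool

# pending_uploads: hypothetical list of 4-tuples, each
#   (ChunkRegistry instance or registration dict, chunk path, compressed file bytes, study object id)
pending_uploads = []

pool = ThreadPool(4)
try:
    # map() hands each tuple to batch_upload unmodified; batch_upload unpacks it itself
    results = pool.map(batch_upload, pending_uploads)
finally:
    pool.close()
    pool.join()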
Example #11
def create_all_jobs(freq):
    """
    Create one AWS batch job for each Study object
    :param freq: string e.g. 'daily', 'monthly'
    """

    # TODO: Boto3 version 1.4.8 has AWS Batch Array Jobs, which are extremely useful for the
    # task this function performs. We should switch to using them.

    # Get new data access credentials for the user
    aws_object_names = get_aws_object_names()
    refresh_data_access_credentials(freq, aws_object_names)

    # TODO: If there are issues with servers not getting spun up in time, make this a
    # ThreadPool with random spacing over the course of 5-10 minutes.
    error_sentry = make_error_sentry("data", tags={"pipeline_frequency": freq})
    for study in Study.objects.filter(deleted=False):
        with error_sentry:
            # For each study, create a job
            object_id = study.object_id
            create_one_job(freq, object_id, aws_object_names)
Example #12
def create_file_processing_tasks():
    # The entire code is wrapped in an ErrorSentry, which catches any errors
    # and sends them to Sentry.
    with make_error_sentry('data') as error_sentry:
        print(error_sentry.sentry_client.is_enabled())
        if FileProcessLock.islocked():
            # This is really a safety check to ensure that no code executes
            # if file processing is locked.
            report_file_processing_locked_and_exit()
            # report_file_processing_locked_and_exit should raise an error or exit;
            # this line should be unreachable
            exit(0)
        else:
            FileProcessLock.lock()

        print("starting.")
        now = datetime.now()
        expiry = now + timedelta(minutes=CELERY_EXPIRY_MINUTES)
        participant_set = Participant.objects.filter(
            files_to_process__isnull=False).distinct().values_list("id",
                                                                   flat=True)
        running = []

        for participant_id in participant_set:
            # Queue all users' file processing, and generate a list of currently running jobs
            # to use to detect when all jobs are finished running.
            running.append(
                safe_queue_user(args=[participant_id],
                                max_retries=0,
                                expires=expiry,
                                task_track_started=True,
                                task_publish_retry=False,
                                retry=False))

        print("tasks:", running)

        # If there are any Celery tasks still running, check their state and update the running
        # list accordingly. Do this every 5 seconds.
        while running:
            new_running = []
            failed = []
            successful = []
            for future in running:
                ####################################################################################
                # This variable can mutate on a separate thread.  We need the value as it was at
                # this snapshot in time, so we store it.  (The object is a string, passed by value.)
                ####################################################################################
                state = future.state
                if state == SUCCESS:
                    successful.append(future)
                if state in FAILED:
                    failed.append(future)
                if state in STARTED_OR_WAITING:
                    new_running.append(future)

            running = new_running
            print("tasks:", running)
            if running:
                sleep(5)

        print("Finished, unlocking.")
        # The unlocking MUST be **inside** the with statement.
        FileProcessLock.unlock()
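Example #13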
def run_manual_code(study_id):
    """
    Create an AWS Batch job for the Study specified
    :param study_id: Primary key of a Study
    """

    username = session["admin_username"]

    destination_email_addresses = []
    destination_email_addresses_string = ''
    if 'destination_email_addresses' in request.values:
        destination_email_addresses_string = request.values[
            'destination_email_addresses']
        destination_email_addresses = [
            d.strip() for d in filter(
                None, re.split("[, \?:;]+",
                               destination_email_addresses_string))
        ]
        for email_address in destination_email_addresses:
            if not validate_email(email_address):
                flash(
                    'Email address {0} in ({1}) does not appear to be a valid email address.'
                    .format(email_address, destination_email_addresses_string),
                    category='danger')
                return redirect('/data-pipeline/{:s}'.format(study_id))
        destination_email_addresses_string = ','.join(
            destination_email_addresses)

    participants_string = ''
    if 'participants' in request.values:
        participants_string = request.form.getlist('participants')
        participants_string = ','.join(participants_string)

    data_start_time = ''
    if 'time_start' in request.values:
        data_start_time = request.values['time_start']

    data_end_time = ''
    if 'time_end' in request.values:
        data_end_time = request.values['time_end']

    # Get the object ID of the study, used in the pipeline
    query = Study.objects.filter(pk=study_id)
    if not query.exists():
        flash('Could not find study corresponding to study id {0}'.format(
            study_id),
              category='danger')
        return redirect('/data-pipeline/{:s}'.format(study_id))
        #return abort(404)
    object_id = query.get().object_id

    pipeline_region = os.getenv("pipeline_region", None)
    if not pipeline_region:
        pipeline_region = 'us-east-1'
        flash('Pipeline region not configured, choosing default ({})'.format(
            pipeline_region),
              category='info')
        # return redirect('/data-pipeline/{:s}'.format(study_id))

    error_sentry = make_error_sentry("data",
                                     tags={"pipeline_frequency": "manually"})
    # Get new data access credentials for the manual user, submit a manual job, display message
    # Report all errors to sentry including DataPipelineNotConfigured errors.
    with error_sentry:
        ssm_client = get_boto_client('ssm', pipeline_region)
        refresh_data_access_credentials('manually', ssm_client=ssm_client)
        batch_client = get_boto_client('batch', pipeline_region)
        create_one_job('manually', object_id, username,
                       destination_email_addresses_string, data_start_time,
                       data_end_time, participants_string, batch_client)

        if data_start_time and data_end_time:
            flash(
                'Data pipeline successfully initiated on data collected between {0} and {1}! Email(s) will be sent to {2} on completion.'
                .format(data_start_time, data_end_time,
                        destination_email_addresses), 'success')
        elif data_start_time:
            flash(
                'Data pipeline successfully initiated on data collected after {0}! Email(s) will be sent to {1} on completion.'
                .format(data_start_time,
                        destination_email_addresses), 'success')
        elif data_end_time:
            flash(
                'Data pipeline successfully initiated on data collected before {0}! Email(s) will be sent to {1} on completion.'
                .format(data_end_time,
                        destination_email_addresses), 'success')
        else:
            flash(
                'Data pipeline successfully initiated! Email(s) will be sent to {0} on completion.'
                .format(destination_email_addresses), 'success')

    if error_sentry.errors:
        flash('An error occurred when trying to execute the pipeline: {0}'.
              format(error_sentry),
              category='danger')
        print(error_sentry)

    return redirect('/data-pipeline/{:s}'.format(study_id))
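
The destination addresses in Example #13 may be separated by commas, spaces, '?', ':' or ';'; re.split breaks the string on any run of those characters and filter(None, ...) drops empty fragments before each address is validated. A standalone sketch of just that parsing step, with a made-up input string:

import re

raw = "alice@example.com, bob@example.com;carol@example.com "  # hypothetical input
addresses = [a.strip() for a in filter(None, re.split("[, \?:;]+", raw))]
print(addresses)  # ['alice@example.com', 'bob@example.com', 'carol@example.com']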
Example #14
# NOTE: the definitions of _path and _one_folder_up are assumed (the top of this example was cut
# off); they add the repository root to the import path so this standalone script can import
# project modules.
from os.path import abspath, dirname
from sys import path as _path

_one_folder_up = dirname(dirname(abspath(__file__)))
_path.insert(1, _one_folder_up)

from datetime import timedelta

from django.utils import timezone

from database.data_access_models import ChunkRegistry
from database.study_models import Study
from libs.sentry import make_error_sentry
from pipeline.boto_helpers import get_boto_client
from pipeline.configuration_getters import get_current_region
from pipeline.index import create_one_job, refresh_data_access_credentials

pipeline_region = get_current_region()
ssm_client = get_boto_client('ssm', pipeline_region)
error_sentry = make_error_sentry("data",
                                 tags={"pipeline_frequency": "manually"})
batch_client = get_boto_client('batch', pipeline_region)
yesterday = timezone.now() - timedelta(days=1)

refresh_data_access_credentials('manually',
                                ssm_client=ssm_client,
                                webserver=False)

################################################################################################
# if you are running this on an ubuntu machine you have to sudo apt-get -y install cloud-utils #
################################################################################################

for study in Study.objects.all():
    with error_sentry:
        # we only want to run the pipeline for data that has been uploaded; report all errors to Sentry
        for patient_id in ChunkRegistry.get_updated_users_for_study(