def reindex_all_files_to_process():
    """
    Totally clears the FilesToProcess DB, deletes all chunked files on S3,
    clears the ChunksRegistry DB, reads all relevant files on S3 to the
    FilesToProcess registry and then re-chunks them.
    """
    raise Exception(
        "This code has not been tested since converting database backends; that means 2018."
    )
    # Delete all preexisting FileToProcess (FTP) and ChunkRegistry objects
    FileProcessLock.lock()
    print('{!s} purging FileToProcess: {:d}'.format(
        datetime.now(), FileToProcess.objects.count()))
    FileToProcess.objects.all().delete()
    print('{!s} purging ChunkRegistry: {:d}'.format(
        datetime.now(), ChunkRegistry.objects.count()))
    ChunkRegistry.objects.all().delete()

    pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2)

    # Delete all preexisting chunked data files
    CHUNKED_DATA = s3_list_files(CHUNKS_FOLDER)
    print('{!s} deleting older chunked data: {:d}'.format(
        datetime.now(), len(CHUNKED_DATA)))
    pool.map(s3_delete, CHUNKED_DATA)
    del CHUNKED_DATA

    # Get a list of all S3 files to replace in the database
    print('{!s} pulling new files to process...'.format(datetime.now()))
    files_lists = pool.map(s3_list_files,
                           Study.objects.values_list('object_id', flat=True))

    # For each such file, create an FTP object
    print("putting new files to process...")
    for i, l in enumerate(files_lists):
        print('{!s} {:d} of {:d}, {:d} files'.format(datetime.now(), i + 1,
                                                     Study.objects.count(),
                                                     len(l)))
        for fp in l:
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
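                # Raw S3 keys take the form "<study_object_id>/<patient_id>/...", so the
                # first path segment identifies the study and the second the participant.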
                patient_id = fp.split('/', 2)[1]
                participant_pk = Participant.objects.filter(
                    patient_id=patient_id).values_list('pk', flat=True).get()
                FileToProcess.append_file_for_processing(
                    fp, fp.split("/", 1)[0], participant_id=participant_pk)

    # Clean up by deleting large variables, closing the thread pool and unlocking the file process lock
    del files_lists, l
    pool.close()
    pool.terminate()
    FileProcessLock.unlock()

    # Rechunk the newly created FTPs
    print("{!s} processing data.".format(datetime.now()))
    process_file_chunks()
def process_file_chunks():
    """
    This is the function that is called from the command line.  It runs through all new files
    that have been uploaded and 'chunks' them. Handles logic for skipping bad files, raising
    errors appropriately.
    This is primarily called manually during testing and debugging.
    """
    # Initialize the process and ensure there is no other process running at the same time
    error_handler = ErrorHandler()
    if FileProcessLock.islocked():
        raise ProcessingOverlapError(
            "Data processing overlapped with a previous data indexing run.")
    FileProcessLock.lock()

    try:
        number_bad_files = 0

        # Get the list of participants with open files to process
        participants = Participant.objects.filter(
            files_to_process__isnull=False).distinct()
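        # .distinct() matters here: the reverse FK join through files_to_process would
        # otherwise return one duplicate row per pending file for each participant.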
        print("processing files for the following users: %s" %
              ",".join(participants.values_list('patient_id', flat=True)))

        for participant in participants:
            while True:
                previous_number_bad_files = number_bad_files
                starting_length = participant.files_to_process.exclude(
                    deleted=True).count()

                print(
                    "%s processing %s, %s files remaining" %
                    (datetime.now(), participant.patient_id, starting_length))

                # Process the desired number of files and calculate the number of unprocessed files
                number_bad_files += do_process_user_file_chunks(
                    count=FILE_PROCESS_PAGE_SIZE,
                    error_handler=error_handler,
                    skip_count=number_bad_files,
                    participant=participant,
                )
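                # number_bad_files also serves as the skip offset: files that failed stay in
                # the queryset, so each subsequent page starts past the ones already known bad.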

                # If no files were processed, quit processing
                if (participant.files_to_process.exclude(deleted=True).count()
                        == starting_length
                        and previous_number_bad_files == number_bad_files):
                    # Two cases land here:
                    #   every remaining file failed (bail out now, or this would loop forever);
                    #   there are no new files left to process.
                    break
    finally:
        FileProcessLock.unlock()

    error_handler.raise_errors()
    raise EverythingWentFine(DATA_PROCESSING_NO_ERROR_STRING)
def reindex_specific_data_type(data_type):
    """
    Purges the ChunkRegistry entries and chunked S3 files for the given data type, then
    re-registers the relevant raw files on S3 as FileToProcess entries and re-chunks them.
    """
    raise Exception(
        "This code has not been tested since converting database backends")
    FileProcessLock.lock()
    print("starting...")
    # Convert the data type; raise an error if something is wrong with it
    file_name_key = data_stream_to_s3_file_name_string(data_type)

    # Get all chunk paths of the given data type
    relevant_chunks = ChunkRegistry.objects.filter(data_type=data_type)
    # list() ensures that the QuerySet is evaluated before all of its elements are deleted (otherwise it would be empty)
    relevant_indexed_files = list(
        relevant_chunks.values_list('chunk_path', flat=True))

    # Delete the old ChunkRegistry objects
    print("purging old data...")
    relevant_chunks.delete()

    pool = ThreadPool(20)
    pool.map(s3_delete, relevant_indexed_files)

    print("pulling files to process...")
    files_lists = pool.map(s3_list_files,
                           Study.objects.values_list('object_id', flat=True))
    for i, l in enumerate(files_lists):
        print('{!s} {:d} of {:d}, {:d} files'.format(datetime.now(), i + 1,
                                                     Study.objects.count(),
                                                     len(l)))
        for fp in l:
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                patient_id = fp.split('/', 2)[1]
                participant_pk = Participant.objects.filter(
                    patient_id=patient_id).values_list('pk', flat=True).get()
                FileToProcess.append_file_for_processing(
                    fp, fp.split("/", 1)[0], participant_id=participant_pk)

    del files_lists, l
    pool.close()
    pool.terminate()
    FileProcessLock.unlock()

    print("{!s} processing data.".format(datetime.now()))
    process_file_chunks()
    print("Done.")
def process_file_chunks_lambda():
    """
    This is the function that is called from the command line.  It runs through all new files
    that have been uploaded and 'chunks' them. Handles logic for skipping bad files, raising
    errors appropriately.
    This is primarily called manually during testing and debugging.
    """
    # Initialize the process and ensure there is no other process running at the same time
    error_handler = ErrorHandler()
    if FileProcessLock.islocked():
        raise ProcessingOverlapError(
            "Data processing overlapped with a previous data indexing run.")
    FileProcessLock.lock()

    try:
        number_bad_files = 0

        # Get the list of participants with open files to process
        participants = Participant.objects.filter(
            files_to_process__isnull=False).distinct()
        print("processing files for the following users: %s" %
              ",".join(participants.values_list('patient_id', flat=True)))

        for participant in participants:
            for fp in participant.files_to_process.all():
                print(fp.s3_file_path)
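                # Synthesize the minimal S3 ObjectCreated-style event record the lambda
                # handler expects, so it can be driven directly from this loop.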
                event = {
                    'Records': [{
                        's3': {
                            'object': {
                                'key': fp.s3_file_path
                            }
                        }
                    }]
                }

                chunk_file_lambda_handler(event, [])
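                # The empty list stands in for the Lambda context argument, which the
                # handler presumably ignores when invoked directly like this.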

    finally:
        FileProcessLock.unlock()

    error_handler.raise_errors()
    raise EverythingWentFine(DATA_PROCESSING_NO_ERROR_STRING)
def report_file_processing_locked_and_exit():
    """ Creates a useful error report with information about the run time. """
    timedelta_since_last_run = FileProcessLock.get_time_since_locked()
    print("timedelta %s" % timedelta_since_last_run.total_seconds())
    if timedelta_since_last_run.total_seconds() > CELERY_ERROR_REPORT_TIMEOUT_SECONDS:
        error_msg = (
            "Data processing has overlapped with a prior data index run that started more than "
            "%s minutes ago.\nThat prior run has been going for %s hour(s), %s minute(s)"
        )
        error_msg = error_msg % (CELERY_ERROR_REPORT_TIMEOUT_SECONDS // 60,
                                 str(int(timedelta_since_last_run.total_seconds() / 60 / 60)),
                                 str(int(timedelta_since_last_run.total_seconds() / 60 % 60)))
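        # Worked example, assuming a lock held for 7,500 seconds: int(7500 / 60 / 60) -> 2
        # hour(s) and int(7500 / 60 % 60) -> 5 minute(s), i.e. "2 hour(s), 5 minute(s)".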
        
        if timedelta_since_last_run.total_seconds() > CELERY_ERROR_REPORT_TIMEOUT_SECONDS * 4:
            error_msg = "DATA PROCESSING OVERLOADED, CHECK SERVER.\n" + error_msg
            email_system_administrators(error_msg, "DATA PROCESSING OVERLOADED, CHECK SERVER")
        raise ProcessingOverlapError(error_msg)
    parser.add_argument('--delete_survey', help='Removes all of the surveys for a specified study.',
        nargs=1, type=str)

    parser.add_argument('--get_survey', help='Use the mobile API to retrieve all surveys for participant.',
        nargs=1, type=str)

    parser.add_argument('--create_participant_survey', help='Create a survey for a participant using the contents of a JSON file.',
        nargs=2, type=str)

    parser.add_argument('--send_participant_message', help='Create a survey for a participant with a single informational text question containing the given string.',
        nargs=2, type=str)

    args = parser.parse_args()

    if args.unlock_fileprocessing_lock:
        FileProcessLock.unlock()
        print('Unlocked')

    if args.write_survey_config:

        try:
            study = Study.objects.get(pk=int(args.write_survey_config[0]))
        except Study.DoesNotExist:
            print("Could not find study {0}".format(args.write_survey_config[0]))
            raise

        survey_config = {}

        for survey in study.surveys.filter(deleted=False):

            if survey.deleted:
                continue  # redundant with the deleted=False filter above; skip defensively
def create_file_processing_tasks():
    # The entire code is wrapped in an ErrorSentry, which catches any errors
    # and sends them to Sentry.
    with make_error_sentry('data') as error_sentry:
        print(error_sentry.sentry_client.is_enabled())
        if FileProcessLock.islocked():
            # This is really a safety check to ensure that no code executes
            # if file processing is locked.
            report_file_processing_locked_and_exit()
            # report_file_processing_locked_and_exit should raise an error; this should be unreachable
            exit(0)
        else:
            FileProcessLock.lock()

        print("starting.")
        now = datetime.now()
        expiry = now + timedelta(minutes=CELERY_EXPIRY_MINUTES)
        participant_set = Participant.objects.filter(
            files_to_process__isnull=False).distinct().values_list("id",
                                                                   flat=True)
        running = []

        for participant_id in participant_set:
            # Queue all users' file processing, and generate a list of currently running jobs
            # to use to detect when all jobs are finished running.
            running.append(
                safe_queue_user(args=[participant_id],
                                max_retries=0,
                                expires=expiry,
                                task_track_started=True,
                                task_publish_retry=False,
                                retry=False))
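            # These kwargs (max_retries, expires, task_track_started, retry) are presumably
            # forwarded to Celery's apply_async by safe_queue_user; the expiry keeps stale
            # tasks from starting long after this scheduling pass has moved on.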

        print("tasks:", running)

        # If there are any Celery tasks still running, check their state and update the running
        # list accordingly. Do this every 5 seconds.
        while running:
            new_running = []
            failed = []
            successful = []
            for future in running:
                ####################################################################################
                # This attribute can change on a separate thread.  We need its value as it was at
                # this snapshot in time, so we read it once into a local name (the state string is
                # immutable, so the snapshot cannot change underneath us).
                ####################################################################################
                state = future.state
                if state == SUCCESS:
                    successful.append(future)
                if state in FAILED:
                    failed.append(future)
                if state in STARTED_OR_WAITING:
                    new_running.append(future)

            running = new_running
            print("tasks:", running)
            if running:
                sleep(5)

        print("Finished, unlocking.")
        # The unlocking MUST be **inside** the with statement.
        FileProcessLock.unlock()
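
# A minimal sketch (an assumption, not this project's actual constants module) of how the
# SUCCESS / FAILED / STARTED_OR_WAITING values polled above could be derived from
# celery.states; the real definitions may group the states differently:
#
#     from celery import states
#
#     SUCCESS = states.SUCCESS
#     FAILED = (states.FAILURE, states.REVOKED, states.REJECTED)
#     STARTED_OR_WAITING = (states.PENDING, states.RECEIVED, states.STARTED, states.RETRY)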