Example #1
def get_output_files(job_id):
    """
    Get the current list of output files from the output directory.

    This command trusts that whatever is in the output directory at the
    time it is called is the correct list of output files.

    parameter:
    job_id, integer

    returns: list of JSAProcFileInfo objects.
    Each object contains a plain filename, with no path attached.
    """

    # find output_dir
    output_dir = get_output_dir(job_id)

    # Check it exists and is a directory: raise error if not
    if not os.path.exists(output_dir) or not os.path.isdir(output_dir):
        raise JSAProcError(
            'The output directory %s for job %i does not exist' %
            (output_dir, job_id))

    # Get list of files in directory:
    contents = os.listdir(output_dir)

    return [JSAProcFileInfo(x, get_md5sum(os.path.join(output_dir, x)))
            for x in contents]
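A minimal usage sketch for reference (the job ID is illustrative, and the
filename/md5 attributes are assumed from the JSAProcFileInfo usage in the
e-transfer example later in this list):

for info in get_output_files(42):
    # Each entry carries a bare filename plus its MD5 checksum.
    print(info.filename, info.md5)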
Example #2
def assemble_parent_data_for_job(
        job_id, parent_job_id, parent_files, force_new=False):
    """
    This routine ensures that all the input data from parent jobs is
    available for running a job.

    It takes in the current job_id, the job_id of the parent job
    it is assembling data for, and the list of parent_files it
    needs to find for that job.

    The option 'force_new' will force this function to ignore data already
    in the input file directory.

    It will first look in the output data directory of the parent job,
    and if a file is not there it will download it from CADC.
    """

    input_directory = setup_input_directory(job_id)

    # Output directory for parent jobs.
    dirpath = get_output_dir(parent_job_id)

    # List to hold full paths to input files.
    files_list = []
    for f in parent_files:

        # First of all check if file is already in input directory.
        filepath = is_file_in_a_dir(f, input_directory)
        if filepath and not force_new:
            files_list.append(filepath)
        else:

            # Then check if file is in parent output directory, copy
            # it in if so.
            logger.debug('Parent file: %s', f)
            logger.debug('dirpath = %s', dirpath)
            filepath = is_file_in_a_dir(f, dirpath)
            logger.debug('filepath = %s', filepath)
            if filepath:
                shutil.copy(filepath, input_directory)
                filepath = os.path.join(input_directory,
                                        os.path.split(filepath)[1])
                files_list.append(filepath)

            else:
                filepath = fetch_cadc_file(f, input_directory)
                valid = valid_hds(filepath)
                # If the downloaded file is not valid:
                if not valid:
                    # Move the invalid file to a different directory and
                    # raise an error.
                    invalid_dir = setup_invalid_dir(input_directory)
                    invalid_file = os.path.join(invalid_dir,
                                                os.path.split(filepath)[1])
                    shutil.move(filepath, invalid_file)
                    raise JSAProcError(
                        'Invalid HDS file %s fetched from CADC' % f)
                # Otherwise record the downloaded file as an input.
                files_list.append(filepath)
    return files_list
Example #3
def check_data_already_present(job_id, db):
    """
    Check if all data are present already on disk,
    outside of the input directory.

    This is intended to be run by the statemachine.

    If all data are present, it will return a list with
    each input file (from the input table or parent table) and
    its full path.

    It does not check this job's input directory for files.

    It does not copy any files into the input directory.
    """

    try:
        input_file_list = db.get_input_files(job_id)
        inputs = get_jac_input_data(input_file_list)
    except NoRowsError:
        inputs = []

    try:
        parents = db.get_parents(job_id, with_state=True)
        for p, filts, parent_state in parents:
            if parent_state not in JSAProcState.STATE_POST_RUN:
                raise ParentNotReadyError('Parent job {} is not ready'.format(p))

            outputs = db.get_output_files(p)
            parent_files = filter_file_list(outputs, filts)
            for f in parent_files:
                filepath = is_file_in_a_dir(f, get_output_dir(p))
                if not filepath:
                    raise NotAtJACError(f)
                else:
                    inputs.append(filepath)
    except NoRowsError:
        pass
    return inputs
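A hedged sketch of how the state machine might consume this check; the
exception classes match those raised above, but the recovery actions are
illustrative only:

try:
    inputs = check_data_already_present(job_id, db)
except NotAtJACError:
    # Some parent output is not on disk yet; it has to be fetched first.
    inputs = None
except ParentNotReadyError:
    # A parent job has not reached a post-run state; retry later.
    inputs = None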
Example #4
def prepare_job_preview(job_id, preview, type_='png'):
    """
    Prepare a preview image for a job.

    Return the path to the preview image.
    """

    valid_preview = valid_preview_patterns.get(type_)

    if valid_preview is None:
        raise HTTPError('Unexpected preview type requested')

    if not valid_preview.match(preview):
        raise HTTPError('Invalid preview filename')

    preview_path = os.path.join(get_output_dir(job_id), preview)

    if not os.path.exists(preview_path):
        raise HTTPNotFound()

    return preview_path
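The valid_preview_patterns mapping is not shown in this example. From the
way it is used (a dict lookup by type followed by .match on the filename)
it is presumably a dict of compiled regular expressions keyed by file
type, along these assumed lines:

import re

valid_preview_patterns = {
    'png': re.compile(r'^[-_a-z0-9]+\.png$'),
    'pdf': re.compile(r'^[-_a-z0-9]+\.pdf$'),
}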
Example #5
    def test_directories(self):
        # Test all the directory functions.
        self.assertEqual(
            get_input_dir(18),
            '/net/kamaka/export/data/jsa_proc/input/000/000000/000000018')

        self.assertEqual(
            get_output_dir(46),
            '/net/kamaka/export/data/jsa_proc/output/000/000000/000000046')

        self.assertEqual(
            get_scratch_dir(92),
            '/export/data/jsa_proc/scratch/000/000000/000000092')

        self.assertEqual(
            get_log_dir(844),
            '/net/kamaka/export/data/jsa_proc/log/000/000000/000000844')

        # Test longer job ID numbers (we know all the functions use the same
        # private function to prepare the decimal number internally).
        self.assertEqual(
            get_log_dir(123456789),
            '/net/kamaka/export/data/jsa_proc/log/123/123456/123456789')

        self.assertEqual(
            get_log_dir(22333),
            '/net/kamaka/export/data/jsa_proc/log/000/000022/000022333')

        self.assertEqual(
            get_log_dir(22333999),
            '/net/kamaka/export/data/jsa_proc/log/022/022333/022333999')

        # Test what happens with a billion or more job IDs.
        self.assertEqual(
            get_log_dir(1999000999),
            '/net/kamaka/export/data/jsa_proc/log/1999/1999000/1999000999')

        with self.assertRaises(JSAProcError):
            get_input_dir('not an integer')
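The expected paths imply a zero-padded, nine-digit decimal split of the
job ID. A minimal sketch of such a helper, assuming this layout (the
actual private function in jsa_proc may differ, and raises JSAProcError
rather than ValueError):

import os

def _decimal_path(base, job_id):
    if not isinstance(job_id, int):
        raise ValueError('job_id must be an integer')
    num = '{0:09d}'.format(job_id)  # e.g. 18 -> '000000018'
    # Split off the last six and last three digits, so job IDs of a
    # billion or more simply grow the leading component.
    return os.path.join(base, num[:-6], num[:-3], num)

# _decimal_path('/net/kamaka/export/data/jsa_proc/log', 1999000999)
# -> '/net/kamaka/export/data/jsa_proc/log/1999/1999000/1999000999'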
Example #6
def transfer_poll(db, task=None, dry_run=False):
    # Get full list of tasks.
    task_info = db.get_task_info()

    # Determine whether we are already the correct user
    # on the correct machine for e-transfer or not.
    try:
        etransfer_check_config()
        etransfer_needs_ssh = False
    except Exception:
        etransfer_needs_ssh = True

    logger.info('Starting check for jobs to transfer')
    n_err = 0

    for job in db.find_jobs(location='JAC',
                            state=JSAProcState.PROCESSED,
                            task=task):
        try:
            job_task_info = task_info.get(job.task)

            if job_task_info is None:
                # Don't know if this should be e-transferred or not,
                # so do nothing for now.
                # Eventually this should probably raise an
                # error, if we wish to ensure all tasks are
                # entered in the task table.
                logger.debug(
                    'Processed job %i unchanged: ' +
                    'no etransfer option for task', job.id)

            elif job_task_info.command_xfer is not None:
                logger.debug(
                    'Running custom transfer command '
                    'for processed job %i', job.id)

                if not dry_run:
                    # The job is transferred by a custom process.
                    # Mark the job as transferring while this runs.
                    db.change_state(job.id,
                                    JSAProcState.TRANSFERRING,
                                    'Transferring via custom command',
                                    state_prev=JSAProcState.PROCESSED)

                    out_dir = get_output_dir(job.id)

                    try:
                        with open_log_file(job.id, 'transfer') as log:
                            subprocess.check_call([
                                job_task_info.command_xfer,
                                '--transdir',
                                out_dir,
                            ],
                                                  shell=False,
                                                  cwd='/tmp',
                                                  stdout=log,
                                                  stderr=subprocess.STDOUT,
                                                  preexec_fn=restore_signals)

                        # Change the state to complete, unless we have a custom
                        # ingestion command to run.
                        db.change_state(
                            job.id, (JSAProcState.COMPLETE
                                     if job_task_info.command_ingest is None
                                     else JSAProcState.INGESTION),
                            'Custom transfer completed successfully',
                            state_prev=JSAProcState.TRANSFERRING)

                    except subprocess.CalledProcessError as e:
                        logger.exception(
                            'Custom transfer command failed '
                            'for processed job %i', job.id)

                        db.change_state(job.id,
                                        JSAProcState.ERROR,
                                        'Custom transfer failed',
                                        state_prev=JSAProcState.TRANSFERRING)

                        n_err += 1

            elif job_task_info.etransfer is None:
                # If etransfer is set to None, don't etransfer
                # but also don't move to complete.
                logger.debug(
                    'Processed job %i unchanged: ' +
                    'task etransfer option is NULL', job.id)

            elif not job_task_info.etransfer:
                # If e-transfer is not required, then the job is now
                # complete (only done if etransfer argument is False).
                # Don't validate output when "raw_output" specified.
                if job_task_info.raw_output or validate_output(
                        job.id, db, dry_run=dry_run):
                    if not dry_run:
                        db.change_state(
                            job.id,
                            JSAProcState.COMPLETE,
                            'Processed job is COMPLETE (no etransfer)',
                            state_prev=JSAProcState.PROCESSED)
                    logger.debug(
                        'Processed job %i moved to ' +
                        'COMPLETE (no etransfer)', job.id)

            else:
                # If this task should be e-transferred, attempt to
                # add to e-transfer and move to TRANSFERRING.
                if validate_output(job.id, db, dry_run=dry_run):
                    # Only e-transfer via SSH if needed.
                    if etransfer_needs_ssh:
                        logger.debug(
                            'E-transferring output '
                            'of job %i via SSH', job.id)
                        if not dry_run:
                            ssh_etransfer_send_output(job.id)
                    else:
                        logger.debug(
                            'E-transferring output '
                            'of job %i directly', job.id)
                        if not dry_run:
                            etransfer_send_output(job.id)

        except Exception:
            logger.exception('Error while transferring job %i', job.id)
            n_err += 1

    logger.info('Done checking for jobs to transfer')

    return False if n_err else True
Example #7
from jsa_proc.config import get_database
from jsa_proc.state import JSAProcState
from jsa_proc.admin.directories import get_output_dir
import os

db = get_database()

jobs = db.find_jobs(state=JSAProcState.COMPLETE,
                    task="cal-s2-noext",
                    outputs="%.sdf")

for job in jobs:
    outputdir = get_output_dir(job.id)
    print(outputdir)

    # To copy the output files into the current directory:
    # for eachfile in job.outputs:
    #     os.system('cp ' + outputdir + '/' + str(eachfile) + ' .')
Example #8
def _etransfer_send(job_id, dry_run, db, force):
    """Private function to copy job output into the e-transfer
    directories.

    Runs under the ErrorDecorator so that errors are captured.
    """

    config = get_config()
    scratchdir = config.get('etransfer', 'scratchdir')
    transdir = config.get('etransfer', 'transdir')
    group_id = grp.getgrnam(config.get('etransfer', 'group')).gr_gid

    logger.debug('Retrieving list of output files')
    try:
        file_info = db.get_output_files(job_id, with_info=True)
        files = [x.filename for x in file_info]

    except NoRowsError:
        raise CommandError('No output files found for job {0}'.format(job_id))

    logger.debug('Checking that the MD5 sum for each file is defined')
    for info in file_info:
        if info.md5 is None:
            raise CommandError('File {0} MD5 sum is missing from database'.
                               format(info.filename))

    logger.debug('Checking that all files are present')
    outdir = get_output_dir(job_id)
    for file in files:
        if not os.path.exists(os.path.join(outdir, file)):
            raise CommandError('File {0} not in directory {1}'.
                               format(file, outdir))

    logger.debug('Checking that files are not in the scratch directory')
    scratchfiles = os.listdir(scratchdir)
    for file in files:
        if file in scratchfiles:
            raise CommandError('File {0} is in e-transfer scratch directory'.
                               format(file))

    logger.debug('Checking whether the files are already in e-transfer')
    etransfer_status = etransfer_file_status(files)
    if any(etransfer_status):
        for (file, status) in zip(files, etransfer_status):
            if status is not None:
                (ok, dir) = status
                logger.error('File {0} already in e-transfer directory {1}'.
                             format(file, dir))
        raise CommandError('Some files are already in e-transfer directories')

    for info in file_info:
        file = info.filename
        cadc_file_info = fetch_cadc_file_info(file)

        if cadc_file_info is not None:
            # We need to check whether the file is not, in fact, different
            # from the current version, because in that case we are not
            # allowed to "replace" it.
            cadc_file_md5 = cadc_file_info['content-md5']

            if info.md5 == cadc_file_md5:
                logger.info('File %s is unchanged, skipping replacement',
                            file)
                continue

            target_type = 'replace'
        else:
            target_type = 'new'

        logger.info('Placing file %s in "%s" directory', file, target_type)

        source_file = os.path.join(outdir, file)
        scratch_file = os.path.join(scratchdir, file)
        target_file = os.path.join(transdir, target_type, file)

        if not dry_run:
            # Copy the file into the scratch directory and prepare its
            # file permissions.
            shutil.copyfile(source_file, scratch_file)
            os.chown(scratch_file, -1, group_id)
            os.chmod(scratch_file, 0o664)

            # Move the file to the target directory.  This is done so that
            # the file appears atomically in the target directory in order
            # to prevent the e-transfer system seeing only part of the file.
            os.rename(scratch_file, target_file)

        else:
            logger.debug('Skipping e-transfer (DRY RUN)')

    # Finally set the state of the job to TRANSFERRING
    if not dry_run:
        db.change_state(
            job_id, JSAProcState.TRANSFERRING,
            'Output files have been copied into the e-transfer directories',
            state_prev=(None if force else JSAProcState.PROCESSED))
Example #9
def transfer_poll(db):
    # Get full list of tasks.
    task_info = db.get_task_info()

    # Determine whether we are already the correct user
    # on the correct machine for e-transfer or not.
    try:
        etransfer_check_config()
        etransfer_needs_ssh = False
    except Exception:
        etransfer_needs_ssh = True

    logger.info('Starting check for jobs to transfer')
    n_err = 0

    for job in db.find_jobs(location='JAC', state=JSAProcState.PROCESSED):
        try:
            job_task_info = task_info.get(job.task)

            if job_task_info is None:
                # Don't know if this should be e-transferred or not,
                # so do nothing for now.
                # Eventually this should probably raise an
                # error, if we wish to ensure all tasks are
                # entered in the task table.
                logger.debug('Processed job %i unchanged: ' +
                             'no etransfer option for task',
                             job.id)

            elif job_task_info.command_xfer is not None:
                # The job is transferred by a custom process.
                # Mark the job as transferring while this runs.
                db.change_state(job.id, JSAProcState.TRANSFERRING,
                                'Transferring via custom command',
                                state_prev=JSAProcState.PROCESSED)

                logger.debug('Running custom transfer command '
                             'for processed job %i',
                             job.id)
                out_dir = get_output_dir(job.id)

                try:
                    with open_log_file(job.id, 'transfer') as log:
                        subprocess.check_call(
                            [
                                job_task_info.command_xfer,
                                '--transdir', out_dir,
                            ],
                            shell=False,
                            cwd='/tmp',
                            stdout=log,
                            stderr=subprocess.STDOUT,
                            preexec_fn=restore_signals)

                    # Change the state to complete, unless we have a custom
                    # ingestion command to run.
                    db.change_state(
                        job.id,
                        (JSAProcState.COMPLETE
                            if job_task_info.command_ingest is None
                            else JSAProcState.INGESTION),
                        'Custom transfer completed successfully',
                        state_prev=JSAProcState.TRANSFERRING)

                except subprocess.CalledProcessError as e:
                    logger.exception('Custom transfer command failed '
                                     'for processed job %i',
                                     job.id)

                    db.change_state(job.id, JSAProcState.ERROR,
                                    'Custom transfer failed',
                                    state_prev=JSAProcState.TRANSFERRING)

                    n_err += 1

            elif job_task_info.etransfer is None:
                # If etransfer is set to None, don't etransfer
                # but also don't move to complete.
                logger.debug('Processed job %i unchanged: ' +
                             'task etransfer option is NULL',
                             job.id)

            elif not job_task_info.etransfer:
                # If e-transfer is not required, then the job is now
                # complete (only done if etransfer argument is False).
                # Don't validate output when "raw_output" specified.
                if job_task_info.raw_output or validate_output(job.id, db):
                    db.change_state(
                        job.id, JSAProcState.COMPLETE,
                        'Processed job is COMPLETE (no etransfer)',
                        state_prev=JSAProcState.PROCESSED)
                    logger.debug('Processed job %i moved to ' +
                                 'COMPLETE (no etransfer)',
                                 job.id)

            else:
                # If this task should be e-transferred, attempt to
                # add to e-transfer and move to TRANSFERRING.
                if validate_output(job.id, db):
                    # Only e-transfer via SSH if needed.
                    if etransfer_needs_ssh:
                        logger.debug('E-transferring output '
                                     'of job %i via SSH', job.id)
                        ssh_etransfer_send_output(job.id)
                    else:
                        logger.debug('E-transferring output '
                                     'of job %i directly', job.id)
                        etransfer_send_output(job.id)

        except Exception:
            logger.exception('Error while transferring job %i', job.id)
            n_err += 1

    logger.info('Done checking for jobs to transfer')

    return False if n_err else True
Example #10
def _fetch_job_output(job_id, db, force=False, dry_run=False):
    """Private function to perform retrieval of job output files from CADC.
    """

    # Check we have sufficient disk space for fetching to occur.
    output_space = get_output_dir_space()
    required_space = float(get_config().get('disk_limit', 'fetch_min_space'))

    if output_space < required_space and not force:
        logger.warning('Insufficient disk space: %f / %f GiB required',
                       output_space, required_space)
        return

    logger.info('About to retrieve output data for job %i', job_id)

    # Change state from INGEST_QUEUE to INGEST_FETCH.
    if not dry_run:
        try:
            db.change_state(
                job_id,
                JSAProcState.INGEST_FETCH,
                'Output data are being retrieved',
                state_prev=(None if force else JSAProcState.INGEST_QUEUE))
        except NoRowsError:
            logger.error(
                'Job %i cannot have output data fetched'
                ' as it is not waiting for reingestion', job_id)
            return

    # Check state of output files.
    output_dir = get_output_dir(job_id)
    output_files = db.get_output_files(job_id, with_info=True)
    missing_files = []

    for file in output_files:
        filename = file.filename
        filepath = os.path.join(output_dir, filename)

        if os.path.exists(filepath):
            # If we still have the file, check its MD5 sum is correct.
            if file.md5 is None:
                logger.warning('PRESENT without MD5 sum: %s', filename)
            elif file.md5 == get_md5sum(filepath):
                logger.debug('PRESENT: %s', filename)
            else:
                raise JSAProcError(
                    'MD5 sum mismatch for existing file {0}'.format(filename))

        else:
            # Otherwise add it to the list of missing files.
            logger.debug('MISSING: %s', filename)
            missing_files.append(file)

    # Are there any files we need to retrieve?
    if missing_files:
        for file in missing_files:
            filename = file.filename
            filepath = os.path.join(output_dir, filename)

            if not dry_run:
                if os.path.exists(output_dir):
                    logger.debug('Directory %s already exists', output_dir)
                else:
                    logger.debug('Making directory %s', output_dir)
                    os.makedirs(output_dir)

                logger.info('Fetching file %s', filename)
                fetch_cadc_file(filename, output_dir, suffix='')

                if file.md5 is None:
                    logger.warning('MD5 sum missing: %s', filename)
                elif file.md5 == get_md5sum(filepath):
                    logger.debug('MD5 sum OK: %s', filename)
                else:
                    raise JSAProcError(
                        'MD5 sum mismatch for fetched file {0}'.format(
                            filename))
            else:
                logger.info('Skipping fetch of %s (DRY RUN)', filename)

    else:
        logger.info('All output files are already present')

    # Finally set the state to INGESTION.
    if not dry_run:
        db.change_state(job_id,
                        JSAProcState.INGESTION,
                        'Output data have been retrieved',
                        state_prev=JSAProcState.INGEST_FETCH)
Example #11
def jsawrapdr_run(job_id, input_file_list, mode, drparameters,
                  cleanup='cadc', location='JAC', persist=False,
                  jsawrapdr=None, starlink_dir=None,
                  version=None, command_run=None,
                  raw_output=None):
    """
    Execute jsawrapdr script from python.

    This function calls jsawrapdr with the following options:

    jsawrapdr --outdir=configbase/scratch/$job_id
              --inputs=input_file_list
              --id = jac-$job_id
              --mode=$mode
              --drparameters=$drparameters
              --cleanup=$cleanup (cadc by default)
              --location=$location (JAC by default)
              --fileversion=$version (if not None)
              --drcommand=$command_run (if not None)

         if persist is True, then it adds the flag:
              -persist

         if raw_output is True, it adds the option:
              --rawoutput

    Args:

      job_id (int): Job identifier from jsaproc database.

      input_file_list (str): List of files (with extensions and full
        path).

      mode (str): Can be 'night', 'obs', 'public' or 'project'.

      drparameters (str):

      cleanup (str, optional): Type of cleanup. Can be one of
        'cadc'|'none'|'all', defaults to 'cadc'.

      persist (bool, optional): Defaults to False. If persist is turned
        on, then dpCapture will copy acceptable products to the
        default output directory. Otherwise it won't (used for
        debugging purposes). The output directory is determined by
        jsa_proc.admin.directories 'get_output_dir' for the given
        job_id.

      location (str, optional): One of 'cadc' or 'JAC' (NOT CURRENTLY
        IMPLEMENTED, default is 'JAC').


      jsawrapdr (str, optional): The path to jsawrapdr. If not given,
        the one in configured starlink will be used.

      starlink_dir (str, optional): The path of a starlink install to
        use. If not given, the one found in the configuration file will be
        used.

      version: CADC file name "version" or None to use default.

      command_run: custom "run" command to be passed to jsawrapdr.

    Returns:
      str: The filename (including path) of the logfile.

    """

    # Get log directory.  Note that opening a log file in this
    # directory using open_log_file will ensure that it exists.
    log_dir = get_log_dir(job_id)

    # Prepare scratch directory.
    scratch_dir = make_temp_scratch_dir(job_id)

    # Get output directory name.
    out_dir = get_output_dir(job_id)

    # If output dir currently exists, delete the directory.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)

    # Make the "transfer" directory in advance.  (This saves dpCapture
    # or another copying routine from having to do so.)
    os.makedirs(out_dir)

    # Find paths to starlink, jsawrapdr and orac_dr.
    config = get_config()

    if starlink_dir is None:
        starpath = config.get('job_run', 'starpath')
    else:
        starpath = starlink_dir
    if not jsawrapdr:
        jsawrapdr = os.path.join(starpath, 'Perl', 'bin', 'jsawrapdr')
    orac_dir = os.path.join(starpath, 'bin', 'oracdr', 'src')

    # Set the JAC recipe id.
    jacid = 'jac-'+str(job_id)

    # Collect the jsawrapdr arguments.
    jsawrapdrcom = [jsawrapdr,
                    '--debugxfer',
                    '--outdir='+scratch_dir,
                    '--inputs='+input_file_list,
                    '--id='+jacid,
                    '--mode='+mode,
                    '--cleanup='+cleanup,
                    '--drparameters='+drparameters]
    if persist:
        jsawrapdrcom.append('-persist')
        jsawrapdrcom.append('--transdir='+out_dir)

    if raw_output:
        jsawrapdrcom.append('--rawoutput')

    if version is not None:
        jsawrapdrcom.append('--fileversion={0}'.format(version))

    if command_run is not None:
        jsawrapdrcom.append('--drcommand={0}'.format(command_run))

    # Set up the environment for running jsawrapdr.
    jsa_env = os.environ.copy()
    jsa_env = setup_starlink(starpath, jsa_env)

    # Add in the LOGDIR
    jsa_env['ORAC_LOGDIR'] = log_dir

    # Delete any log.* files left in the ORAC_LOGDIR by previous runs.
    if os.path.exists(log_dir):
        calculation_logs = glob.glob(os.path.join(log_dir, 'log.*'))
        for cl in calculation_logs:
            os.remove(cl)

    # Open a log file and run jsawrapdr while saving output to log.
    with open_log_file(job_id, 'jsawrapdr') as log:

        # Save the log file name.
        log_name = log.name

        # Run jsawrapdr.
        retcode = subprocess.call(jsawrapdrcom, env=jsa_env, bufsize=1,
                                  stdout=log, stderr=subprocess.STDOUT,
                                  preexec_fn=restore_signals)

    # Handle jsawrapdr errors.
    if retcode != 0:
        errormessage = 'jsawrapdr exited with Retcode %i ' % (retcode)

        # Find the first ORAC error message in the jsawrapdr log.
        with open(log_name, 'r') as jsalogfile:
            lines = jsalogfile.read()
        result = re.search(r'.*(STDERR:\s*.*)$', lines, re.DOTALL)
        if result:
            firsterror = result.group(1).split('\n')[1]

            # Insert the ORAC error at the start of the error message.
            if firsterror:
                errormessage = 'ORAC ERROR: ' + firsterror + '.\n' + \
                               errormessage

        # Raise the error.
        raise JSAProcError(errormessage)

    return log_name
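A hedged invocation sketch (all argument values are illustrative, not
taken from a real job):

log_file = jsawrapdr_run(
    job_id=42,
    input_file_list='/tmp/job-42-input-files.lis',
    mode='obs',
    drparameters='',
    persist=True)
print('jsawrapdr log written to', log_file)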
Example #12
def _perform_ingestion(job_id, db, command_ingest=None):
    """Private function to peform the ingestion.

    Runs under the ErrorDecorator to capture errors.  Sets the job state
    to COMPLETE if it finishes successfully, or ERROR otherwise.
    """

    logger.debug('Preparing to ingest output for job {0}'.format(job_id))

    output_dir = get_output_dir(job_id)

    logger.debug('Checking that output files are present for ingestion')
    try:
        output_files = db.get_output_files(job_id)
        for filename in output_files:
            if not os.path.exists(os.path.join(output_dir, filename)):
                raise JSAProcError(
                    'Output file {0} is missing'.format(filename))
    except NoRowsError:
        raise JSAProcError('Job has no output files to ingest')

    with open_log_file(job_id, 'ingestion') as log:
        try:
            if command_ingest is None:
                scratch_dir = make_temp_scratch_dir(job_id)
                logger.debug('Using scratch directory %s', scratch_dir)

                logger.debug('Invoking jsaingest, log file: %s', log.name)

                subprocess.check_call(
                    [
                        'jsaingest',
                        '--ingest',
                        '--collection', 'JCMT',
                        '--indir', output_dir,
                    ],
                    shell=False,
                    cwd=scratch_dir,
                    stdout=log,
                    stderr=subprocess.STDOUT,
                    preexec_fn=restore_signals)

            else:
                logger.debug(
                    'Invoking custom ingestion script %s, log file: %s',
                    command_ingest, log.name)

                subprocess.check_call(
                    [
                        command_ingest,
                        '--transdir', output_dir,
                    ],
                    shell=False,
                    cwd='/tmp',
                    stdout=log,
                    stderr=subprocess.STDOUT,
                    preexec_fn=restore_signals)

            db.change_state(job_id, JSAProcState.COMPLETE,
                            'Ingestion completed successfully',
                            state_prev=JSAProcState.INGESTING)

            logger.info('Done ingesting output for job {0}'.format(job_id))

        except subprocess.CalledProcessError as e:
            # Attempt to get the first message beginning with ERROR from
            # the log file.

            # Go back to the start of the log and read in the data.
            log.seek(0)
            content = log.read()
            errorline = content[content.find('\nERROR '):].split('\n')[1]

            db.change_state(job_id, JSAProcState.ERROR,
                            'Ingestion failed\n' + errorline)

            logger.exception('Error during ingestion of job %i', job_id)
Example #13
def make_output_file_list(db, job_id, preview_filter=None):
    """Prepare output file lists for job information pages.
    """

    output_files = []
    previews1024 = []
    previews256 = []

    try:
        for i in sorted(db.get_output_files(job_id)):
            url = None
            mtype = None

            if i.endswith('.png'):
                url = url_for('job_preview', job_id=job_id, preview=i)

                if preview_filter is None or any((f in i for f in preview_filter)):
                    caption = i
                    caption = re.sub('^jcmt_', '', caption)
                    caption = re.sub(r'_(preview_)?\d+\.png', '', caption)

                    if '_256.png' in i:
                        previews256.append(PreviewInfo(url, caption))

                    if '_1024.png' in i:
                        previews1024.append(PreviewInfo(url, caption))

            elif i.endswith('.pdf'):
                url = url_for('job_preview_pdf', job_id=job_id, preview=i)

            elif i.endswith('.txt'):
                url = url_for('job_text_file', job_id=job_id, text_file=i)

            elif i.endswith('.fits'):
                url = 'file://{0}/{1}'.format(get_output_dir(job_id), i)

                if re.search('-cat[0-9]{6}', i):
                    mtype = 'table.load.fits'

                elif re.search('-moc[0-9]{6}', i):
                    # This should be "coverage.load.moc.fits" but neither GAIA
                    # nor Aladin appear to subscribe to that mtype yet.
                    # mtype = 'coverage.load.moc.fits'
                    mtype = 'image.load.fits'

                elif '_rsp_' in i:
                    # Prevent a broadcast button being shown for spectra
                    # for now.
                    mtype = None

                else:
                    mtype = 'image.load.fits'

                # Remove URL for types we can't broadcast.
                if mtype is None:
                    url = None

            output_files.append(FileInfo(i, url, mtype))

    except NoRowsError:
        pass

    return (output_files, previews1024, previews256)