Example 1
def put_cadc_file(filename, input_directory, ad_stream):
    """Put the given file into the CADC archive.

    The CADC AD "stream" for the PUT request must be given.

    Raises a JSAProcError on failure.
    """

    (args, kwargs) = _prepare_cadc_request(filename)
    r = None

    try:
        with open(os.path.join(input_directory, filename), 'rb') as f:
            kwargs['data'] = f
            kwargs['headers'] = {'X-CADC-Stream': ad_stream}

            r = requests.put(*args, **kwargs)

            r.raise_for_status()

            if r.status_code in (200, 201):
                return

    except RequestException as e:
        text = 'no text received' if r is None else r.text
        raise JSAProcError('Error putting CADC file: {0}: {1}'.format(
            str(e), text))

    raise JSAProcError('Putting CADC file gave bad status: {0}: {1}'.format(
        r.status_code, r.text))
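The helper _prepare_cadc_request used here (and again in Examples 11 and 17 below) is not shown. A minimal sketch of what it could return, assuming the public JCMT data URL quoted in the fetch_cadc_file docstring and certificate-based authentication (both assumptions, not confirmed by the source):

import os

CADC_DATA_URL = 'http://www.cadc-ccda.hia-iha.nrc-cnrc.gc.ca/data/pub/JCMT/'


def _prepare_cadc_request(filename):
    """Hypothetical sketch only: build (args, kwargs) for a requests call.

    The real helper may use a different URL, authentication method or
    keyword arguments.
    """

    url = CADC_DATA_URL + filename
    kwargs = {'cert': os.path.expanduser('~/.ssl/cadcproxy.pem')}

    return ([url], kwargs)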
Example 2
def get_output_files(job_id):
    """
    Get the current list of output files from the output directory.

    This command trusts that whatever is in the output directory at the
    time it is called is the correct list of output files.

    parameter:
    job_id, integer

    returns: list of JSAProcFileInfo objects.
    Each object contains a plain filename, with no path attached.
    """

    # find output_dir
    output_dir = get_output_dir(job_id)

    # Check it exists and is a directory: raise error if not
    if not os.path.exists(output_dir) or not os.path.isdir(output_dir):
        raise JSAProcError(
            'The output directory %s for job %i does not exist' %
            (output_dir, job_id))

    # Get list of files in directory:
    contents = os.listdir(output_dir)

    return [JSAProcFileInfo(x, get_md5sum(os.path.join(output_dir, x)))
            for x in contents]
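JSAProcFileInfo is returned here but is not defined in these examples. Judging from the attribute access elsewhere (file.filename and file.md5 in Example 24), a plausible definition is a simple named tuple; this is an assumption, and the real class may carry more fields:

from collections import namedtuple

# Hypothetical definition inferred from usage; not the project's actual code.
JSAProcFileInfo = namedtuple('JSAProcFileInfo', ['filename', 'md5'])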
Example 3
def assemble_input_data_for_job(job_id, input_file_list):
    """
    This routine ensure that all the input data in the input table is
    available for running a job.

     It will check to see if the data is present in either a) the
    /jcmtdata tree, or b) the input directory for this job. If it is
    not present it will download the data from CADC into the input
    directory. It will create the input directory if not present.

    parameters;
    job_id: integer, id of job in job database.

    input_file_list: list of strings.
    iterable of strings, each string being the name of file.
    filenames must not include suffix.

    return: name of file output directory containing one filename per
    string, for every file.
    """

    # Get full path to input directory and make it if it doesn't exist.
    input_directory = setup_input_directory(job_id)

    # For each file, check if its already in JAC data store, or input
    # directory. Download from CADC if its not. Check downloaded files
    # are valid hds.
    files_list = []
    for f in input_file_list:

        filepath = file_in_jac_data_dir(f)

        if filepath:
            files_list.append(filepath)

        else:
            filepath = file_in_dir(f, input_directory)

            if filepath:
                files_list.append(filepath)
            else:
                filepath = fetch_cadc_file(f, input_directory)
                valid = valid_hds(filepath)

                if not valid:

                    # Move invalid file to different directory and raise an
                    # error.
                    invalid_dir = setup_invalid_dir(input_directory)
                    invalid_file = os.path.join(invalid_dir,
                                                os.path.split(filepath)[1])
                    shutil.move(filepath, invalid_file)
                    raise JSAProcError(
                        'Downloaded file %s fails hds validation'
                        ' Moved to %s' % (filepath, invalid_file))
                else:
                    files_list.append(filepath)

    # Return list of files with full paths.
    return files_list
Example 4
def get_jac_data_dir(filename):
    """Guess directory name for a given filename.

    Given a bare raw data filename, return a list of standardized
    directory names giving where that file should be located.
    """

    m = scuba2_file.match(filename)
    if m:
        (subarray, date, obsnum) = m.groups()

        path1 = os.path.join('/jcmtdata/raw/scuba2', subarray, date, obsnum)
        path2 = os.path.join('/jcmtcal/scuba2', subarray, date, obsnum)
        return (path1, path2)

    m = acsis_file.match(filename)
    if m:
        (date, obsnum) = m.groups()

        if date > '20061000':
            path1 = os.path.join('/jcmtdata/raw/acsis/spectra', date, obsnum)
            path2 = os.path.join('/jcmtcal/acsis', date, obsnum)
            return (path1, path2)

        else:
            year = date[0:4]
            path1 = os.path.join('/jcmtdata/raw/acsis-das/converted',
                                 year, date, obsnum)
            return (path1,)

    raise JSAProcError('Filename {0} does not match '
                       'an expected pattern'.format(filename))
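As an illustration, assuming the scuba2_file regular expression captures (subarray, date, obsnum) from a raw file name such as the hypothetical 's4a20130401_00012_0002.sdf', the returned candidate directories would look like this:

# Hypothetical example; the exact file name format depends on scuba2_file.
paths = get_jac_data_dir('s4a20130401_00012_0002.sdf')
# -> ('/jcmtdata/raw/scuba2/s4a/20130401/00012',
#     '/jcmtcal/scuba2/s4a/20130401/00012')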
Example 5
    def obsids_by_pattern(self, pattern, with_productid=False):
        """Retrueve list of obsids matching a given pattern.

        The pattern should be in lower case.
        """

        result = []

        table = self.tap.query(
            'SELECT lower(Observation.observationID), Plane.productID '
            'FROM caom2.Observation as Observation '
            'JOIN caom2.Plane as Plane ON Observation.obsID = Plane.obsID '
            'WHERE ( Observation.collection = \'JCMT\' '
            'AND lower(Observation.observationID) LIKE \'{0}\' '
            'AND Plane.calibrationLevel = 0 '
            ')'.format(pattern))

        if table is None:
            raise JSAProcError(
                'Failed TAP query for observation ID like {0}'.format(pattern))

        for (obsid, productid) in table:
            if with_productid:
                result.append((obsid, productid))
            else:
                result.append(obsid)

        return result
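The pattern is inserted directly into a TAP LIKE clause, so SQL wildcards such as '%' apply and the pattern should be lower case. A usage sketch with an illustrative observation ID prefix (the instance name caom2 and the pattern itself are assumptions):

# '%' is the SQL LIKE wildcard; the prefix is purely illustrative.
obsids = caom2.obsids_by_pattern('scuba2_00022_20130401t%')

# With product IDs included:
for obsid, productid in caom2.obsids_by_pattern(
        'scuba2_00022_20130401t%', with_productid=True):
    print(obsid, productid)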
Example 6
    def get_info(cls, state):
        """Return a StateInfo object describing the state.

        Raises JSAProcError if the state does not exist.
        """

        try:
            return cls._info[state]
        except KeyError:
            raise JSAProcError('Unknown state code {0}'.format(state))
Example 7
    def get_name(cls, state):
        """Return the human-readable name of the state.

        Raises JSAProcError if the state does not exist.
        """

        try:
            return cls._info[state].name
        except KeyError:
            raise JSAProcError('Unknown state code {0}'.format(state))
Example 8
    def files_by_pattern(self, pattern):
        """Retrieve list of files matching a given pattern.
        """

        try:
            r = requests.get(self.jcmt_info_url, params={'file': pattern})
            r.raise_for_status()
            return latin_1_encode(r.text)[0].strip().split('\n')

        except HTTPError as e:
            raise JSAProcError('Error fetching CADC file list: ' + str(e))
Example 9
    def lookup_name(cls, name):
        """Return the state code corresponding to the given name.

        Raises JSAProcError if the state name is not recognised.

        Names are compared in a case-insensitive manner.
        """

        lowername = name.lower()

        for (state, info) in cls._info.items():
            if lowername == info.name.lower():
                return state

        raise JSAProcError('Unknown state name {0}'.format(name))
Example 10
def identifier_to_pattern(identifier, patterns):
    """Look for a suitable pattern for an identifier.

    Takes a list of (regexp, pattern) pairs.  Returns the
    pattern substituted with the regexp match groups
    for the first matching regexp.
    """

    for (regexp, pattern) in patterns:
        match = regexp.match(identifier)

        if match:
            return pattern.format(*match.groups())

    raise JSAProcError('Pattern for "{0}" not recognised'.format(identifier))
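A self-contained usage sketch; the regular expressions and pattern strings below are illustrative, not the ones used in production:

import re

patterns = [
    (re.compile(r'^jcmt_(\d{8})_(\d{5})$'), 'raw-{0}-obs{1}'),
    (re.compile(r'^tile-(\d+)$'), 'hpx-tile-{0}'),
]

identifier_to_pattern('jcmt_20130401_00012', patterns)  # -> 'raw-20130401-obs00012'
identifier_to_pattern('tile-42', patterns)              # -> 'hpx-tile-42'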
Example 11
def fetch_cadc_file(filename, output_directory, suffix='.sdf'):
    """
    Routine which will fetch a file from CADC and save it into the output
    directory. It assumes the url is of the form:
    http://www.cadc-ccda.hia-iha.nrc-cnrc.gc.ca/data/pub/JCMT/s4d20130401_00001_0002

    parameters:
    filename, string
    This assumes a filename without extension or path.

    output_directory, string
    Path to save file to.

    suffix: additional suffix to be added to the filename
    before saving to the output directory.
    (string, default: ".sdf")

    Will raise a JSAProcError if it cannot connect.

    Returns the name of the file, including its path.
    """

    # Local name to save to (requests automatically decompresses, so
    # don't need the .gz).
    local_file = filename + suffix
    output_file_path = os.path.join(output_directory, local_file)

    try:
        (args, kwargs) = _prepare_cadc_request(filename)

        # Connect with stream=True for large files.
        kwargs['stream'] = True

        r = requests.get(*args, **kwargs)

        # Check that it worked (raises an error if not okay).
        r.raise_for_status()

        # write out to a file in the requested output directory
        with open(output_file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                f.write(chunk)

    except RequestException as e:
        raise JSAProcError('Error fetching CADC file: ' + str(e))

    return output_file_path
Example 12
def _get_dir(type_, job_id):
    if not isinstance(job_id, int):
        raise JSAProcError('Cannot determine directory '
                           'for non-integer job identifier')

    config = get_config()
    basedir = config.get('directories', type_)

    # Turn the job ID into a decimal string of at least 9
    # digits, then create subdirectories by removing the last 6
    # and then the last 3 digits.  This means that we retain the
    # full length name in the final directory (unlike Git) to
    # try to prevent accidental collisions if the directories are
    # manipulated manually.  The digits are counted back from the
    # end of the decimal string so that any digits in excess of
    # the fixed 9 end up in the first component.
    decimal = '{0:09d}'.format(job_id)
    return os.path.join(basedir, decimal[:-6], decimal[:-3], decimal)
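A worked example of the layout, assuming a hypothetical base directory of '/data/jsa_proc/input':

# _get_dir('input', 1234):
#   decimal      = '000001234'
#   decimal[:-6] = '000'
#   decimal[:-3] = '000001'
#   result       = '/data/jsa_proc/input/000/000001/000001234'
#
# A job ID longer than nine digits keeps its extra digits in the first
# component, e.g. job 1234567890 -> '/data/jsa_proc/input/1234/1234567/1234567890'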
Example 13
    def search_file(self, pattern, archive='JCMT', timeout=300):
        """Retrieve the list of archive files matching the given pattern.

        Returns a list of ADFileInfo objects (file ID and MD5 sum).
        """

        result = []

        table = self.tap.query('SELECT fileID, contentMD5 '
                               'FROM archive_files '
                               'WHERE ('
                               'archiveName = \'{}\' '
                               'AND fileID LIKE \'{}\''
                               ')'.format(archive, pattern),
                               timeout=timeout)

        if table is None:
            raise JSAProcError(
                'Failed TAP query for AD files like {}'.format(pattern))

        for (id_, md5) in table:
            result.append(ADFileInfo(id_, md5))

        return result
Example 14
def get_config():
    """Read the configuration file.

    Returns a SafeConfigParser object.
    """

    global config

    if config is None:
        dir = get_home()
        file = os.path.join(dir, config_file)

        if not os.path.exists(file):
            raise JSAProcError('Config file {0} doesn\'t exist'.format(file))

        config = SafeConfigParser()
        config.read(file)

    return config
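A usage sketch showing the configuration sections referenced elsewhere in these examples ('directories' in Example 12, 'job_run'/'starpath' in Example 25, 'disk_limit'/'fetch_min_space' in Example 24); the key name 'input' and the example values are assumptions:

config = get_config()

input_base = config.get('directories', 'input')   # e.g. /data/jsa_proc/input
starpath = config.get('job_run', 'starpath')       # e.g. /star
min_space = float(config.get('disk_limit', 'fetch_min_space'))  # e.g. 100.0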
Example 15
def get_output_log_files(job_id):
    """
    Get the current list of output log.* files from the log directory.

    This command trusts that whatever is in the log directory and
    starts with "log." is the correct list of output log files.

    Returns: list of bare file names
    """
    log_dir = get_log_dir(job_id)

    if not os.path.exists(log_dir) or not os.path.isdir(log_dir):
        raise JSAProcError(
            'The log directory %s for job %i does not exist.' % (log_dir, job_id))

    # Match file names beginning with "log." (dot escaped).
    pattern = re.compile(r'log\..*')
    logs = [i for i in os.listdir(log_dir) if pattern.match(i)]

    return logs
Example 16
    def __exit__(self, type_, value, tb):
        """Context manager block exit method.

        If the block exited cleanly, commit, otherwise rollback
        the current transaction.  Also closes the cursor object.
        """

        if type_ is None:
            self._conn.commit()
        else:
            self._conn.rollback()

        self._cursor.close()
        del self._cursor

        self._lock.release()

        # If we got a database-specific error, re-raise it as our
        # generic error.  Let other exceptions through unchanged.
        if type_ is not None and issubclass(type_, sqlite3.Error):
            raise JSAProcError(str(value))
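The matching __enter__ method is not shown. Assuming the class holds the _conn, _lock and _cursor attributes used above, a sketch of the counterpart could look like this (hypothetical, not the project's actual code):

    def __enter__(self):
        """Context manager block entry method (hypothetical sketch).

        Acquires the lock and opens a cursor for use within the block.
        """

        self._lock.acquire()
        self._cursor = self._conn.cursor()
        return self._cursor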
Example 17
def fetch_cadc_file_info(filename):
    """Retrieve information about a file in the JCMT archive at CADC.

    This routine works in the same way as fetch_cadc_file but makes
    an HTTP HEAD request instead of an HTTP GET request.
    """

    try:
        (args, kwargs) = _prepare_cadc_request(filename)

        kwargs['allow_redirects'] = True

        r = requests.head(*args, **kwargs)

        if r.status_code == 404:
            return None

        # Check that it worked (raises an error if not okay).
        r.raise_for_status()

        return r.headers

    except RequestException as e:
        raise JSAProcError('Error fetching CADC file info: ' + str(e))
Example 18
def run_a_job(job_id, db=None, force=False):
    """
    Run the JSA processing of the given job_id (integer).

    By default it will look in the database determined by the JSA_proc
    config. Optionally a database object can be given for testing
    purposes.

    """

    if not db:
        # Get link to database
        db = get_database()

    logger.info('About to run job %i', job_id)

    try:
        # Change status of job to Running, raise an error if not currently in
        # WAITING state.
        db.change_state(job_id,
                        JSAProcState.RUNNING,
                        'Job is about to be run on host {0}'.format(
                            gethostname().partition('.')[0]),
                        state_prev=(None if force else JSAProcState.WAITING))

    except NoRowsError:
        # If the job was not in the WAITING state, it is likely that another
        # process is also trying to run it.  Trap the error so that the
        # ErrorDecorator does not put the job into the ERROR state as that
        # will cause the other process to fail to set the job to PROCESSED.
        logger.error('Job %i cannot be run because it is not waiting', job_id)
        return

    # Input file_list -- this should be better? or in jsawrapdr?

    input_dir = get_input_dir(job_id)
    input_file_list_path = os.path.join(input_dir, input_list_name)
    if not os.path.exists(input_file_list_path):
        raise JSAProcError('Input file list %s not found for job_id %i' %
                           (input_file_list_path, job_id))

    # Check every file on input_file list exists.
    inputfl = open(input_file_list_path, 'r')

    for input_file in inputfl:
        input_file = input_file.strip()
        if not os.path.isfile(input_file):

            # If a file is missing, get log.
            logstring = 'Input file %s for job %i has gone missing' % (
                input_file, job_id)
            logger.error(logstring)
            logs = db.get_logs(job_id)
            states = [i.state_new for i in logs]

            # If it has only been in the state MISSING twice before, then try
            # again.
            if states.count(JSAProcState.MISSING) <= 2:
                logstring += ': moving to missing.'
                logger.warning(
                    'Moving job %i to state MISSING due to '
                    'missing file(s) %s', job_id, input_file)
                db.change_state(job_id,
                                JSAProcState.MISSING,
                                logstring,
                                state_prev=JSAProcState.RUNNING)
                return job_id

            else:
                # If it has been in the missing STATE more than two times,
                # give up and move it into ERROR state to be fixed manually.
                logstring += ': moving to error.'
                logger.info(
                    'Moving job %s to state ERROR due to missing'
                    ' file(s).', job_id)
                inputfl.close()
                raise JSAProcError(
                    'Input file %s for job %i has gone missing.' %
                    (input_file, job_id))

    inputfl.close()
    logger.debug('All input files found for job %s.', job_id)

    # Get the mode and drparameters of the job.
    job = db.get_job(id_=job_id)
    mode = job.mode
    drparameters = job.parameters

    # Get the starlink to be used from the task table.
    starpath = None
    version = None
    command_run = None
    raw_output = None
    log_ingest_command = None
    try:
        task_info = db.get_task_info(job.task)
        starpath = task_info.starlink_dir
        version = task_info.version
        command_run = task_info.command_run
        raw_output = task_info.raw_output
        log_ingest_command = task_info.log_ingest
    except NoRowsError:
        # If the task doesn't have task info, leave "starpath" as None
        # so that jsawrapdr_run uses the default value from the configuration
        # file.
        pass

    # Run the processing job.
    logger.debug('Launching jsawrapdr: mode=%s, parameters=%s', mode,
                 drparameters)

    # First of all remove the output files and log_files from the database.
    db.set_log_files(job_id, [])
    db.set_output_files(job_id, [])

    log = jsawrapdr_run(job_id,
                        input_file_list_path,
                        mode,
                        drparameters,
                        cleanup='cadc',
                        location='JAC',
                        starlink_dir=starpath,
                        persist=True,
                        version=version,
                        command_run=command_run,
                        raw_output=raw_output)

    # Create list of output files.
    logger.debug('Preparing list of output files')
    output_files = get_output_files(job_id)

    # write output files to table
    logger.debug('Storing list of output files')
    db.set_output_files(job_id, output_files)

    # Create list of output log files.
    logger.debug('Preparing list of output log files (log.*)')
    log_files = get_output_log_files(job_id)

    # Write output log files to table.
    logger.debug('Storing list of output log files')
    db.set_log_files(job_id, log_files)

    # If a log ingest command is set, run it here.
    if log_ingest_command:
        logger.debug('Will try and ingest log files')
        try:
            with open_log_file(job.id, 'ingest_log') as logingest_log:
                subprocess.check_call(
                    [log_ingest_command, str(job_id)],
                    shell=False,
                    cwd='/tmp',
                    stdout=logingest_log,
                    stderr=subprocess.STDOUT,
                    preexec_fn=restore_signals)
        except subprocess.CalledProcessError as e:
            logger.exception('Custom log ingest failed for job %i', job.id)
            db.change_state(job.id,
                            JSAProcState.ERROR,
                            'Custom log ingestion failed',
                            state_prev=JSAProcState.RUNNING)

    # If task begins with hpx, get tiles from list of output_files
    # and write to tile table in db.
    if hpx_task.search(job.task):
        logger.debug('Storing list of output tiles for HPX job ' + str(job_id))
        tiles = hpx_tiles_from_filenames([x.filename for x in output_files])
        db.set_tilelist(job_id, tiles)
        logger.debug('Job ' + str(job_id) + ' produced output on tiles ' +
                     ', '.join(str(i) for i in tiles))

    # Change state of job.
    db.change_state(job_id,
                    JSAProcState.PROCESSED,
                    'Job has been successfully processed',
                    state_prev=JSAProcState.RUNNING)

    logger.info('Done running job %i', job_id)

    return job_id
Example 19
def get_parents(tile, parenttask, exclude_pointing_jobs=False,
                science_obs_only=False, pointings_only=False):
    """
    get parent jobs for the requested tile and coaddtask,
    using the parettask to look for jobs.
    required parameters:

    Raises a  JSAProcError if there are no parent jobs that fit.
    tile (int)
    Tile number to perform coadd on.

    parenttask (string)
    input task name to look for jobs for.

    """
    # Find all jobs from the parent task which include the requested tile and
    # 1) Have a JSAQA State that is not BAD or INVALID
    # 2) Have not been marked as deleted.
    logger.debug(
        'Finding all jobs in task %s that fall on tile %i',
        parenttask, tile)

    db = get_database()
    qa_state = [JSAQAState.GOOD,
                JSAQAState.QUESTIONABLE,
                JSAQAState.UNKNOWN]

    obsquery = {'omp_status': Not(list(OMPState.STATE_NO_COADD))}
    if science_obs_only:
        obsquery['obstype'] = {'science'}
    if pointings_only:
        obsquery['obstype'] = {'pointing'}
    # Get the parent jobs.
    parentjobs = db.find_jobs(tiles=[tile],
                              task=parenttask,
                              qa_state=qa_state,
                              state=Not([JSAProcState.DELETED]),
                              obsquery=obsquery)

    parentjobs = [p.id for p in parentjobs]

    # Do some other queries to give the user info about what is not being
    # included.
    excludedjobs_ompstatus = db.find_jobs(
        tiles=[tile],
        task=parenttask,
        qa_state=qa_state,
        state=Not([JSAProcState.DELETED]),
        obsquery={'omp_status': OMPState.STATE_NO_COADD}
    )

    if science_obs_only or exclude_pointing_jobs:
        obsquery = {
            'obstype': 'pointing',
            'omp_status': Not(list(OMPState.STATE_NO_COADD)),
        }
        state = Not([JSAProcState.DELETED])
        excludedjobs_pointings = db.find_jobs(tiles=[tile],
                                              task=parenttask,
                                              qa_state=qa_state,
                                              state=state,
                                              obsquery=obsquery)

        # If it was requested to exclude entirely any job containing a
        # pointing:
        if exclude_pointing_jobs and len(excludedjobs_pointings) > 0:
            logger.debug('Tile %i contains pointing obs.', tile)
            raise JSAProcError('Pointings fall on this tile.')

    # Log information about which tasks were excluded.
    # TODO: check what logger level is being used before going through for
    # loops.
    logger.debug(
        '%i jobs in task %s fall on tile %i with appropriate QA States'
        ', OMP States and obstype states', len(parentjobs), parenttask, tile)

    if len(excludedjobs_ompstatus) > 0:
        logger.debug(
            '%i jobs were excluded due to wrong OMP status',
            len(excludedjobs_ompstatus))
        for i in excludedjobs_ompstatus:
            omp_status = db.get_obs_info(i.id)[0].omp_status
            logger.debug(
                'Job %i NOT INCLUDED (omp status of %s)',
                i.id, OMPState.get_name(omp_status))

    if science_obs_only:
        if len(excludedjobs_pointings) > 0:
            logger.debug(
                '%i additional jobs were excluded as pointings',
                len(excludedjobs_pointings))
            for i in excludedjobs_pointings:
                logger.debug('Job %i NOT INCLUDED (pointing)', i.id)

    if len(parentjobs) == 0:
        logger.debug('Tile %i has no acceptable parent jobs', tile)

        raise JSAProcError('No acceptable observations.')

    # Return the parent jobs
    return parentjobs
Example 20
    def check_files(self, filenames):
        """Check whether the given files have been ingested into CAOM-2.

        Returns a boolean list corresponding to the input list.
        """

        # Do we have too many filenames to query at once?
        if len(filenames) > 10:
            result = []

            for part in _partition_list(filenames, 10):
                result.extend(self.check_files(part))

            return result

        uris = {}

        for filename in filenames:
            # CADC now uses file IDs *with* the extension in the JCMT archive.
            fileid = filename

            if not valid_fileid.match(fileid):
                raise JSAProcError('Invalid file ID {0}'.format(fileid))

            uris[filename] = 'ad:JCMT/{0}'.format(fileid)

        logger.debug('SELECT uri, COUNT(*) FROM caom2.Artifact '
                     'WHERE uri IN (' +
                     ', '.join(['\'{0}\''.format(x)
                                for x in uris.values()]) + ') GROUP BY uri')

        table = self.tap.query(
            'SELECT uri, COUNT(*) FROM caom2.Artifact '
            'WHERE uri IN (' +
            ', '.join(['\'{0}\''.format(x)
                       for x in uris.values()]) + ') GROUP BY uri')

        if table is None:
            raise JSAProcError('Failed TAP query for files in CAOM-2')

        counts = {}

        for row in table:
            counts[row[0]] = row[1]

        result = []

        for filename in filenames:
            uri = uris[filename]

            if uri not in counts:
                result.append(False)
                continue

            count = counts[uri]

            if count == 0:
                result.append(False)

            elif count == 1:
                result.append(True)

            elif count > 1:
                logger.warning('Received unexpected artifact count')
                result.append(True)

            else:
                raise JSAProcError('Received unexpected artifact count')

        return result
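The _partition_list helper used above is not shown; a minimal sketch that splits a list into fixed-size chunks would be enough to satisfy this code (an assumption, not the project's actual helper):

def _partition_list(items, size):
    """Yield successive chunks of at most `size` items (hypothetical sketch)."""

    for i in range(0, len(items), size):
        yield items[i:i + size]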
Example 21
def _perform_ingestion(job_id, db, command_ingest=None):
    """Private function to peform the ingestion.

    Runs under the ErrorDecorator to capture errors.  Sets the job state
    to COMPLETE if it finishes successfully, or ERROR otherwise.
    """

    logger.debug('Preparing to ingest output for job {0}'.format(job_id))

    output_dir = get_output_dir(job_id)

    logger.debug('Checking that output files are present for ingestion')
    try:
        output_files = db.get_output_files(job_id)
        for filename in output_files:
            if not os.path.exists(os.path.join(output_dir, filename)):
                raise JSAProcError(
                    'Output file {0} is missing'.format(filename))
    except NoRowsError:
        raise JSAProcError('Job has no output files to ingest')

    with open_log_file(job_id, 'ingestion') as log:
        try:
            if command_ingest is None:
                scratch_dir = make_temp_scratch_dir(job_id)
                logger.debug('Using scratch directory %s', scratch_dir)

                logger.debug('Invoking jsaingest, log file: %s', log.name)

                subprocess.check_call(
                    [
                        'jsaingest',
                        '--ingest',
                        '--collection', 'JCMT',
                        '--indir', output_dir,
                    ],
                    shell=False,
                    cwd=scratch_dir,
                    stdout=log,
                    stderr=subprocess.STDOUT,
                    preexec_fn=restore_signals)

            else:
                logger.debug(
                    'Invoking custom ingestion script %s, log file: %s',
                    command_ingest, log.name)

                subprocess.check_call(
                    [
                        command_ingest,
                        '--transdir', output_dir,
                    ],
                    shell=False,
                    cwd='/tmp',
                    stdout=log,
                    stderr=subprocess.STDOUT,
                    preexec_fn=restore_signals)

            db.change_state(job_id, JSAProcState.COMPLETE,
                            'Ingestion completed successfully',
                            state_prev=JSAProcState.INGESTING)

            logger.info('Done ingesting output for job {0}'.format(job_id))

        except subprocess.CalledProcessError as e:
            # Attempt to get the first message beginning with ERROR from
            # the log file.

            # Go back to the start of the log and read in the data.
            log.seek(0)
            content = '\n'.join(log.readlines())
            errorline = content[content.find('\nERROR '):].split('\n')[1]

            db.change_state(job_id, JSAProcState.ERROR,
                            'Ingestion failed\n' + errorline)

            logger.exception('Error during ingestion of job %i', job_id)
Example 22
def _ingest_raw_observation(obsid, db, dry_run=False):
    """Perform raw ingestion of an observation.

    This internal function requires an OMP database object with write
    access to the JCMT database.  If the ingestion is successful then
    the "last_caom_mod" timestamp for the observation will be updated
    in the COMMON table of the JCMT database.

    Returns True on success, False on failure.
    """

    logger.debug('Starting raw ingestion of OBSID %s', obsid)

    # Determine the date components which we can then use to create the
    # log directory.
    m = obsid_date.search(obsid)
    if not m:
        logger.error('Cannot parse OBSID %s to obtain date', obsid)
        raise JSAProcError('Cannot find date in OBSID {0}'.format(obsid))
    date = m.group(1)
    year = date[0:4]
    month = date[4:6]
    day = date[6:]
    logger.debug('Parsed OBSID, date: %s/%s/%s', month, day, year)

    # Prepare scratch directory.
    if not dry_run:
        scratch_dir = make_misc_scratch_dir('rawingest')
        logger.info('Working directory: %s', scratch_dir)
    else:
        scratch_dir = None

    # Prepare log directory and file name.
    if not dry_run:
        log_dir = os.path.join(get_misc_log_dir('rawingest'), year, month, day)
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        logger.info('Log directory: %s', log_dir)
        log_file = os.path.join(log_dir, '{0}.log'.format(obsid))
        logger.debug('Log file: %s', log_file)
    else:
        log_file = 'DRY_RUN_MODE'

    command = [
        'jsaraw',
        '--collection',
        'JCMT',
        '--obsid',
        obsid,
        '--verbose',
    ]

    try:
        if not dry_run:
            # Use context-manager to open a log file to store the (console)
            # output from the jsaraw program.
            with open(log_file, 'w') as log:
                logger.info('Running %s for OBSID %s', command[0], obsid)
                subprocess.check_call(command,
                                      shell=False,
                                      cwd=scratch_dir,
                                      stdout=log,
                                      stderr=subprocess.STDOUT,
                                      preexec_fn=restore_signals)

                # On success (check_call didn't raise an exception), set the
                # "last_caom_mod" timestamp in the database.
                logger.info('Updating ingestion timestamp in the database')
                db.set_last_caom_mod(obsid)

        else:
            logger.info('Would have run: "%s" (DRY RUN)', ' '.join(command))

    except subprocess.CalledProcessError as e:
        logger.exception('Error during CAOM-2 ingestion')

        try:
            logger.info('Annulling ingestion timestamp in the database')
            db.set_last_caom_mod(obsid, set_null=True)
        except:
            logger.exception('Error marking ingestion date as NULL')

        return False

    except:
        logger.exception('Error marking ingestion date')

        return False

    finally:
        if not dry_run:
            logger.debug('Deleting scratch directory')
            shutil.rmtree(scratch_dir)

    return True
Example 23
    def error(self, message):
        raise JSAProcError('Failed to parse CADC parameters: ' + message)
Example 24
def _fetch_job_output(job_id, db, force=False, dry_run=False):
    """Private function to perform retrieval of job output files from CADC.
    """

    # Check we have sufficient disk space for fetching to occur.
    output_space = get_output_dir_space()
    required_space = float(get_config().get('disk_limit', 'fetch_min_space'))

    if output_space < required_space and not force:
        logger.warning('Insufficient disk space: %f / %f GiB required',
                       output_space, required_space)
        return

    logger.info('About to retrieve output data for job %i', job_id)

    # Change state from INGEST_QUEUE to INGEST_FETCH.
    if not dry_run:
        try:
            db.change_state(
                job_id,
                JSAProcState.INGEST_FETCH,
                'Output data are being retrieved',
                state_prev=(None if force else JSAProcState.INGEST_QUEUE))
        except NoRowsError:
            logger.error(
                'Job %i cannot have output data fetched'
                ' as it is not waiting for reingestion', job_id)
            return

    # Check state of output files.
    output_dir = get_output_dir(job_id)
    output_files = db.get_output_files(job_id, with_info=True)
    missing_files = []

    for file in output_files:
        filename = file.filename
        filepath = os.path.join(output_dir, filename)

        if os.path.exists(filepath):
            # If we still have the file, check its MD5 sum is correct.
            if file.md5 is None:
                logger.warning('PRESENT without MD5 sum: %s', filename)
            elif file.md5 == get_md5sum(filepath):
                logger.debug('PRESENT: %s', filename)
            else:
                raise JSAProcError(
                    'MD5 sum mismatch for existing file {0}'.format(filename))

        else:
            # Otherwise add it to the list of missing files.
            logger.debug('MISSING: %s', filename)
            missing_files.append(file)

    # Are there any files we need to retrieve?
    if missing_files:
        for file in missing_files:
            filename = file.filename
            filepath = os.path.join(output_dir, filename)

            if not dry_run:
                if os.path.exists(output_dir):
                    logger.debug('Directory %s already exists', output_dir)
                else:
                    logger.debug('Making directory %s', output_dir)
                    os.makedirs(output_dir)

                logger.info('Fetching file %s', filename)
                fetch_cadc_file(filename, output_dir, suffix='')

                if file.md5 is None:
                    logger.warning('MD5 sum missing: %s', filename)
                elif file.md5 == get_md5sum(filepath):
                    logger.debug('MD5 sum OK: %s', filename)
                else:
                    raise JSAProcError(
                        'MD5 sum mismatch for fetched file {0}'.format(
                            filename))
            else:
                logger.info('Skipping fetch of %s (DRY RUN)', filename)

    else:
        logger.info('All output files are already present')

    # Finally set the state to INGESTION.
    if not dry_run:
        db.change_state(job_id,
                        JSAProcState.INGESTION,
                        'Output data have been retrieved',
                        state_prev=JSAProcState.INGEST_FETCH)
Example 25
def jsawrapdr_run(job_id, input_file_list, mode, drparameters,
                  cleanup='cadc', location='JAC', persist=False,
                  jsawrapdr=None, starlink_dir=None,
                  version=None, command_run=None,
                  raw_output=None):
    """
    Execute jsawrapdr script from python.

    This function calls jsawrapdr with following options:

    jsawrapdr --outdir=configbase/scratch/$job_id
              --inputs=input_file_list
              --id=jac-$job_id
              --mode=$mode
              --drparameters=$drparameters
              --cleanup=$cleanup (cadc by default)
              --location=$location (JAC by default)
              --fileversion=$version (if not None)
              --drcommand=$command_run (if not None)

         if persist is True, then it adds the flag:
              -persist

         if raw_output is True, it adds the option:
              --rawoutput

    Args:

      job_id (int): Job identifier from jsaproc database.

      input_file_list (str): Path to a file listing the input files
        (with extensions and full paths).

      mode (str): Can be 'night', 'obs', 'public' or 'project'.

      drparameters (str):

      cleanup (str, optional): Type of cleanup. Can be one of
        'cadc'|'none'|'all', defaults to 'cadc'.

      persist (bool, optional): Defaults to False. If persist is turned
        on, then dpCapture will copy acceptable products to the
        default output directory. Otherwise it won't (used for
        debugging purposes). The output directory is determined by
        jsa_proc.admin.directories 'get_output_dir' for the given
        job_id.

      location (str, optional): One of 'cadc' or 'JAC' (NOT CURRENTLY
        IMPLEMENTED; the default is 'JAC').


      jsawrapdr (str, optional): The path to jsawrapdr. If not given,
        the one in configured starlink will be used.

      starlink_dir (str, optional): The path of a starlink install to
        use. If not given, the one found in the configuration file will be
        used.

      version: CADC file name "version" or None to use default.

      command_run: custom "run" command to be passed to jsawrapdr.

    Returns:
      str: The filename (including path) of the logfile.

    """

    # Get log directory.  Note that opening a log file in this
    # directory using open_log_file will ensure that it exists.
    log_dir = get_log_dir(job_id)

    # Prepare scratch directory.
    scratch_dir = make_temp_scratch_dir(job_id)

    # Get output directory name.
    out_dir = get_output_dir(job_id)

    # If output dir currently exists, delete the directory.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)

    # Make the "transfer" directory in advance.  (This saves dpCapture
    # or another copying routine from having to do so.)
    os.makedirs(out_dir)

    # Find paths to starlink, jsawrapdr and orac_dr.
    config = get_config()

    if starlink_dir is None:
        starpath = config.get('job_run', 'starpath')
    else:
        starpath = starlink_dir
    if not jsawrapdr:
        jsawrapdr = os.path.join(starpath, 'Perl', 'bin', 'jsawrapdr')
    orac_dir = os.path.join(starpath, 'bin', 'oracdr', 'src')

    # Set the jac recipe id.
    jacid = 'jac-'+str(job_id)

    # Collect the jsawrapdr arguments.
    jsawrapdrcom = [jsawrapdr,
                    '--debugxfer',
                    '--outdir='+scratch_dir,
                    '--inputs='+input_file_list,
                    '--id='+jacid,
                    '--mode='+mode,
                    '--cleanup='+cleanup,
                    '--drparameters='+drparameters]
    if persist:
        jsawrapdrcom.append('-persist')
        jsawrapdrcom.append('--transdir='+out_dir)

    if raw_output:
        jsawrapdrcom.append('--rawoutput')

    if version is not None:
        jsawrapdrcom.append('--fileversion={0}'.format(version))

    if command_run is not None:
        jsawrapdrcom.append('--drcommand={0}'.format(command_run))

    # Set up the environment for running jsawrapdr.
    jsa_env = os.environ.copy()
    jsa_env = setup_starlink(starpath, jsa_env)

    # Add in the LOGDIR
    jsa_env['ORAC_LOGDIR'] = log_dir

    # Ensure that we delete any previous log.* files in the ORAC_LOGDIR
    # if they exist.
    if os.path.exists(log_dir):
        calculation_logs = glob.glob(os.path.join(log_dir, 'log.*'))
        for cl in calculation_logs:
            os.remove(cl)

    # Open a log file and run jsawrapdr while saving output to log.
    with open_log_file(job_id, 'jsawrapdr') as log:

        # Save the log file name.
        log_name = log.name

        # Run jsawrapdr.
        retcode = subprocess.call(jsawrapdrcom, env=jsa_env, bufsize=1,
                                  stdout=log, stderr=subprocess.STDOUT,
                                  preexec_fn=restore_signals)

    # Handle jsawrapdr errors.
    if retcode != 0:
        errormessage = 'jsawrapdr exited with Retcode %i ' % (retcode)

        # Find the first ORAC error message in the jsawrapdr log.
        with open(log_name, 'r') as jsalogfile:
            lines = jsalogfile.read()
        result = re.search(r'.*(STDERR:\s*.*)$', lines, re.DOTALL)
        if result:
            firsterror = result.group(1).split('\n')[1]

            # Insert the ORAC error at the start of the error message.
            if firsterror:
                errormessage = 'ORAC ERROR: ' + firsterror + '.\n' + \
                               errormessage

        # Raise the error.
        raise JSAProcError(errormessage)

    return log_name
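A hedged usage sketch of the call; every argument value below, including the input list path and the DR parameters, is illustrative only:

# Hypothetical call: paths, mode and parameters are illustrative, not real.
log_name = jsawrapdr_run(
    job_id=1234,
    input_file_list='/data/jsa_proc/input/000/000001/000001234/input_files_job.lis',
    mode='obs',
    drparameters='-recpars recpars.ini',
    cleanup='cadc',
    location='JAC',
    persist=True)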