def put_cadc_file(filename, input_directory, ad_stream):
    """Put the given file into the CADC archive.

    The CADC AD "stream" for the PUT request must be given.

    Raises a JSAProcError on failure.
    """

    (args, kwargs) = _prepare_cadc_request(filename)

    r = None

    try:
        with open(os.path.join(input_directory, filename), 'rb') as f:
            kwargs['data'] = f
            kwargs['headers'] = {'X-CADC-Stream': ad_stream}

            r = requests.put(*args, **kwargs)

            r.raise_for_status()

            if r.status_code in (200, 201):
                return

    except RequestException as e:
        text = 'no text received' if r is None else r.text

        raise JSAProcError('Error putting CADC file: {0}: {1}'.format(
            str(e), text))

    raise JSAProcError('Putting CADC file gave bad status: {0}: {1}'.format(
        r.status_code, r.text))
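# Illustrative usage sketch (not from the original module; the stream name
# 'raw' and the file/directory names below are assumptions for illustration):
#
#     put_cadc_file('s4d20130401_00001_0002.sdf',
#                   '/data/input/000/000042/000000042',
#                   ad_stream='raw')
#
# A successful PUT returns None; any transport error or unexpected HTTP
# status is converted into a JSAProcError by the function above.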
def get_output_files(job_id):
    """
    Get the current list of output files from the output directory.

    This command trusts that whatever is in the output directory at the
    time it is called is the correct list of output files.

    parameter:
    job_id, integer

    returns: list of JSAProcFileInfo objects.
    Each object contains a plain filename, with no path attached.
    """

    # Find the output directory.
    output_dir = get_output_dir(job_id)

    # Check it exists and is a directory: raise an error if not.
    if not os.path.exists(output_dir) or not os.path.isdir(output_dir):
        raise JSAProcError(
            'The output directory %s for job %i does not exist'
            % (output_dir, job_id))

    # Get list of files in the directory.
    contents = os.listdir(output_dir)

    return [JSAProcFileInfo(x, get_md5sum(os.path.join(output_dir, x)))
            for x in contents]
def assemble_input_data_for_job(job_id, input_file_list):
    """
    Ensure that all the input data listed in the input table is available
    for running a job.

    This will check to see if the data is present in either:
    a) the /jcmtdata tree, or
    b) the input directory for this job.

    If it is not present it will download the data from CADC into the
    input directory.  It will create the input directory if not present.

    parameters:
    job_id: integer, id of job in job database.

    input_file_list: iterable of strings, each string being the name of a
    file.  Filenames must not include a suffix.

    returns: list of full paths to the input files, one per requested file.
    """

    # Get full path to input directory and make it if it doesn't exist.
    input_directory = setup_input_directory(job_id)

    # For each file, check if it's already in the JAC data store or the
    # input directory.  Download from CADC if it's not.  Check downloaded
    # files are valid HDS.
    files_list = []

    for f in input_file_list:
        filepath = file_in_jac_data_dir(f)

        if filepath:
            files_list.append(filepath)

        else:
            filepath = file_in_dir(f, input_directory)

            if filepath:
                files_list.append(filepath)

            else:
                filepath = fetch_cadc_file(f, input_directory)
                valid = valid_hds(filepath)

                if not valid:
                    # Move the invalid file to a different directory and
                    # raise an error.
                    invalid_dir = setup_invalid_dir(input_directory)

                    invalid_file = os.path.join(invalid_dir,
                                                os.path.split(filepath)[1])

                    shutil.move(filepath, invalid_file)

                    raise JSAProcError(
                        'Downloaded file %s fails HDS validation.'
                        ' Moved to %s' % (filepath, invalid_file))

                else:
                    files_list.append(filepath)

    # Return list of files with full paths.
    return files_list
def get_jac_data_dir(filename):
    """Guess directory name for a given filename.

    Given a bare raw data filename, return a list of standardized
    directory names giving where that file should be located.
    """

    m = scuba2_file.match(filename)

    if m:
        (subarray, date, obsnum) = m.groups()
        path1 = os.path.join('/jcmtdata/raw/scuba2', subarray, date, obsnum)
        path2 = os.path.join('/jcmtcal/scuba2', subarray, date, obsnum)
        return (path1, path2)

    m = acsis_file.match(filename)

    if m:
        (date, obsnum) = m.groups()

        if date > '20061000':
            path1 = os.path.join('/jcmtdata/raw/acsis/spectra', date, obsnum)
            path2 = os.path.join('/jcmtcal/acsis', date, obsnum)
            return (path1, path2)

        else:
            year = date[0:4]
            path1 = os.path.join('/jcmtdata/raw/acsis-das/converted',
                                 year, date, obsnum)
            return (path1,)

    raise JSAProcError('Filename {0} does not match '
                       'an expected pattern'.format(filename))
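# Illustrative example (a sketch, not from the original source): assuming
# scuba2_file captures (subarray, date, obsnum) from a name such as
# 's4d20130401_00001_0002', the SCUBA-2 branch above would return:
#
#     ('/jcmtdata/raw/scuba2/s4d/20130401/00001',
#      '/jcmtcal/scuba2/s4d/20130401/00001')
#
# The exact groups depend on the scuba2_file/acsis_file regular
# expressions, which are defined elsewhere in the module.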
def obsids_by_pattern(self, pattern, with_productid=False):
    """Retrieve list of obsids matching a given pattern.

    The pattern should be in lower case.
    """

    result = []

    table = self.tap.query(
        'SELECT lower(Observation.observationID), Plane.productID '
        'FROM caom2.Observation as Observation '
        'JOIN caom2.Plane as Plane ON Observation.obsID = Plane.obsID '
        'WHERE ( Observation.collection = \'JCMT\' '
        'AND lower(Observation.observationID) LIKE \'{0}\' '
        'AND Plane.calibrationLevel = 0 '
        ')'.format(pattern))

    if table is None:
        raise JSAProcError(
            'Failed TAP query for observation ID like {0}'.format(pattern))

    for (obsid, productid) in table:
        if with_productid:
            result.append((obsid, productid))
        else:
            result.append(obsid)

    return result
def get_info(cls, state):
    """Return a StateInfo object describing the state.

    Raises JSAProcError if the state does not exist.
    """

    try:
        return cls._info[state]
    except KeyError:
        raise JSAProcError('Unknown state code {0}'.format(state))
def get_name(cls, state):
    """Return the human-readable name of the state.

    Raises JSAProcError if the state does not exist.
    """

    try:
        return cls._info[state].name
    except KeyError:
        raise JSAProcError('Unknown state code {0}'.format(state))
def files_by_pattern(self, pattern):
    """Retrieve list of files matching a given pattern.
    """

    try:
        r = requests.get(self.jcmt_info_url, params={'file': pattern})
        r.raise_for_status()
        return latin_1_encode(r.text)[0].strip().split('\n')

    except HTTPError as e:
        raise JSAProcError('Error fetching CADC file list: ' + str(e))
def lookup_name(cls, name):
    """Return the state code corresponding to the given name.

    Raises JSAProcError if the state name is not recognised.
    Names are compared in a case-insensitive manner.
    """

    lowername = name.lower()

    for (state, info) in cls._info.items():
        if lowername == info.name.lower():
            return state

    raise JSAProcError('Unknown state name {0}'.format(name))
def identifier_to_pattern(identifier, patterns):
    """Look for a suitable pattern for an identifier.

    Takes a list of (regexp, pattern) pairs.  Returns the pattern
    substituted with the regexp match groups for the first matching
    regexp.
    """

    for (regexp, pattern) in patterns:
        match = regexp.match(identifier)

        if match:
            return pattern.format(*match.groups())

    raise JSAProcError('Pattern for "{0}" not recognised'.format(identifier))
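# Illustrative example (hypothetical regexp/pattern pairs, not from the
# original source):
#
#     >>> import re
#     >>> patterns = [
#     ...     (re.compile(r'^scuba2_(\d{8})_(\d{5})$'), 'jcmt-s2-{0}-{1}'),
#     ... ]
#     >>> identifier_to_pattern('scuba2_20130401_00002', patterns)
#     'jcmt-s2-20130401-00002'
#
# The first regexp that matches wins; if none match, a JSAProcError is
# raised.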
def fetch_cadc_file(filename, output_directory, suffix='.sdf'):
    """
    Fetch a file from CADC and save it into the output directory.

    It assumes the URL is of the form:
    http://www.cadc-ccda.hia-iha.nrc-cnrc.gc.ca/data/pub/JCMT/s4d20130401_00001_0002

    parameters:
    filename, string
    This assumes a filename without extension or path.

    output_directory, string
    Path to save file to.

    suffix: additional suffix to be added to the filename
    before saving to the output directory.
    (string, default: ".sdf")

    Will raise a JSAProcError if it can't connect.

    Returns name of file with path
    """

    # Local name to save to (requests automatically decompresses, so
    # we don't need the .gz).
    local_file = filename + suffix
    output_file_path = os.path.join(output_directory, local_file)

    try:
        (args, kwargs) = _prepare_cadc_request(filename)

        # Connect with stream=True for large files.
        kwargs['stream'] = True
        r = requests.get(*args, **kwargs)

        # Check if it worked (raises an error if not okay).
        r.raise_for_status()

        # Write out to a file in the requested output directory.
        with open(output_file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                f.write(chunk)

    except RequestException as e:
        raise JSAProcError('Error fetching CADC file: ' + str(e))

    return output_file_path
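# Illustrative usage sketch (the directory name is an assumption for
# illustration, not from the original source):
#
#     path = fetch_cadc_file('s4d20130401_00001_0002', '/tmp/jsaproc_input')
#     # -> '/tmp/jsaproc_input/s4d20130401_00001_0002.sdf' on success
#
# Passing suffix='' keeps the bare CADC file ID as the local file name,
# which is how _fetch_job_output() below retrieves previously generated
# output products.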
def _get_dir(type_, job_id):
    if not isinstance(job_id, int):
        raise JSAProcError('Cannot determine directory '
                           'for non-integer job identifier')

    config = get_config()
    basedir = config.get('directories', type_)

    # Turn the job ID into a decimal string of at least 9 digits, then
    # create subdirectories by removing the last 6 and then the last 3
    # digits.  This means that we retain the full length name in the
    # final directory (unlike Git) to try to prevent accidental
    # collisions if the directories are manipulated manually.  The
    # digits are counted back from the end of the decimal string so
    # that any digits in excess of the fixed 9 end up in the first
    # component.
    decimal = '{0:09d}'.format(job_id)
    return os.path.join(basedir, decimal[:-6], decimal[:-3], decimal)
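# Worked example of the splitting scheme (a sketch; '/data/output' is an
# assumed base directory for illustration):
#
#     >>> decimal = '{0:09d}'.format(1234567)
#     >>> decimal
#     '001234567'
#     >>> (decimal[:-6], decimal[:-3], decimal)
#     ('001', '001234', '001234567')
#
# giving '/data/output/001/001234/001234567'.  A job ID with more than
# nine digits simply makes the first component longer.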
def search_file(self, pattern, archive='JCMT', timeout=300):
    """Search the CADC "archive_files" table for files matching the
    given pattern.

    Returns a list of ADFileInfo objects.  Raises JSAProcError if the
    TAP query fails.
    """

    result = []

    table = self.tap.query('SELECT fileID, contentMD5 '
                           'FROM archive_files '
                           'WHERE ('
                           'archiveName = \'{}\' '
                           'AND fileID LIKE \'{}\''
                           ')'.format(archive, pattern),
                           timeout=timeout)

    if table is None:
        raise JSAProcError(
            'Failed TAP query for AD files like {}'.format(pattern))

    for (id_, md5) in table:
        result.append(ADFileInfo(id_, md5))

    return result
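# Illustrative usage sketch (the object name and pattern are assumptions
# for illustration):
#
#     files = cadc.search_file('s4d20130401%')
#     for (file_id, md5) in files:
#         print(file_id, md5)
#
# The pattern uses SQL LIKE wildcards ('%', '_') since it is interpolated
# into the query's LIKE clause.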
def get_config():
    """Read the configuration file.

    Returns a SafeConfigParser object.
    """

    global config

    if config is None:
        dir = get_home()
        file = os.path.join(dir, config_file)

        if not os.path.exists(file):
            raise JSAProcError('Config file {0} doesn\'t exist'.format(file))

        config = SafeConfigParser()
        config.read(file)

    return config
def get_output_log_files(job_id):
    """
    Get the current list of output log.* files from the log directory.

    This command trusts that whatever is in the log directory and starts
    with "log." is the correct list of output log files.

    Returns: list of bare file names.
    """

    log_dir = get_log_dir(job_id)

    if not os.path.exists(log_dir) or not os.path.isdir(log_dir):
        raise JSAProcError(
            'The log directory %s for job %i does not exist.'
            % (log_dir, job_id))

    pattern = re.compile('log.*')

    logs = [i for i in os.listdir(log_dir) if pattern.match(i)]

    return logs
def __exit__(self, type_, value, tb):
    """Context manager block exit method.

    If the block exited cleanly, commit, otherwise rollback
    the current transaction.  Also closes the cursor object.
    """

    if type_ is None:
        self._conn.commit()
    else:
        self._conn.rollback()

    self._cursor.close()
    del self._cursor

    self._lock.release()

    # If we got a database-specific error, re-raise it as our
    # generic error.  Let other exceptions through unchanged.
    if type_ is not None and issubclass(type_, sqlite3.Error):
        raise JSAProcError(str(value))
def fetch_cadc_file_info(filename):
    """Retrieve information about a file in the JCMT archive at CADC.

    This routine works in the same way as fetch_cadc_file but makes an
    HTTP HEAD request instead of an HTTP GET request.
    """

    try:
        (args, kwargs) = _prepare_cadc_request(filename)

        kwargs['allow_redirects'] = True

        r = requests.head(*args, **kwargs)

        if r.status_code == 404:
            return None

        # Check if it worked (raises an error if not okay).
        r.raise_for_status()

        return r.headers

    except RequestException as e:
        raise JSAProcError('Error fetching CADC file info: ' + str(e))
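# Illustrative usage sketch (the file ID is chosen for illustration; which
# headers CADC returns is not guaranteed here, so treat the key below as an
# assumption):
#
#     info = fetch_cadc_file_info('s4d20130401_00001_0002')
#     if info is None:
#         print('file not in the archive')
#     else:
#         print(info.get('content-length'))
#
# A 404 response is translated to None rather than an exception, so callers
# can use this as an existence check.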
def run_a_job(job_id, db=None, force=False):
    """
    Run the JSA processing of the given job_id (integer).

    By default it will look in the database determined by the JSA_proc
    config.  Optionally a database object can be given for testing
    purposes.
    """

    if not db:
        # Get link to database.
        db = get_database()

    logger.info('About to run job %i', job_id)

    try:
        # Change status of job to RUNNING, raise an error if it is not
        # currently in the WAITING state.
        db.change_state(job_id, JSAProcState.RUNNING,
                        'Job is about to be run on host {0}'.format(
                            gethostname().partition('.')[0]),
                        state_prev=(None if force else JSAProcState.WAITING))

    except NoRowsError:
        # If the job was not in the WAITING state, it is likely that another
        # process is also trying to run it.  Trap the error so that the
        # ErrorDecorator does not put the job into the ERROR state as that
        # will cause the other process to fail to set the job to PROCESSED.
        logger.error('Job %i cannot be run because it is not waiting',
                     job_id)
        return

    # Input file list.  (TODO: should this be handled here or in jsawrapdr?)
    input_dir = get_input_dir(job_id)
    input_file_list_path = os.path.join(input_dir, input_list_name)
    if not os.path.exists(input_file_list_path):
        raise JSAProcError('Input file list %s not found for job_id %i'
                           % (input_file_list_path, job_id))

    # Check that every file on the input file list exists.
    with open(input_file_list_path, 'r') as inputfl:
        for input_file in inputfl:
            input_file = input_file.strip()
            if not os.path.isfile(input_file):
                # If a file is missing, get the log.
                logstring = 'Input file %s for job %i has gone missing' % (
                    input_file, job_id)
                logger.error(logstring)
                logs = db.get_logs(job_id)
                states = [i.state_new for i in logs]

                # If it has only been in the MISSING state twice before,
                # then try again.
                if states.count(JSAProcState.MISSING) <= 2:
                    logstring += ': moving to missing.'
                    logger.warning(
                        'Moving job %i to state MISSING due to '
                        'missing file(s) %s', job_id, input_file)
                    db.change_state(job_id, JSAProcState.MISSING, logstring,
                                    state_prev=JSAProcState.RUNNING)
                    return job_id

                else:
                    # If it has been in the MISSING state more than twice,
                    # give up and move it into the ERROR state to be fixed
                    # manually.
                    logstring += ': moving to error.'
                    logger.info(
                        'Moving job %s to state ERROR due to missing'
                        ' file(s).', job_id)
                    raise JSAProcError(
                        'Input file %s for job %i has gone missing.'
                        % (input_file, job_id))

    logger.debug('All input files found for job %s.', job_id)

    # Get the mode and drparameters of the job.
    job = db.get_job(id_=job_id)
    mode = job.mode
    drparameters = job.parameters

    # Get the Starlink to be used from the task table.
    starpath = None
    version = None
    command_run = None
    raw_output = None
    log_ingest_command = None
    try:
        task_info = db.get_task_info(job.task)
        starpath = task_info.starlink_dir
        version = task_info.version
        command_run = task_info.command_run
        raw_output = task_info.raw_output
        log_ingest_command = task_info.log_ingest
    except NoRowsError:
        # If the task doesn't have task info, leave "starpath" as None
        # so that jsawrapdr_run uses the default value from the
        # configuration file.
        pass

    # Run the processing job.
    logger.debug('Launching jsawrapdr: mode=%s, parameters=%s',
                 mode, drparameters)

    # First of all remove the output files and log files from the database.
    db.set_log_files(job_id, [])
    db.set_output_files(job_id, [])

    log = jsawrapdr_run(
        job_id, input_file_list_path, mode,
        drparameters,
        cleanup='cadc', location='JAC', starlink_dir=starpath,
        persist=True,
        version=version,
        command_run=command_run,
        raw_output=raw_output)

    # Create list of output files.
    logger.debug('Preparing list of output files')
    output_files = get_output_files(job_id)

    # Write output files to the table.
    logger.debug('Storing list of output files')
    db.set_output_files(job_id, output_files)

    # Create list of output log files.
    logger.debug('Preparing list of output log files (log.*)')
    log_files = get_output_log_files(job_id)

    # Write output log files to the table.
    logger.debug('Storing list of output log files')
    db.set_log_files(job_id, log_files)

    # If a log ingest command is set, run it here.
    if log_ingest_command:
        logger.debug('Will try to ingest log files')
        try:
            with open_log_file(job.id, 'ingest_log') as logingest_log:
                subprocess.check_call(
                    [log_ingest_command, str(job_id)],
                    shell=False,
                    cwd='/tmp',
                    stdout=logingest_log,
                    stderr=subprocess.STDOUT,
                    preexec_fn=restore_signals)
        except subprocess.CalledProcessError as e:
            logger.exception('Custom log ingest failed '
                             'for job %i', job.id)
            db.change_state(job.id, JSAProcState.ERROR,
                            'Custom log ingestion failed',
                            state_prev=JSAProcState.RUNNING)

    # If the task begins with hpx, get tiles from the list of output_files
    # and write them to the tile table in the database.
    if hpx_task.search(job.task):
        logger.debug('Storing list of output tiles for HPX job '
                     + str(job_id))
        tiles = hpx_tiles_from_filenames([x.filename for x in output_files])
        db.set_tilelist(job_id, tiles)
        logger.debug('Job ' + str(job_id) + ' produced output on tiles '
                     + ', '.join(str(i) for i in tiles))

    # Change the state of the job.
    db.change_state(job_id, JSAProcState.PROCESSED,
                    'Job has been successfully processed',
                    state_prev=JSAProcState.RUNNING)

    logger.info('Done running job %i', job_id)

    return job_id
def get_parents(tile, parenttask, exclude_pointing_jobs=False,
                science_obs_only=False, pointings_only=False):
    """
    Get parent jobs for the requested tile and coadd task, using the
    parenttask to look for jobs.

    Raises a JSAProcError if there are no parent jobs that fit.

    required parameters:

    tile (int)
    Tile number to perform coadd on.

    parenttask (string)
    Input task name to look for jobs for.
    """

    # Find all jobs from the parent task which include the requested tile
    # and:
    # 1) Have a JSA QA state that is not BAD or INVALID
    # 2) Have not been marked as deleted.
    logger.debug(
        'Finding all jobs in task %s that fall on tile %i', parenttask, tile)

    db = get_database()

    qa_state = [JSAQAState.GOOD,
                JSAQAState.QUESTIONABLE,
                JSAQAState.UNKNOWN]

    obsquery = {'omp_status': Not(list(OMPState.STATE_NO_COADD))}

    if science_obs_only:
        obsquery['obstype'] = {'science'}
    if pointings_only:
        obsquery['obstype'] = {'pointing'}

    # Get the parent jobs.
    parentjobs = db.find_jobs(tiles=[tile], task=parenttask,
                              qa_state=qa_state,
                              state=Not([JSAProcState.DELETED]),
                              obsquery=obsquery)

    parentjobs = [p.id for p in parentjobs]

    # Do some other queries to give the user info about what is not being
    # included.
    excludedjobs_ompstatus = db.find_jobs(
        tiles=[tile], task=parenttask, qa_state=qa_state,
        state=Not([JSAProcState.DELETED]),
        obsquery={'omp_status': OMPState.STATE_NO_COADD}
    )

    if science_obs_only or exclude_pointing_jobs:
        obsquery = {
            'obstype': 'pointing',
            'omp_status': Not(list(OMPState.STATE_NO_COADD)),
        }
        state = Not([JSAProcState.DELETED])
        excludedjobs_pointings = db.find_jobs(tiles=[tile],
                                              task=parenttask,
                                              qa_state=qa_state,
                                              state=state,
                                              obsquery=obsquery)

        # If it was requested to exclude entirely any job containing a
        # pointing:
        if exclude_pointing_jobs and len(excludedjobs_pointings) > 0:
            logger.debug('Tile %i contains pointing obs.', tile)
            raise JSAProcError('Pointings fall on this tile.')

    # Log information about which jobs were excluded.
    # TODO: check what logger level is being used before going through the
    # for loops.
    logger.debug(
        '%i jobs in task %s fall on tile %i with appropriate QA states'
        ', OMP states and obstype states', len(parentjobs), parenttask, tile)

    if len(excludedjobs_ompstatus) > 0:
        logger.debug(
            '%i jobs were excluded due to wrong OMP status',
            len(excludedjobs_ompstatus))
        for i in excludedjobs_ompstatus:
            omp_status = db.get_obs_info(i.id)[0].omp_status
            logger.debug(
                'Job %i NOT INCLUDED (omp status of %s)',
                i.id, OMPState.get_name(omp_status))

    if science_obs_only:
        if len(excludedjobs_pointings) > 0:
            logger.debug(
                '%i additional jobs were excluded as pointings',
                len(excludedjobs_pointings))
            for i in excludedjobs_pointings:
                logger.debug('Job %i NOT INCLUDED (pointing)', i.id)

    if len(parentjobs) == 0:
        logger.debug('Tile %i has no acceptable parent jobs', tile)
        raise JSAProcError('No acceptable observations.')

    # Return the parent jobs.
    return parentjobs
def check_files(self, filenames):
    """Check whether the given files have been ingested into CAOM-2.

    Returns a boolean list corresponding to the input list.
    """

    # Do we have too many filenames to query at once?
    if len(filenames) > 10:
        result = []
        for part in _partition_list(filenames, 10):
            result.extend(self.check_files(part))
        return result

    uris = {}

    for filename in filenames:
        # CADC now uses file IDs *with* the extension in the JCMT archive.
        fileid = filename

        if not valid_fileid.match(fileid):
            raise JSAProcError('Invalid file ID {0}'.format(fileid))

        uris[filename] = 'ad:JCMT/{0}'.format(fileid)

    query = (
        'SELECT uri, COUNT(*) FROM caom2.Artifact '
        'WHERE uri IN (' +
        ', '.join(['\'{0}\''.format(x) for x in uris.values()]) +
        ') GROUP BY uri')

    logger.debug(query)

    table = self.tap.query(query)

    if table is None:
        raise JSAProcError('Failed TAP query for files in CAOM-2')

    counts = {}

    for row in table:
        counts[row[0]] = row[1]

    result = []

    for filename in filenames:
        uri = uris[filename]

        if uri not in counts:
            result.append(False)
            continue

        count = counts[uri]

        if count == 0:
            result.append(False)
        elif count == 1:
            result.append(True)
        elif count > 1:
            logger.warning('Received unexpected artifact count')
            result.append(True)
        else:
            raise JSAProcError('Received unexpected artifact count')

    return result
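# Illustrative usage sketch (the object name and file names are chosen for
# illustration only):
#
#     ingested = caom2.check_files(['s4d20130401_00001_0002.sdf',
#                                   's4d20130401_00001_0003.sdf'])
#     # -> e.g. [True, False]: one boolean per input file, in order.
#
# Lists longer than ten names are split into chunks of ten and queried
# recursively, so callers do not need to batch the input themselves.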
def _perform_ingestion(job_id, db, command_ingest=None):
    """Private function to perform the ingestion.

    Runs under the ErrorDecorator to capture errors.  Sets the job state
    to COMPLETE if it finishes successfully, or ERROR otherwise.
    """

    logger.debug('Preparing to ingest output for job {0}'.format(job_id))

    output_dir = get_output_dir(job_id)

    logger.debug('Checking that output files are present for ingestion')
    try:
        output_files = db.get_output_files(job_id)
        for filename in output_files:
            if not os.path.exists(os.path.join(output_dir, filename)):
                raise JSAProcError(
                    'Output file {0} is missing'.format(filename))
    except NoRowsError:
        raise JSAProcError('Job has no output files to ingest')

    with open_log_file(job_id, 'ingestion') as log:
        try:
            if command_ingest is None:
                scratch_dir = make_temp_scratch_dir(job_id)
                logger.debug('Using scratch directory %s', scratch_dir)

                logger.debug('Invoking jsaingest, log file: %s', log.name)
                subprocess.check_call(
                    [
                        'jsaingest',
                        '--ingest',
                        '--collection', 'JCMT',
                        '--indir', output_dir,
                    ],
                    shell=False,
                    cwd=scratch_dir,
                    stdout=log,
                    stderr=subprocess.STDOUT,
                    preexec_fn=restore_signals)

            else:
                logger.debug(
                    'Invoking custom ingestion script %s, log file: %s',
                    command_ingest, log.name)
                subprocess.check_call(
                    [
                        command_ingest,
                        '--transdir', output_dir,
                    ],
                    shell=False,
                    cwd='/tmp',
                    stdout=log,
                    stderr=subprocess.STDOUT,
                    preexec_fn=restore_signals)

            db.change_state(job_id, JSAProcState.COMPLETE,
                            'Ingestion completed successfully',
                            state_prev=JSAProcState.INGESTING)

            logger.info('Done ingesting output for job {0}'.format(job_id))

        except subprocess.CalledProcessError as e:
            # Attempt to get the first message beginning with ERROR from
            # the log file.

            # Go back to the start of the log and read in the data.
            log.seek(0)
            content = '\n'.join(log.readlines())
            errorline = content[content.find('\nERROR '):].split('\n')[1]

            db.change_state(job_id, JSAProcState.ERROR,
                            'Ingestion failed\n' + errorline)

            logger.exception('Error during ingestion of job %i', job_id)
def _ingest_raw_observation(obsid, db, dry_run=False):
    """Perform raw ingestion of an observation.

    This internal function requires an OMP database object with write
    access to the JCMT database.  If the ingestion is successful then
    the "last_caom_mod" timestamp for the observation will be updated
    in the COMMON table of the JCMT database.

    Returns True on success, False on failure.
    """

    logger.debug('Starting raw ingestion of OBSID %s', obsid)

    # Determine the date components which we can then use to create the
    # log directory.
    m = obsid_date.search(obsid)
    if not m:
        logger.error('Cannot parse OBSID %s to obtain date', obsid)
        raise JSAProcError('Cannot find date in OBSID {0}'.format(obsid))

    date = m.group(1)
    year = date[0:4]
    month = date[4:6]
    day = date[6:]
    logger.debug('Parsed OBSID, date: %s/%s/%s', month, day, year)

    # Prepare scratch directory.
    if not dry_run:
        scratch_dir = make_misc_scratch_dir('rawingest')
        logger.info('Working directory: %s', scratch_dir)
    else:
        scratch_dir = None

    # Prepare log directory and file name.
    if not dry_run:
        log_dir = os.path.join(get_misc_log_dir('rawingest'),
                               year, month, day)
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        logger.info('Log directory: %s', log_dir)
        log_file = os.path.join(log_dir, '{0}.log'.format(obsid))
        logger.debug('Log file: %s', log_file)
    else:
        log_file = 'DRY_RUN_MODE'

    command = [
        'jsaraw',
        '--collection', 'JCMT',
        '--obsid', obsid,
        '--verbose',
    ]

    try:
        if not dry_run:
            # Use a context manager to open a log file to store the
            # (console) output from the jsaraw program.
            with open(log_file, 'w') as log:
                logger.info('Running %s for OBSID %s', command[0], obsid)
                subprocess.check_call(command,
                                      shell=False,
                                      cwd=scratch_dir,
                                      stdout=log,
                                      stderr=subprocess.STDOUT,
                                      preexec_fn=restore_signals)

            # On success (check_call didn't raise an exception), set the
            # "last_caom_mod" timestamp in the database.
            logger.info('Updating ingestion timestamp in the database')
            db.set_last_caom_mod(obsid)

        else:
            logger.info('Would have run: "%s" (DRY RUN)', ' '.join(command))

    except subprocess.CalledProcessError as e:
        logger.exception('Error during CAOM-2 ingestion')

        try:
            logger.info('Annulling ingestion timestamp in the database')
            db.set_last_caom_mod(obsid, set_null=True)
        except:
            logger.exception('Error marking ingestion date as NULL')

        return False

    except:
        logger.exception('Error marking ingestion date')
        return False

    finally:
        if not dry_run:
            logger.debug('Deleting scratch directory')
            shutil.rmtree(scratch_dir)

    return True
def error(self, message):
    raise JSAProcError('Failed to parse CADC parameters: ' + message)
def _fetch_job_output(job_id, db, force=False, dry_run=False):
    """Private function to perform retrieval of job output files from CADC.
    """

    # Check we have sufficient disk space for fetching to occur.
    output_space = get_output_dir_space()
    required_space = float(get_config().get('disk_limit', 'fetch_min_space'))

    if output_space < required_space and not force:
        logger.warning('Insufficient disk space: %f / %f GiB required',
                       output_space, required_space)
        return

    logger.info('About to retrieve output data for job %i', job_id)

    # Change state from INGEST_QUEUE to INGEST_FETCH.
    if not dry_run:
        try:
            db.change_state(
                job_id, JSAProcState.INGEST_FETCH,
                'Output data are being retrieved',
                state_prev=(None if force else JSAProcState.INGEST_QUEUE))
        except NoRowsError:
            logger.error(
                'Job %i cannot have output data fetched'
                ' as it is not waiting for reingestion', job_id)
            return

    # Check state of output files.
    output_dir = get_output_dir(job_id)
    output_files = db.get_output_files(job_id, with_info=True)
    missing_files = []

    for file in output_files:
        filename = file.filename
        filepath = os.path.join(output_dir, filename)

        if os.path.exists(filepath):
            # If we still have the file, check its MD5 sum is correct.
            if file.md5 is None:
                logger.warning('PRESENT without MD5 sum: %s', filename)
            elif file.md5 == get_md5sum(filepath):
                logger.debug('PRESENT: %s', filename)
            else:
                raise JSAProcError(
                    'MD5 sum mismatch for existing file {0}'.format(filename))

        else:
            # Otherwise add it to the list of missing files.
            logger.debug('MISSING: %s', filename)
            missing_files.append(file)

    # Are there any files we need to retrieve?
    if missing_files:
        for file in missing_files:
            filename = file.filename
            filepath = os.path.join(output_dir, filename)

            if not dry_run:
                if os.path.exists(output_dir):
                    logger.debug('Directory %s already exists', output_dir)
                else:
                    logger.debug('Making directory %s', output_dir)
                    os.makedirs(output_dir)

                logger.info('Fetching file %s', filename)
                fetch_cadc_file(filename, output_dir, suffix='')

                if file.md5 is None:
                    logger.warning('MD5 sum missing: %s', filename)
                elif file.md5 == get_md5sum(filepath):
                    logger.debug('MD5 sum OK: %s', filename)
                else:
                    raise JSAProcError(
                        'MD5 sum mismatch for fetched file {0}'.format(
                            filename))

            else:
                logger.info('Skipping fetch of %s (DRY RUN)', filename)

    else:
        logger.info('All output files are already present')

    # Finally set the state to INGESTION.
    if not dry_run:
        db.change_state(job_id, JSAProcState.INGESTION,
                        'Output data have been retrieved',
                        state_prev=JSAProcState.INGEST_FETCH)
def jsawrapdr_run(job_id, input_file_list, mode, drparameters,
                  cleanup='cadc', location='JAC', persist=False,
                  jsawrapdr=None, starlink_dir=None,
                  version=None, command_run=None,
                  raw_output=None):
    """
    Execute the jsawrapdr script from python.

    This function calls jsawrapdr with the following options:

    jsawrapdr --outdir=configbase/scratch/$job_id
              --inputs=input_file_list
              --id=jac-$job_id
              --mode=$mode
              --drparameters=$drparameters
              --cleanup=$cleanup (cadc by default)
              --location=$location (JAC by default)
              --fileversion=$version (if not None)
              --drcommand=$command_run (if not None)

    If persist is True, then it adds the flag:
              -persist

    If raw_output is True, it adds the option:
              --rawoutput

    Args:

        job_id (int): Job identifier from the jsaproc database.

        input_file_list (str): List of files (with extensions and full
            path).

        mode (str): Can be 'night', 'obs', 'public' or 'project'.

        drparameters (str):

        cleanup (str, optional): Type of cleanup.  Can be one of
            'cadc'|'none'|'all', defaults to 'cadc'.

        persist (bool, optional): Defaults to False.  If persist is
            turned on, then dpCapture will copy acceptable products to
            the default output directory.  Otherwise it won't (used for
            debugging purposes).  The output directory is determined by
            jsa_proc.admin.directories 'get_output_dir' for the given
            job_id.

        location (str, optional): One of |'cadc'|'JAC'| (NOT CURRENTLY
            IMPLEMENTED, default is 'JAC').

        jsawrapdr (str, optional): The path to jsawrapdr.  If not given,
            the one in the configured Starlink will be used.

        starlink_dir (str, optional): The path of a Starlink install to
            use.  If not given, the one found in the configuration file
            will be used.

        version: CADC file name "version" or None to use the default.

        command_run: custom "run" command to be passed to jsawrapdr.

    Returns:
        str: The filename (including path) of the logfile.
    """

    # Get log directory.  Note that opening a log file in this
    # directory using open_log_file will ensure that it exists.
    log_dir = get_log_dir(job_id)

    # Prepare scratch directory.
    scratch_dir = make_temp_scratch_dir(job_id)

    # Get output directory name.
    out_dir = get_output_dir(job_id)

    # If the output directory currently exists, delete it.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)

    # Make the "transfer" directory in advance.  (This saves dpCapture
    # or another copying routine from having to do so.)
    os.makedirs(out_dir)

    # Find paths to Starlink, jsawrapdr and ORAC-DR.
    config = get_config()

    if starlink_dir is None:
        starpath = config.get('job_run', 'starpath')
    else:
        starpath = starlink_dir
    if not jsawrapdr:
        jsawrapdr = os.path.join(starpath, 'Perl', 'bin', 'jsawrapdr')
    orac_dir = os.path.join(starpath, 'bin', 'oracdr', 'src')

    # Set the JAC recipe id.
    jacid = 'jac-' + str(job_id)

    # Collect the jsawrapdr arguments.
    jsawrapdrcom = [jsawrapdr,
                    '--debugxfer',
                    '--outdir=' + scratch_dir,
                    '--inputs=' + input_file_list,
                    '--id=' + jacid,
                    '--mode=' + mode,
                    '--cleanup=' + cleanup,
                    '--drparameters=' + drparameters]

    if persist:
        jsawrapdrcom.append('-persist')
        jsawrapdrcom.append('--transdir=' + out_dir)

    if raw_output:
        jsawrapdrcom.append('--rawoutput')

    if version is not None:
        jsawrapdrcom.append('--fileversion={0}'.format(version))

    if command_run is not None:
        jsawrapdrcom.append('--drcommand={0}'.format(command_run))

    # Set up the environment for running jsawrapdr.
    jsa_env = os.environ.copy()
    jsa_env = setup_starlink(starpath, jsa_env)

    # Add in the LOGDIR.
    jsa_env['ORAC_LOGDIR'] = log_dir

    # Ensure that we delete any previous log.* files in ORAC_LOGDIR if they
    # exist.
    if os.path.exists(log_dir):
        calculation_logs = glob.glob(os.path.join(log_dir, 'log.*'))
        for cl in calculation_logs:
            os.remove(cl)

    # Open a log file and run jsawrapdr while saving output to the log.
    with open_log_file(job_id, 'jsawrapdr') as log:

        # Save the log file name.
        log_name = log.name

        # Run jsawrapdr.
        retcode = subprocess.call(jsawrapdrcom, env=jsa_env, bufsize=1,
                                  stdout=log, stderr=subprocess.STDOUT,
                                  preexec_fn=restore_signals)

        # Handle jsawrapdr errors.
        if retcode != 0:
            errormessage = 'jsawrapdr exited with retcode %i' % (retcode)

            # Find the first ORAC error message in the jsawrapdr log.
            with open(log_name, 'r') as jsalogfile:
                lines = jsalogfile.read()

            result = re.search(r'.*(STDERR:\s*.*)$', lines, re.DOTALL)
            if result:
                firsterror = result.group(1).split('\n')[1]

                # Insert the ORAC error at the start of the error message.
                if firsterror:
                    errormessage = ('ORAC ERROR: ' + firsterror + '.\n'
                                    + errormessage)

            # Raise the error.
            raise JSAProcError(errormessage)

    return log_name
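# For illustration, a sketch of the command this builds for a hypothetical
# job 42 with persist=True (the paths and input list name shown are assumed
# values, not defaults from the real configuration):
#
#     /star/Perl/bin/jsawrapdr --debugxfer \
#         --outdir=/scratch/000/000042/000000042 \
#         --inputs=/input/000/000042/000000042/input_files.lst \
#         --id=jac-42 --mode=obs --cleanup=cadc \
#         --drparameters='-recpars recpars.ini' \
#         -persist --transdir=/output/000/000042/000000042
#
# The real values come from the configuration file and the directory
# helpers above (get_log_dir, make_temp_scratch_dir, get_output_dir).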