def run_post_steps(filelist, config, fmobj):
    """Run filetype-specific post steps for each delivered file.

    Parameters
    ----------
    filelist : list
        Fullnames of files to process; all are assumed to share the filetype
        determined from the first entry.
    config : dict
        Configuration containing the ``dts_filetype_class_<filetype>`` entry.
    fmobj : object
        File-management/database handle; ``commit`` is called after each
        successful post step.
    """
    # filetype is determined once from the first file and reused for the batch
    firstname = miscutils.parse_fullname(filelist[0], miscutils.CU_PARSE_FILENAME)
    filetype = dtsutils.determine_filetype(firstname)
    miscutils.fwdebug(3, "DTSFILEHANDLER_DEBUG", "filetype = %s" % filetype)

    # dynamically load class specific to filetype
    classkey = 'dts_filetype_class_' + filetype
    filetype_class = miscutils.dynamically_load_class(config[classkey])
    valdict = fmutils.get_config_vals({}, config, filetype_class.requested_config_vals())
    ftobj = filetype_class(dbh=fmobj, config=valdict)

    for fullname in filelist:
        filename = miscutils.parse_fullname(fullname, miscutils.CU_PARSE_FILENAME)
        miscutils.fwdebug(3, "DTSFILEHANDLER_DEBUG", "filename = %s" % filename)
        if dtsutils.check_already_registered(filename, fmobj):
            ftobj.post_steps(fullname)  # e.g., Rasicam

            # if success
            fmobj.commit()
        else:
            # bug fix: was a Python-2 print statement (syntax error in py3)
            print("File must already be registered in order to run post_steps")
def load_filename_gtt(self, filelist): """ insert filenames into filename global temp table Parameters ---------- filelist : list List of strings of the file names, or of dictionaries describing the file names Returns ------- str The temp table name """ # returns filename GTT table name # make sure table is empty before loading it self.empty_gtt(dmdbdefs.DB_GTT_FILENAME) colmap = [dmdbdefs.DB_COL_FILENAME, dmdbdefs.DB_COL_COMPRESSION] rows = [] for _file in filelist: fname = None comp = None if isinstance(_file, str): (fname, comp) = miscutils.parse_fullname( _file, miscutils.CU_PARSE_FILENAME | miscutils.CU_PARSE_EXTENSION) elif isinstance( _file, dict) and (dmdbdefs.DB_COL_FILENAME in _file or dmdbdefs.DB_COL_FILENAME.lower() in _file): if dmdbdefs.DB_COL_COMPRESSION in _file: fname = _file[dmdbdefs.DB_COL_FILENAME] comp = _file[dmdbdefs.DB_COL_COMPRESSION] elif dmdbdefs.DB_COL_COMPRESSION.lower() in _file: fname = _file[dmdbdefs.DB_COL_FILENAME.lower()] comp = _file[dmdbdefs.DB_COL_COMPRESSION.lower()] elif dmdbdefs.DB_COL_FILENAME in _file: (fname, comp) = miscutils.parse_fullname( _file[dmdbdefs.DB_COL_FILENAME], miscutils.CU_PARSE_FILENAME | miscutils.CU_PARSE_EXTENSION) else: (fname, comp) = miscutils.parse_fullname( _file[dmdbdefs.DB_COL_FILENAME.lower()], miscutils.CU_PARSE_FILENAME | miscutils.CU_PARSE_EXTENSION) else: raise ValueError(f"Invalid entry filelist({_file})") rows.append({ dmdbdefs.DB_COL_FILENAME: fname, dmdbdefs.DB_COL_COMPRESSION: comp }) self.insert_many(dmdbdefs.DB_GTT_FILENAME, colmap, rows) return dmdbdefs.DB_GTT_FILENAME
def write_outputwcl(self, outfilename=None):
    """Write the output wcl to disk.

    Parameters
    ----------
    outfilename : str, optional
        Destination path; when ``None`` the path stored in the input wcl
        (``wrapper.outputwcl``) is used.
    """
    if outfilename is None:
        outfilename = self.inputwcl['wrapper']['outputwcl']

    if miscutils.fwdebug_check(3, 'BASICWRAP_DEBUG'):
        miscutils.fwdebug_print("outfilename = %s" % outfilename, WRAPPER_OUTPUT_PREFIX)

    # ensure the directory that will hold the output wcl exists
    wcl_dir = miscutils.parse_fullname(outfilename, miscutils.CU_PARSE_PATH)
    if miscutils.fwdebug_check(3, 'BASICWRAP_DEBUG'):
        miscutils.fwdebug_print("outwcldir = %s" % wcl_dir, WRAPPER_OUTPUT_PREFIX)
    miscutils.coremakedirs(wcl_dir)

    with open(outfilename, 'w') as wclfh:
        self.outputwcl.write(wclfh, True)
def _gather_metadata_from_config(self, fullname, metakeys):
    """Collect requested metadata values from the stored config.

    Parameters
    ----------
    fullname : str
        The name of the file to gather data about
    metakeys : list
        Dotted wcl keys to look for; only the last path component is
        used as the metadata key name.

    Returns
    -------
    dict
        The metadata (an OrderedDict keyed by the short key name)
    """
    result = OrderedDict()

    for wclkey in metakeys:
        short_key = wclkey.rsplit('.', 1)[-1]
        if short_key == 'fullname':
            result['fullname'] = fullname
        elif short_key == 'filename':
            result['filename'] = miscutils.parse_fullname(
                fullname, miscutils.CU_PARSE_FILENAME)
        elif short_key == 'filetype':
            result['filetype'] = self.filetype
        else:
            # anything else is looked up in the config by its full dotted key
            found, value = self.config.search(wclkey)
            if found:
                result[short_key] = value

    return result
def get_single_file_disk_info(fname, save_md5sum=False, archive_root=None):
    """Gather disk information for a single file.

    Parameters
    ----------
    fname : str
        Full path of the file.
    save_md5sum : bool, optional
        When True also compute and store the file's md5sum.
    archive_root : str, optional
        When given (and the path is absolute), also store the path and
        filename relative to this archive root.

    Returns
    -------
    dict
        Keys: filename, compression, path, filesize and optionally
        md5sum, relpath, rel_filename.
    """
    if miscutils.fwdebug_check(3, "DISK_UTILS_LOCAL_DEBUG"):
        miscutils.fwdebug_print(f"fname={fname}, save_md5sum={save_md5sum}, archive_root={archive_root}")

    parsemask = miscutils.CU_PARSE_PATH | miscutils.CU_PARSE_FILENAME | miscutils.CU_PARSE_COMPRESSION

    (path, filename, compress) = miscutils.parse_fullname(fname, parsemask)
    if miscutils.fwdebug_check(3, "DISK_UTILS_LOCAL_DEBUG"):
        # bug fix: debug output previously printed a literal "(unknown)"
        # placeholder instead of the parsed filename
        miscutils.fwdebug_print(f"path={path}, filename={filename}, compress={compress}")

    fdict = {'filename': filename,
             'compression': compress,
             'path': path,
             'filesize': os.path.getsize(fname)}

    if save_md5sum:
        fdict['md5sum'] = get_md5sum_file(fname)

    if archive_root and path.startswith('/'):
        fdict['relpath'] = path[len(archive_root)+1:]

        if compress is None:
            compext = ""
        else:
            compext = compress

        # bug fix: rel_filename previously interpolated a literal "(unknown)"
        # placeholder instead of the filename (see the sibling implementation
        # of this function elsewhere in the codebase for the intended form)
        fdict['rel_filename'] = f"{fdict['relpath']}/{filename}{compext}"

    return fdict
def ingest_contents(self, listfullnames, **kwargs):
    """Ingest data into non-metadata table - rasicam_decam.

    Parameters
    ----------
    listfullnames : list
        Full paths of exposure files to ingest.
    **kwargs
        Optional ``prihdr`` (a primary header) or ``hdulist`` (an open
        HDU list); otherwise the primary header is read from disk.

    Raises
    ------
    OSError
        If an exposure file does not exist.
    Exception
        If no RASICAM header keywords are found in a file.
    """
    assert isinstance(listfullnames, list)

    dbtable = 'rasicam_decam'

    for fullname in listfullnames:
        if not os.path.isfile(fullname):
            raise OSError(f"Exposure file not found: '{fullname}'")

        filename = miscutils.parse_fullname(fullname, miscutils.CU_PARSE_FILENAME)

        # locate the primary header: caller may pass it directly, pass an
        # open hdulist, or we read it from the file
        if 'prihdr' in kwargs:
            primary_hdr = kwargs['prihdr']
        elif 'hdulist' in kwargs:
            primary_hdr = kwargs['hdulist'][0].header
        else:
            primary_hdr = fits.getheader(fullname, 0)

        row = get_vals_from_header(primary_hdr)
        # bug fix: the emptiness check must come before the constant columns
        # are added (previously `if row:` was always true), and the error
        # message contained a literal "(unknown)" instead of the filename
        if not row:
            raise Exception(
                f"No RASICAM header keywords identified for {filename}")

        row['filename'] = filename
        row['source'] = 'HEADER'
        row['analyst'] = 'DTS.ingest'
        self.dbh.basic_insert_row(dbtable, row)
def generate_provenance(fullname):
    """Build a provenance wcl dict for the given file.

    The provenance filename keeps the compression extension, if any.
    """
    fname, compression = miscutils.parse_fullname(
        fullname, miscutils.CU_PARSE_FILENAME | miscutils.CU_PARSE_EXTENSION)
    if compression is not None:
        fname += compression
    return {'was_generated_by': {'exec_1': fname}}
def has_contents_ingested(self, listfullnames):
    """Check whether each file's manifest contents are already ingested.

    Returns a dict mapping each fullname to True/False.
    """
    assert isinstance(listfullnames, list)

    # compressed and uncompressed copies share metadata, so key on the
    # bare filename and map back to the caller's fullname afterwards
    byfilename = {miscutils.parse_fullname(full, miscutils.CU_PARSE_FILENAME): full
                  for full in listfullnames}

    self.dbh.load_filename_gtt(list(byfilename.keys()))

    dbq = f"select m.manifest_filename from MANIFEST_EXPOSURE m, {dmdbdefs.DB_GTT_FILENAME} g where m.manifest_filename=g.filename"

    curs = self.dbh.cursor()
    curs.execute(dbq)

    results = {byfilename[row[0]]: True for row in curs}
    # anything not returned by the query has no ingested contents
    for full in listfullnames:
        results.setdefault(full, False)

    return results
def read_fullnames_from_listfile(listfile, linefmt, colstr):
    """ Read a list file returning fullnames from the list

    Parameters: listfile is the path of the list file; linefmt is one of
    'textcsv', 'texttab', 'textsp' (csv / tab / space separated) — 'config'
    and 'wcl' are not supported; colstr is the column specification string.

    Returns a dict mapping each file section name (the part of a column
    name before '.fullname') to the list of fullnames read for it.
    """
    if miscutils.fwdebug_check(3, 'INTGMISC_DEBUG'):
        miscutils.fwdebug_print('colstr=%s' % colstr)

    columns = convert_col_string_to_list(colstr, False)

    if miscutils.fwdebug_check(3, 'INTGMISC_DEBUG'):
        miscutils.fwdebug_print('columns=%s' % columns)

    fullnames = {}
    pos2fsect = {}
    # find which column positions carry fullnames (names ending '.fullname')
    for pos in range(0, len(columns)):
        lcol = columns[pos].lower()
        if lcol.endswith('.fullname'):
            filesect = lcol[:-9]  # strip the '.fullname' suffix
            pos2fsect[pos] = filesect
            fullnames[filesect] = []
        # else a data column instead of a filename

    if miscutils.fwdebug_check(3, 'INTGMISC_DEBUG'):
        miscutils.fwdebug_print('pos2fsect=%s' % pos2fsect)

    if linefmt == 'config' or linefmt == 'wcl':
        miscutils.fwdie(
            'Error: wcl list format not currently supported (%s)' % listfile,
            1)
    else:
        with open(listfile, 'r') as listfh:
            for line in listfh:
                line = line.strip()

                # convert line into python list
                lineinfo = []
                if linefmt == 'textcsv':
                    lineinfo = miscutils.fwsplit(line, ',')
                elif linefmt == 'texttab':
                    lineinfo = miscutils.fwsplit(line, '\t')
                elif linefmt == 'textsp':
                    lineinfo = miscutils.fwsplit(line, ' ')
                else:
                    miscutils.fwdie('Error: unknown linefmt (%s)' % linefmt,
                                    1)

                # save each fullname in line
                for pos in pos2fsect:
                    # use common routine to parse actual fullname (e.g., remove [0])
                    parsemask = miscutils.CU_PARSE_PATH | miscutils.CU_PARSE_FILENAME | \
                        miscutils.CU_PARSE_COMPRESSION
                    (path, filename, compression) = miscutils.parse_fullname(
                        lineinfo[pos], parsemask)
                    fname = "%s/%s" % (path, filename)
                    # keep the compression extension as part of the fullname
                    if compression is not None:
                        fname += compression
                    fullnames[pos2fsect[pos]].append(fname)

    if miscutils.fwdebug_check(6, 'INTGMISC_DEBUG'):
        miscutils.fwdebug_print('fullnames = %s' % fullnames)
    return fullnames
def get_file_archive_info(self, filelist, arname, compress_order=fmdefs.FM_PREFER_COMPRESSED):
    """Find archive location info for the given filenames.

    Walks the archive root on disk and, for each requested filename,
    returns the info dict of the first compression variant found,
    honoring the preference order in ``compress_order``.

    Parameters
    ----------
    filelist : list
        Bare filenames (no path, no compression extension) to look up.
    arname : str
        Archive name; must exist in ``self.config['archive']``.
    compress_order : list
        Compression extensions in preference order (including None).

    Returns
    -------
    dict
        Maps filename -> dict with filename, compression, filesize,
        path and rel_filename.
    """
    # sanity checks
    if 'archive' not in self.config:
        miscutils.fwdie('Error: Missing archive section in config', 1)
    if arname not in self.config['archive']:
        miscutils.fwdie(f'Error: Invalid archive name ({arname})', 1)
    if 'root' not in self.config['archive'][arname]:
        miscutils.fwdie(
            f"Error: Missing root in archive def ({self.config['archive'][arname]})",
            1)
    if not isinstance(compress_order, list):
        miscutils.fwdie(
            'Error:  Invalid compress_order.  It must be a list of compression extensions (including None)',
            1)

    # walk archive to get all files, bucketed by compression extension
    fullnames = {p: {} for p in compress_order}

    root = self.config['archive'][arname]['root']
    root = root.rstrip("/")  # canonicalize - remove trailing / to ensure

    for (dirpath, _, filenames) in os.walk(root, followlinks=True):
        for fname in filenames:
            d = {}
            # parsemask 3: splits into (filename, compression) — TODO confirm
            # against the CU_PARSE_* constants and use the named flags
            (d['filename'], d['compression']) = miscutils.parse_fullname(fname, 3)
            d['filesize'] = os.path.getsize(f"{dirpath}/{fname}")
            d['path'] = dirpath[len(root) + 1:]
            if d['compression'] is None:
                compext = ""
            else:
                compext = d['compression']
            d['rel_filename'] = f"{d['path']}/{d['filename']}{compext}"
            fullnames[d['compression']][d['filename']] = d

    # bug fix: removed leftover debug prints which hard-coded the
    # fullnames[None] / fullnames['.fz'] buckets and raised KeyError
    # whenever '.fz' was not in compress_order

    # go through given list of filenames and find archive location and compression
    archiveinfo = {}
    for name in filelist:
        for p in compress_order:  # follow compression preference
            if name in fullnames[p]:
                archiveinfo[name] = fullnames[p][name]
                break
    return archiveinfo
def get_metadata(self, fullname):
    """Build, cache on self, and return the minimal metadata dict for a
    SN manifest file (filename + filetype)."""
    filename = miscutils.parse_fullname(fullname, miscutils.CU_PARSE_FILENAME)
    self.filemeta = {'filename': filename, 'filetype': 'snmanifest'}
    return self.filemeta
def insert_rasicam(self, fullname): DBtable='rasicam_decam' # Keyword list needed to update the database. # i=int, f=float, b=bool, s=str, date=date keylist = { 'EXPNUM':'i', 'INSTRUME':'s', 'SKYSTAT':'b', 'SKYUPDAT':'date', 'GSKYPHOT':'b', 'LSKYPHOT':'b', 'GSKYVAR':'f', 'GSKYHOT':'f', 'LSKYVAR':'f', 'LSKYHOT':'f', 'LSKYPOW':'f' } if (not(os.path.isfile(fullname))): raise Exception("Exposure not found: '%s'" % fullname) filename = miscutils.parse_fullname(fullname, miscutils.CU_PARSE_FILENAME) row = {} row['filename'] = filename row['source'] = 'HEADER' row['analyst'] = 'DTS.ingest' hdulist = pyfits.open(fullname) primary_hdr = hdulist[0].header numkey_found = 0 for key, ktype in keylist.items(): if (key.upper() in primary_hdr): numkey_found += 1 value = primary_hdr[key] #print primary_hdr[key] if (key == 'SKYUPDAT'): # entry_time is time exposure taken row['ENTRY_TIME'] = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S") elif (key == 'INSTRUME'): row['CAMSYM'] = wrapfuncs.func_camsym(fullname) elif (ktype == 'b'): if (value): row[key] = 'T' else: row[key] = 'F' elif (ktype == 'i'): if value != 'NaN': row[key] = int(value) else: if value != 'NaN': row[key] = float(value) #print "row = %s" % row if (numkey_found > 0): self.dbh.basic_insert_row(DBtable, row) else: raise Exception("No RASICAM header keywords identified for %s" % filename)
def get_metadata(self, fullname):
    """Return basic metadata (filename, filetype) for a manifest file,
    also caching it on ``self.filemeta``."""
    ftype = 'snmanifest'
    fname = miscutils.parse_fullname(fullname, miscutils.CU_PARSE_FILENAME)
    meta = {'filename': fname, 'filetype': ftype}
    self.filemeta = meta
    return meta
def has_contents_ingested(self, listfullnames):
    """Check whether each file's contents are already ingested.

    Returns a dict mapping each fullname to the boolean result of the
    per-filename ingestion check.
    """
    assert isinstance(listfullnames, list)

    return {
        full: dfiutils.is_ingested(
            miscutils.parse_fullname(full, miscutils.CU_PARSE_FILENAME),
            self.tablename, self.dbh)
        for full in listfullnames
    }
def get_single_file_disk_info(fname, save_md5sum=False, archive_root=None):
    """Collect disk information for one file.

    Parameters
    ----------
    fname : str
        The name of the file
    save_md5sum : bool
        Whether to calculate the md5sum (True) or no (False), default is False
    archive_root : str
        The archive root path to prepend to the output data, default is None
    """
    if miscutils.fwdebug_check(3, "DISK_UTILS_LOCAL_DEBUG"):
        miscutils.fwdebug_print("fname=%s, save_md5sum=%s, archive_root=%s" % \
                                (fname, save_md5sum, archive_root))

    mask = (miscutils.CU_PARSE_PATH | miscutils.CU_PARSE_FILENAME |
            miscutils.CU_PARSE_COMPRESSION)
    path, filename, compress = miscutils.parse_fullname(fname, mask)

    if miscutils.fwdebug_check(3, "DISK_UTILS_LOCAL_DEBUG"):
        miscutils.fwdebug_print("path=%s, filename=%s, compress=%s" % (path, filename, compress))

    info = {'filename': filename,
            'compression': compress,
            'path': path,
            'filesize': os.path.getsize(fname)}

    if save_md5sum:
        info['md5sum'] = get_md5sum_file(fname)

    # when inside an archive, also record path/name relative to its root
    if archive_root and path.startswith('/'):
        info['relpath'] = path[len(archive_root) + 1:]
        compext = "" if compress is None else compress
        info['rel_filename'] = "%s/%s%s" % (info['relpath'], filename, compext)

    return info
def check_single_valid(keywords, fullname, verbose):  # should raise exception if not valid
    """Check whether the given file is a valid raw file.

    Parameters
    ----------
    keywords : object
        Keyword requirements passed through to check_header_keywords.
    fullname : str
        Path of the fits file to validate.
    verbose : int
        When > 1, print missing-requested/extra keyword reports per HDU.

    Returns
    -------
    bool
        True when all checks pass.

    Raises
    ------
    ValueError
        On filename mismatch, unknown instrument, wrong HDU count, or
        missing required keywords.
    """
    # check fits file; bug fix: the HDU list was never closed, leaking the
    # file handle on every raise path — close it in a finally block
    hdulist = fits.open(fullname)
    try:
        prihdr = hdulist[0].header

        # check exposure has correct filename (sometimes get NOAO-science-archive renamed exposures)
        correct_filename = prihdr['FILENAME']
        actual_filename = miscutils.parse_fullname(fullname, miscutils.CU_PARSE_FILENAME)
        if actual_filename != correct_filename:
            raise ValueError(f'Error:  invalid filename ({actual_filename})')

        instrume = prihdr['INSTRUME'].lower()

        req_num_hdus = -1
        if instrume == 'decam':
            req_num_hdus = 71
        else:
            raise ValueError(f'Error:  Unknown instrume ({instrume})')

        # check # hdus
        num_hdus = len(hdulist)
        if num_hdus != req_num_hdus:
            raise ValueError(f'Error:  Invalid number of hdus ({num_hdus})')

        # check keywords
        for hdunum in range(0, num_hdus):
            hdr = hdulist[hdunum].header
            (req, want, extra) = check_header_keywords(keywords, hdunum, hdr)

            if verbose > 1:
                if want is not None and want:
                    print(f"HDU #{hdunum:02d} Missing requested keywords: {want}")
                if extra is not None and extra:
                    print(f"HDU #{hdunum:02d} Extra keywords: {extra}")

            if req is not None and req:
                raise ValueError(
                    f'Error:  HDU #{hdunum:02d} Missing required keywords ({req})')
    finally:
        hdulist.close()

    return True
def datafile_ingest_main(dbh, filetype, fullname, tablename, didatadefs):
    """Control process for ingesting data from a file.

    Returns the number of rows ingested.
    """
    wanted_sections = list(didatadefs.keys())

    # xml filetypes may carry several sections; fits ingestion handles
    # exactly one hdu
    if 'xml' in filetype:
        datadict = Xmlslurper(fullname, wanted_sections).gettables()
    elif len(wanted_sections) > 1:
        raise ValueError("Multiple hdus not yet supported\n")
    else:
        datadict = get_fits_data(fullname, wanted_sections[0])

    filename = miscutils.parse_fullname(fullname, miscutils.CU_PARSE_FILENAME)
    return ingest_datafile_contents(filename, filetype, tablename,
                                    didatadefs, datadict, dbh)
def has_metadata_ingested(self, listfullnames):
    """ Check if file has row in metadata table

    Returns a dict mapping each given fullname to True/False.
    """
    assert isinstance(listfullnames, list)

    # assume uncompressed and compressed files have same metadata
    # choosing either doesn't matter
    byfilename = {}
    for fname in listfullnames:
        filename = miscutils.parse_fullname(fname, miscutils.CU_PARSE_FILENAME)
        byfilename[filename] = fname

    if miscutils.fwdebug_check(3, 'FTMGMT_DEBUG'):
        miscutils.fwdebug_print(
            f"Loading filename_gtt with: {list(byfilename.keys())}")

    self.dbh.load_filename_gtt(list(byfilename.keys()))

    # resolve this filetype's metadata table; legacy 'genfile' maps to desfile
    metadata_table = self.config['filetype_metadata'][
        self.filetype]['metadata_table']
    if metadata_table.lower() == 'genfile':
        metadata_table = 'desfile'

    dbq = f"select m.filename from {metadata_table} m, {dmdbdefs.DB_GTT_FILENAME} g where m.filename=g.filename"
    curs = self.dbh.cursor()
    if miscutils.fwdebug_check(3, 'FTMGMT_DEBUG'):
        miscutils.fwdebug_print(f"Metadata check query: {dbq}")
    curs.execute(dbq)

    # mark found files True, then backfill False for the rest
    results = {}
    for row in curs:
        results[byfilename[row[0]]] = True

    for fname in listfullnames:
        if fname not in results:
            results[fname] = False

    if miscutils.fwdebug_check(3, 'FTMGMT_DEBUG'):
        miscutils.fwdebug_print(f"Metadata check results: {results}")
    return results
def write_outputwcl(self, outfilename=None):
    """Write the output wcl to disk; the destination defaults to the
    path named in the input wcl (``wrapper.outputwcl``)."""
    if outfilename is None:
        outfilename = self.inputwcl['wrapper']['outputwcl']

    if miscutils.fwdebug_check(3, 'BASICWRAP_DEBUG'):
        miscutils.fwdebug_print(f"outfilename = {outfilename}",
                                WRAPPER_OUTPUT_PREFIX)

    # make sure the directory for the output wcl exists
    wcl_dir = miscutils.parse_fullname(outfilename, miscutils.CU_PARSE_PATH)
    if miscutils.fwdebug_check(3, 'BASICWRAP_DEBUG'):
        miscutils.fwdebug_print(f"outwcldir = {wcl_dir}",
                                WRAPPER_OUTPUT_PREFIX)
    miscutils.coremakedirs(wcl_dir)

    with open(outfilename, 'w') as wclfh:
        self.outputwcl.write(wclfh, True)
def _gather_metadata_from_config(self, fullname, metakeys):
    """Pull requested metadata values out of the stored config.

    The last dotted component of each wcl key selects the behavior:
    'fullname'/'filename'/'filetype' are derived locally; everything
    else is searched for in the config by its full dotted key.
    """
    gathered = collections.OrderedDict()

    for wclkey in metakeys:
        short_key = wclkey.rsplit('.', 1)[-1]
        if short_key == 'fullname':
            gathered['fullname'] = fullname
        elif short_key == 'filename':
            gathered['filename'] = miscutils.parse_fullname(
                fullname, miscutils.CU_PARSE_FILENAME)
        elif short_key == 'filetype':
            gathered['filetype'] = self.filetype
        else:
            if miscutils.fwdebug_check(6, 'FTMGMT_DEBUG'):
                miscutils.fwdebug_print(f"INFO: wclkey={wclkey}")
            exists, val = self.config.search(wclkey)
            if exists:
                gathered[short_key] = val

    return gathered
def list_missing_archive(filemgmt, filelist, archive_name):
    """Return list of files from given list which are not listed in archive."""
    print("\tChecking which files are already registered in archive", flush=True)
    starttime = time.time()
    existing = filemgmt.is_file_in_archive(filelist, archive_name)
    endtime = time.time()
    print(f"({endtime - starttime:0.2f} secs)", flush=True)

    # map each basename back to its fullname so results keep full paths
    by_basename = {
        miscutils.parse_fullname(full, miscutils.CU_PARSE_BASENAME): full
        for full in filelist
    }

    misslist = [by_basename[b] for b in set(by_basename) - set(existing)]

    print(f"\t\t{len(existing):0d} file(s) already in archive", flush=True)
    print(f"\t\t{len(misslist):0d} file(s) still to be registered to archive", flush=True)
    return misslist
def get_file_disk_info(self, filelist, endpoint):
    """Get info on the files.

    Parameters
    ----------
    filelist : list
        List of files to look at
    endpoint : str
        The endpoint to use

    Returns
    -------
    dict of the results
    """
    # endpoint_ls currently only does ls for directories not single files
    # so group the requested files by their parent directory
    filebypath = {}
    for fname in filelist:
        (path, _) = miscutils.parse_fullname(
            fname, miscutils.CU_PARSE_PATH | miscutils.CU_PARSE_FILENAME)
        filebypath.setdefault(path, {})[fname] = True

    # get directory listing from endpoint and keep only requested files
    diskinfo = {}
    _ = self.endpoint_activate(endpoint)
    for path, wanted in filebypath.items():
        listing = self.get_directory_listing(path, endpoint, False)
        for fullname, finfo in listing.items():
            if fullname in wanted:
                diskinfo[fullname] = finfo
    return diskinfo
def handle_file(notify_file, delivery_fullname, config, filemgmt, task_id):
    """Perform all steps necessary for a single delivered file.

    Verifies the DTS-provided md5sum, registers the file's metadata,
    moves it into the archive, saves provenance/disk info to the DB and
    runs filetype-specific post steps.  Any failure routes the file to
    handle_bad_file instead.

    Parameters
    ----------
    notify_file : str
        Path of the DTS notification file (removed on success).
    delivery_fullname : str
        Path of the delivered data file.
    config : dict
        Handler configuration (provides dts_filetype_class_* entries).
    filemgmt : object
        File-management/DB handle; committed on success and at the end.
    task_id : int
        Task id recorded with the saved data.
    """
    filetype = None
    metadata = None
    disk_info = None
    prov = None

    # read values from notify file
    notifydict = read_notify_file(notify_file)

    # use dts_md5sum from notify_file
    dts_md5sum = None
    if 'md5sum' in notifydict:
        dts_md5sum = notifydict['md5sum']

    print("%s: dts md5sum = %s" % (delivery_fullname, dts_md5sum))

    try:
        filename = miscutils.parse_fullname(delivery_fullname, miscutils.CU_PARSE_FILENAME)
        miscutils.fwdebug(0, "DTSFILEHANDLER_DEBUG", "filename = %s" % filename)

        if not os.path.exists(delivery_fullname):
            print("Warning:  delivered file does not exist:")
            print("\tnotification file: %s" % notify_file)
            print("\tdelivered file: %s" % delivery_fullname)
            print("\tRemoving notification file and continuing")
            os.unlink(notify_file)
            return

        if dts_md5sum is not None:
            starttime = datetime.now()
            # bug fix: variable was misspelled 'deliver_fullname' (NameError)
            fileinfo_before_move = diskutils.get_single_file_disk_info(delivery_fullname,
                                                                       True, None)
            endtime = datetime.now()
            # bug fix: %0.2f cannot format a timedelta; use total_seconds()
            print("%s: md5sum before move %s (%0.2f secs)" %
                  (delivery_fullname, fileinfo_before_move['md5sum'],
                   (endtime - starttime).total_seconds()))
            if fileinfo_before_move['md5sum'] != dts_md5sum:
                print("%s: dts md5sum = %s" % (delivery_fullname, dts_md5sum))
                print("%s: py md5sum = %s" % (delivery_fullname, fileinfo_before_move['md5sum']))
                raise Exception("Error: md5sum in delivery dir not the same as DTS-provided md5sum")

        if not dtsutils.check_already_registered(filename, filemgmt):
            filetype = dtsutils.determine_filetype(filename)
            miscutils.fwdebug(3, "DTSFILEHANDLER_DEBUG", "filetype = %s" % filetype)

            # dynamically load class specific to filetype
            classkey = 'dts_filetype_class_' + filetype
            filetype_class = miscutils.dynamically_load_class(config[classkey])
            valdict = fmutils.get_config_vals({}, config, filetype_class.requested_config_vals())
            filetype_obj = filetype_class(dbh=filemgmt, config=valdict)

            metadata = filetype_obj.get_metadata(delivery_fullname)
            metadata['filename'] = filename
            metadata['filetype'] = filetype
            miscutils.fwdebug(3, "DTSFILEHANDLER_DEBUG", 'len(metadata) = %s' % len(metadata))
            miscutils.fwdebug(6, "DTSFILEHANDLER_DEBUG", 'metadata = %s' % metadata)

            filetype_obj.check_valid(delivery_fullname)  # should raise exception if not valid
            prov = generate_provenance(delivery_fullname)

            # bug fix: archive_rel_path was logged before it was assigned
            archive_rel_path = filetype_obj.get_archive_path(delivery_fullname)
            miscutils.fwdebug(3, "DTSFILEHANDLER_DEBUG", 'archive_rel_path = %s' % archive_rel_path)
            miscutils.fwdebug(3, "DTSFILEHANDLER_DEBUG", 'prov = %s' % prov)

            disk_info = move_file_to_archive(config, delivery_fullname,
                                             archive_rel_path, dts_md5sum)

            save_data_db(filemgmt, task_id, {'file_1': metadata}, disk_info, prov)

            filetype_obj.post_steps(disk_info['fullname'])  # e.g., Rasicam

            # if success
            filemgmt.commit()
            os.unlink(notify_file)
        else:
            handle_bad_file(config, notify_file, delivery_fullname, filemgmt,
                            filetype, metadata, disk_info, prov,
                            "already registered")
    except SystemExit:
        # Wrappers code calls exit if cannot find header value
        handle_bad_file(config, notify_file, delivery_fullname, filemgmt,
                        filetype, metadata, disk_info, prov,
                        "SystemExit: Probably missing header value. Check log for error msg.")
    except Exception as err:
        # avoid shadowing the builtin 'type'
        (extype, exvalue, trback) = sys.exc_info()
        print("******************************")
        print("Error: %s" % delivery_fullname)
        traceback.print_exception(extype, exvalue, trback, file=sys.stdout)
        print("******************************")
        handle_bad_file(config, notify_file, delivery_fullname, filemgmt,
                        filetype, metadata, disk_info, prov,
                        "Exception: %s" % err)

    filemgmt.commit()
def _gather_metadata_from_filename(self, fullname, metakeys):
    """ Parse filename using given filepat

    Parameters
    ----------
    fullname : str
        The name of the file to gather data about
    metakeys : list
        List of keys to look for

    Returns
    -------
    dict
        The metadata (only keys present in metakeys are kept)
    """
    if self.filepat is None:
        raise TypeError("None filepat for filetype %s" % self.filetype)

    # change wcl file pattern into a pattern usable by re
    newfilepat = copy.deepcopy(self.filepat)
    # matches ${name:width} (group 1) or ${name} (group 2)
    varpat = r"\$\{([^$}]+:\d+)\}|\$\{([^$}]+)\}"

    listvar = []
    m = re.search(varpat, newfilepat)
    while m:
        if m.group(1) is not None:
            # ${name:width}: capture exactly 'width' digits
            m2 = re.search(r'([^:]+):(\d+)', m.group(1))
            listvar.append(m2.group(1))

            # create a pattern that will remove the 0-padding
            newfilepat = re.sub(r"\${%s}" % (m.group(1)),
                                r'(\d{%s})' % m2.group(2), newfilepat)
        else:
            # ${name}: capture any non-whitespace run
            newfilepat = re.sub(r"\${%s}" % (m.group(2)), r'(\S+)', newfilepat)
            listvar.append(m.group(2))
        # repeat until no ${...} variables remain in the pattern
        m = re.search(varpat, newfilepat)

    # now that have re pattern, parse the filename for values
    filename = miscutils.parse_fullname(fullname, miscutils.CU_PARSE_FILENAME)

    m = re.search(newfilepat, filename)
    if m is None:
        miscutils.fwdebug_print("INFO: newfilepat = %s" % newfilepat)
        miscutils.fwdebug_print("INFO: filename = %s" % filename)
        raise ValueError("Pattern (%s) did not match filename (%s)" %
                         (newfilepat, filename))

    # only save values parsed from filename that were requested per metakeys;
    # capture-group order matches the order variables appeared in the pattern
    mddict = {}
    for cnt, key in enumerate(listvar):
        if key in metakeys:
            mddict[key] = m.group(cnt + 1)

    return mddict
def read_json_single(self, json_file, allMandatoryExposureKeys, debug):
    """Read one SN manifest json file and collect its exposure data.

    Parameters
    ----------
    json_file : str
        Path of the manifest json file (one json object per line).
    allMandatoryExposureKeys : list
        Keys that must be present for each exposure; some are renamed
        on ingest (acttime->EXPTIME, filter->BAND, expid->EXPNUM).
    debug : object
        Unused here; kept for interface compatibility.

    Returns
    -------
    dict
        Column-oriented dict: each key maps to a list with one entry per
        good exposure (exposures with acttime == 0.0 are skipped).
    """
    miscutils.fwdebug(3, 'DTSSNMANIFEST_DEBUG', "reading file %s" % json_file)

    allExposures = []
    my_header = {}
    numseq = {}
    all_exposures = dict()
    with open(json_file) as my_json:
        for line in my_json:
            all_data = json.loads(line)

            for key, value in all_data.items():
                errorFlag = 0
                if key == 'header':
                    # read the values for the header (date and set_type are here)
                    my_head = value
                    allExposures.append(str(my_head['set_type']))
                    allExposures.append(str(my_head['createdAt']))

                if key == 'exposures':
                    # read all the exposures that were taken for the set_type in header
                    my_header = value

                    # Total Number of exposures in manifest file
                    tot_exposures = len(my_header)

                    if tot_exposures is None or tot_exposures == 0:
                        raise Exception("0 SN exposures parsed from json file")

                    for i in range(tot_exposures):
                        numseq = my_header[i]['sequence']
                        mytime = my_header[i]['acttime']
                        if mytime > 10 and numseq['seqnum'] == 2:
                            first_expnum = my_header[i]['expid']

                        # Validate if acttime has a meaningful value.
                        # If acttime = 0.0, then it's a bad exposure; skip it.
                        if mytime == 0.0:
                            continue
                        try:
                            for mandatoryExposureKey in allMandatoryExposureKeys:
                                miscutils.fwdebug(3, 'DTSSNMANIFEST_DEBUG',
                                                  "mandatory key %s" % mandatoryExposureKey)
                                key = str(mandatoryExposureKey)
                                if my_header[i][mandatoryExposureKey]:
                                    miscutils.fwdebug(3, 'DTSSNMANIFEST_DEBUG',
                                                      "mandatory key '%s' found %s" %
                                                      (mandatoryExposureKey,
                                                       my_header[i][mandatoryExposureKey]))
                                    miscutils.fwdebug(6, 'DTSSNMANIFEST_DEBUG',
                                                      "allExposures in for: %s" % allExposures)

                                    # rename selected manifest keys to DB column names;
                                    # first occurrence of a key creates its list
                                    try:
                                        if key == 'acttime':
                                            key = 'EXPTIME'
                                            all_exposures[key].append(my_header[i][mandatoryExposureKey])
                                        elif key == 'filter':
                                            key = 'BAND'
                                            all_exposures[key].append(str(my_header[i][mandatoryExposureKey]))
                                        elif key == 'expid':
                                            key = 'EXPNUM'
                                            all_exposures[key].append(my_header[i][mandatoryExposureKey])
                                        else:
                                            all_exposures[key].append(my_header[i][mandatoryExposureKey])
                                    except KeyError:
                                        all_exposures[key] = [my_header[i][mandatoryExposureKey]]
                        except KeyError:
                            miscutils.fwdebug(0, 'DTSSNMANIFEST_DEBUG',
                                              "keyError: missing key %s in json entity: %s " %
                                              (mandatoryExposureKey, line))
                            errorFlag = 1
                            raise

    timestamp = all_exposures['date'][0]
    nite = dtsutils.convert_UTCstr_to_nite(timestamp)

    # get field by parsing set_type
    myfield = my_head['set_type']
    if len(myfield) > 5:
        newfield = myfield[:5]
    else:
        newfield = myfield

    camsym = 'D'  # no way to currently tell CAMSYM/INSTRUME from manifest file

    if not newfield.startswith('SN-'):
        raise ValueError("Invalid field (%s).  set_type = '%s'" % (newfield, my_head['set_type']))

    # if json_file contains a path or compression extension, then cut it to only the filename
    jsonFile = miscutils.parse_fullname(json_file, miscutils.CU_PARSE_FILENAME)

    if tot_exposures is None or tot_exposures == 0:
        raise Exception("0 SN exposures parsed from json file")

    for i in range(tot_exposures):
        if my_header[i]['acttime'] == 0.0:
            continue
        # NOTE(review): this assumes exposure 0 is good; if it were skipped
        # the first appended row would KeyError — confirm against real data
        if i == 0:
            all_exposures['FIELD'] = [newfield]
            all_exposures['CREATEDAT'] = [str(my_head['createdAt'])]
            all_exposures['MANIFEST_FILENAME'] = [jsonFile]
            all_exposures['NITE'] = [nite]
            all_exposures['SEQNUM'] = [1]
            all_exposures['CAMSYM'] = [camsym]
        else:
            all_exposures['FIELD'].append(newfield)
            all_exposures['CREATEDAT'].append(str(my_head['createdAt']))
            all_exposures['MANIFEST_FILENAME'].append(jsonFile)
            all_exposures['NITE'].append(nite)
            all_exposures['SEQNUM'].append(1)
            all_exposures['CAMSYM'].append(camsym)

    # bug fix: format string was missing its %s, so the dict was never logged
    miscutils.fwdebug(6, 'DTSSNMANIFEST_DEBUG', "allExposures %s" % (all_exposures))

    return all_exposures
def load_artifact_gtt(self, filelist):
    """ insert file artifact information into global temp table

    Parameters
    ----------
    filelist : list
        List of dictionaries, one for each file, giving the file
        metadata to store.

    Returns
    -------
    str
        The name of the temp table
    """
    # filelist is list of file dictionaries
    # returns artifact GTT table name

    parsemask = miscutils.CU_PARSE_FILENAME | miscutils.CU_PARSE_EXTENSION

    # make sure table is empty before loading it
    self.empty_gtt(dmdbdefs.DB_GTT_ARTIFACT)

    colmap = [
        dmdbdefs.DB_COL_FILENAME, dmdbdefs.DB_COL_COMPRESSION,
        dmdbdefs.DB_COL_MD5SUM, dmdbdefs.DB_COL_FILESIZE
    ]

    rows = []
    for _file in filelist:
        miscutils.fwdebug(3, 'DESDBI_DEBUG', f"file = {_file}")
        fname = None
        comp = None
        md5sum = None
        filesize = None
        # accept upper- or lower-case column-name keys, or a 'fullname'.
        # NOTE(review): branches assume filename/compression keys share
        # the same case; mixed-case dicts would KeyError — confirm callers.
        if dmdbdefs.DB_COL_FILENAME in _file or dmdbdefs.DB_COL_FILENAME.lower(
        ) in _file:
            if dmdbdefs.DB_COL_COMPRESSION in _file:
                fname = _file[dmdbdefs.DB_COL_FILENAME]
                comp = _file[dmdbdefs.DB_COL_COMPRESSION]
            elif dmdbdefs.DB_COL_COMPRESSION.lower() in _file:
                fname = _file[dmdbdefs.DB_COL_FILENAME.lower()]
                comp = _file[dmdbdefs.DB_COL_COMPRESSION.lower()]
            elif dmdbdefs.DB_COL_FILENAME in _file:
                # no explicit compression: parse it out of the filename value
                (fname, comp) = miscutils.parse_fullname(
                    _file[dmdbdefs.DB_COL_FILENAME], parsemask)
            else:
                (fname, comp) = miscutils.parse_fullname(
                    _file[dmdbdefs.DB_COL_FILENAME.lower()], parsemask)
            miscutils.fwdebug(3, 'DESDBI_DEBUG', f"fname={fname}, comp={comp}")
        elif 'fullname' in _file:
            (fname, comp) = miscutils.parse_fullname(_file['fullname'],
                                                     parsemask)
            miscutils.fwdebug(
                3, 'DESDBI_DEBUG',
                f"parse_fullname: fname={fname}, comp={comp}")
        else:
            miscutils.fwdebug(3, 'DESDBI_DEBUG', f"file={_file}")
            raise ValueError(f"Invalid entry filelist({_file})")

        # optional columns: left as None when not provided
        if dmdbdefs.DB_COL_FILESIZE in _file:
            filesize = _file[dmdbdefs.DB_COL_FILESIZE]
        elif dmdbdefs.DB_COL_FILESIZE.lower() in _file:
            filesize = _file[dmdbdefs.DB_COL_FILESIZE.lower()]

        if dmdbdefs.DB_COL_MD5SUM in _file:
            md5sum = _file[dmdbdefs.DB_COL_MD5SUM]
        elif dmdbdefs.DB_COL_MD5SUM.lower() in _file:
            md5sum = _file[dmdbdefs.DB_COL_MD5SUM.lower()]

        miscutils.fwdebug(
            3, 'DESDBI_DEBUG',
            f"row: fname={fname}, comp={comp}, filesize={filesize}, md5sum={md5sum}"
        )
        rows.append({
            dmdbdefs.DB_COL_FILENAME: fname,
            dmdbdefs.DB_COL_COMPRESSION: comp,
            dmdbdefs.DB_COL_FILESIZE: filesize,
            dmdbdefs.DB_COL_MD5SUM: md5sum
        })

    self.insert_many(dmdbdefs.DB_GTT_ARTIFACT, colmap, rows)
    return dmdbdefs.DB_GTT_ARTIFACT
def register_file_in_archive(self, filelist, archive_name):
    """ Saves filesystem information about file like relative path in archive,
        compression extension, etc

        Parameters
        ----------
        filelist : list or str
            File dicts (keys 'filename'/'path'/'compression', or 'fullname')
            and/or fullname strings; a single str is treated as a
            one-element list.
        archive_name : str
            Name of the archive (section under self.config['archive'])
            the files live in.

        Raises
        ------
        ValueError
            If a file has no matching desfile row for its
            filename/compression pair.
    """
    # assumes files have already been declared to database (i.e., metadata)
    # caller of program must have already verified given filelist matches given archive
    # if giving fullnames, must include archive root
    # keys to each file dict must be lowercase column names, missing data must be None

    archivedict = self.config['archive'][archive_name]
    archiveroot = archivedict['root']

    origfilelist = filelist
    if isinstance(origfilelist, str):
        filelist = [origfilelist]

    if filelist:
        # get id from desfile table for every (filename, compression) pair
        gtt_name = self.load_filename_gtt(filelist)
        idsql = f"""select d.filename, d.compression, d.id
                    from desfile d, {gtt_name} g
                    where d.filename=g.filename and
                    nullcmp(d.compression, g.compression) = 1"""
        ids = {}
        curs = self.cursor()
        curs.execute(idsql)
        for row in curs:
            # BUGFIX: accumulate ids per compression; the previous code
            # assigned a fresh single-entry dict each row, so a filename
            # present with more than one compression state kept only the
            # last row and spuriously raised "no matching compression"
            ids.setdefault(row[0], {})[row[1]] = row[2]
        #self.empty_gtt(gtt_name)

        # create dict of info to insert into file_archive_info
        insfilelist = []
        for onefile in filelist:
            nfiledict = {}
            nfiledict['archive_name'] = archive_name
            if isinstance(onefile, dict):
                if 'filename' in onefile and 'path' in onefile and 'compression' in onefile:
                    nfiledict['filename'] = onefile['filename']
                    nfiledict['compression'] = onefile['compression']
                    path = onefile['path']
                elif 'fullname' in onefile:
                    parsemask = miscutils.CU_PARSE_PATH | \
                                miscutils.CU_PARSE_FILENAME | \
                                miscutils.CU_PARSE_COMPRESSION
                    (path, nfiledict['filename'], nfiledict['compression']) = \
                        miscutils.parse_fullname(onefile['fullname'], parsemask)
                else:
                    # fwdie aborts, so the partially-built nfiledict/path
                    # are never used past this point
                    miscutils.fwdie(f"Error: Incomplete info for a file to register. Given {onefile}", 1)
            elif isinstance(onefile, str):  # fullname
                parsemask = miscutils.CU_PARSE_PATH | miscutils.CU_PARSE_FILENAME | miscutils.CU_PARSE_COMPRESSION
                (path, nfiledict['filename'], nfiledict['compression']) = \
                    miscutils.parse_fullname(onefile, parsemask)

            # make sure compression starts with .
            if nfiledict['compression'] is not None and not re.match(r'^\.', nfiledict['compression']):
                nfiledict['compression'] = '.' + nfiledict['compression']

            # get matching desfile id
            if nfiledict['filename'] in ids:
                if nfiledict['compression'] in ids[nfiledict['filename']]:
                    nfiledict['desfile_id'] = int(ids[nfiledict['filename']][nfiledict['compression']])
                else:
                    raise ValueError(f'Missing desfile id for file - no matching compression ({onefile})')
            else:
                raise ValueError(f'Missing desfile id for file - no matching filename ({onefile})')

            if re.match(r'^/', path):  # if path is absolute
                # strip the archive root so only the relative path is stored
                if re.match(fr'^{archiveroot}/', path):
                    nfiledict['path'] = path[len(archiveroot) + 1:]
                else:
                    # path may reach the archive through symlinks; retry the
                    # comparison on canonicalized forms before giving up
                    canon_archroot = os.path.realpath(archiveroot)
                    canon_path = os.path.realpath(path)

                    if re.match(fr'^{canon_archroot}/', canon_path):
                        nfiledict['path'] = canon_path[len(canon_archroot) + 1:]
                    else:
                        miscutils.fwdie((f"Error: file's absolute path ({path}) does not " +
                                         f"contain the archive root ({archiveroot}) (filedict:{nfiledict})"), 1)
            else:
                # assume path only contains the relative path within the archive
                nfiledict['path'] = path

            insfilelist.append(nfiledict)

        colnames = ['desfile_id', 'filename', 'compression', 'path', 'archive_name']
        try:
            self.insert_many_indiv('FILE_ARCHIVE_INFO', colnames, insfilelist)
        except Exception:
            # narrowed from bare except; context is printed, then re-raised
            print("Error from insert_many_indiv in register_file_archive")
            print("colnames =", colnames)
            print("filelist =", insfilelist)
            raise
def save_provenance(self, execsect, exwcl, infiles, outfiles, exitcode):
    """ Create provenance wcl

        Builds the 'used' and 'was_derived_from' sections of the output wcl
        from the wrapper's input/output file lists.

        Parameters
        ----------
        execsect : str
            Name of the exec section this provenance belongs to.
        exwcl : dict
            Input wcl for the exec section; may contain IW_DERIVATION
            listing parent:child section pairs.
        infiles : dict
            Maps input section name -> list of input fullnames.
        outfiles : dict
            Maps output section name -> list of output fullnames.
        exitcode : int
            Exit code of the wrapped program; a non-zero code downgrades
            missing-output errors to skips.

        Returns
        -------
        dict
            The provenance section of the output wcl.
    """
    #pylint: disable=unbalanced-tuple-unpacking
    self.start_exec_task('save_provenance')
    if miscutils.fwdebug_check(3, 'BASICWRAP_DEBUG'):
        miscutils.fwdebug_print("INFO: Beg", WRAPPER_OUTPUT_PREFIX)
    if miscutils.fwdebug_check(6, 'BASICWRAP_DEBUG'):
        miscutils.fwdebug_print(f"INFO: infiles = {infiles}",
                                WRAPPER_OUTPUT_PREFIX)
        miscutils.fwdebug_print(f"INFO: outfiles = {outfiles}",
                                WRAPPER_OUTPUT_PREFIX)

    num_errs = 0

    # convert probably fullnames in outexist to filename+compression
    new_outfiles = collections.OrderedDict()
    for exlabel, exlist in outfiles.items():
        if miscutils.fwdebug_check(6, 'BASICWRAP_DEBUG'):
            miscutils.fwdebug_print(
                f"INFO: exlabel={exlabel} exlist={exlist}",
                WRAPPER_OUTPUT_PREFIX)
        newlist = []
        for fullname in exlist:
            basename = miscutils.parse_fullname(
                fullname, miscutils.CU_PARSE_BASENAME)
            newlist.append(basename)
        if miscutils.fwdebug_check(6, 'BASICWRAP_DEBUG'):
            miscutils.fwdebug_print(f"INFO: newlist={newlist}",
                                    WRAPPER_OUTPUT_PREFIX)
        new_outfiles[exlabel] = newlist

    prov = self.outputwcl[intgdefs.OW_PROV_SECT]

    # used: record every input basename under this exec section
    new_infiles = {}
    if infiles:
        all_infiles = []
        for key, sublist in infiles.items():
            new_infiles[key] = []
            for fullname in sublist:
                basename = miscutils.parse_fullname(
                    fullname, miscutils.CU_PARSE_BASENAME)
                all_infiles.append(basename)
                new_infiles[key].append(basename)
        prov[provdefs.PROV_USED][execsect] = provdefs.PROV_DELIM.join(
            all_infiles)

    # was_generated_by - done by PFW when saving metadata

    # was_derived_from: one 'derived_N' entry per parent:child section pair
    if intgdefs.IW_DERIVATION in exwcl:
        wdf = prov[provdefs.PROV_WDF]
        derived_pairs = miscutils.fwsplit(exwcl[intgdefs.IW_DERIVATION],
                                          provdefs.PROV_DELIM)
        for dpair in derived_pairs:
            if miscutils.fwdebug_check(6, 'BASICWRAP_DEBUG'):
                miscutils.fwdebug_print(f"INFO: dpair = {dpair}",
                                        WRAPPER_OUTPUT_PREFIX)
            (parent_sect, child_sect) = miscutils.fwsplit(dpair, ':')[:2]
            if miscutils.fwdebug_check(6, 'BASICWRAP_DEBUG'):
                miscutils.fwdebug_print(
                    f"INFO: parent_sect = {parent_sect}",
                    WRAPPER_OUTPUT_PREFIX)
                miscutils.fwdebug_print(f"INFO: child_sect = {child_sect}",
                                        WRAPPER_OUTPUT_PREFIX)
            # optout marks the child section's outputs as optional
            optout = self.get_optout(child_sect)
            #parent_key = miscutils.fwsplit(parent_sect, '.')[-1]
            #child_key = miscutils.fwsplit(child_sect, '.')[-1]
            if miscutils.fwdebug_check(6, 'BASICWRAP_DEBUG'):
                #miscutils.fwdebug_print("INFO: parent_key = %s" % parent_key,
                #                        WRAPPER_OUTPUT_PREFIX)
                #miscutils.fwdebug_print("INFO: child_key = %s" % child_key,
                #                        WRAPPER_OUTPUT_PREFIX)
                miscutils.fwdebug_print(f"INFO: optout = {optout}",
                                        WRAPPER_OUTPUT_PREFIX)
                miscutils.fwdebug_print(
                    f"INFO: new_outfiles.keys = {list(new_outfiles.keys())}",
                    WRAPPER_OUTPUT_PREFIX)
                miscutils.fwdebug_print(
                    f"INFO: new_outfiles = {new_outfiles}",
                    WRAPPER_OUTPUT_PREFIX)
            # child outputs missing: error only when the outputs were
            # mandatory and the wrapped program claimed success
            if child_sect not in new_outfiles or \
               new_outfiles[child_sect] is None or \
               not new_outfiles[child_sect]:
                if optout:
                    if miscutils.fwdebug_check(6, 'BASICWRAP_DEBUG'):
                        miscutils.fwdebug_print(
                            f"INFO: skipping missing optional output {parent_sect}:{child_sect}",
                            WRAPPER_OUTPUT_PREFIX)
                elif exitcode != 0:
                    if miscutils.fwdebug_check(6, 'BASICWRAP_DEBUG'):
                        miscutils.fwdebug_print(
                            f"INFO: skipping missing output due to non-zero exit code {parent_sect}:{child_sect}",
                            WRAPPER_OUTPUT_PREFIX)
                else:
                    miscutils.fwdebug_print(
                        f"ERROR: Missing child output files in wdf tuple ({parent_sect}:{child_sect})",
                        WRAPPER_OUTPUT_PREFIX)
                    num_errs += 1
            else:
                self.last_num_derived += 1
                key = 'derived_%d' % self.last_num_derived
                if miscutils.fwdebug_check(6, 'BASICWRAP_DEBUG'):
                    miscutils.fwdebug_print(f"INFO: key = {key}",
                                            WRAPPER_OUTPUT_PREFIX)
                    miscutils.fwdebug_print(
                        f"INFO: before wdf = {prov[provdefs.PROV_WDF]}",
                        WRAPPER_OUTPUT_PREFIX)
                if parent_sect not in infiles and parent_sect not in new_outfiles:
                    miscutils.fwdebug_print(f"parent_sect = {parent_sect}",
                                            WRAPPER_OUTPUT_PREFIX)
                    miscutils.fwdebug_print(
                        f"infiles.keys() = {list(infiles.keys())}",
                        WRAPPER_OUTPUT_PREFIX)
                    miscutils.fwdebug_print(
                        f"outfiles.keys() = {list(outfiles.keys())}",
                        WRAPPER_OUTPUT_PREFIX)
                    miscutils.fwdebug_print(
                        f"used = {exwcl[intgdefs.IW_INPUTS]}",
                        WRAPPER_OUTPUT_PREFIX)
                    miscutils.fwdebug_print(
                        f"ERROR: Could not find parent files for {dpair}",
                        WRAPPER_OUTPUT_PREFIX)
                    num_errs += 1
                else:
                    wdf[key] = collections.OrderedDict()
                    wdf[key][
                        provdefs.PROV_CHILDREN] = provdefs.PROV_DELIM.join(
                            new_outfiles[child_sect])
                    if parent_sect in infiles:
                        wdf[key][provdefs.
                                 PROV_PARENTS] = provdefs.PROV_DELIM.join(
                                     new_infiles[parent_sect])
                    elif parent_sect in new_outfiles:
                        # this output was generated within same
                        # program/wrapper from other output files
                        parents = []
                        for outparent in outfiles[parent_sect]:
                            parents.append(
                                miscutils.parse_fullname(
                                    outparent, miscutils.CU_PARSE_FILENAME))
                        wdf[key][provdefs.
                                 PROV_PARENTS] = provdefs.PROV_DELIM.join(
                                     parents)
                if miscutils.fwdebug_check(6, 'BASICWRAP_DEBUG'):
                    miscutils.fwdebug_print(
                        f"INFO: after wdf = {prov[provdefs.PROV_WDF]}",
                        WRAPPER_OUTPUT_PREFIX)

        # drop an empty was_derived_from section entirely
        if not wdf:
            del prov[provdefs.PROV_WDF]

    if miscutils.fwdebug_check(3, 'BASICWRAP_DEBUG'):
        miscutils.fwdebug_print(f"INFO: End (num_errs = {num_errs:d})",
                                WRAPPER_OUTPUT_PREFIX)
    self.end_exec_task(num_errs)
    return prov
def restore_files(util, args, data):
    """ Method to restore file to the file system

        Parameters
        ----------
        util : Util instance
        args : dict
            Command line arguments
        data : dict
            Data on the files to restore
    """
    tape_tar = tarfile.open(args['tape'], mode='r')
    names = tape_tar.getnames()
    if args['unit'] not in names:
        raise Exception('Unit tar %s not found in tape tar %s, this should not happen' % (args['unit'], args['tape']))
    unit_tar = tarfile.open(fileobj=tape_tar.extractfile(args['unit']))
    root_path = '.'
    if args['restore']:
        root_path = data['archive']
    else:
        # without a real restore there is nothing to record in FAI
        args['update_fai'] = False
    if args['filename'] or args['path']:
        # restrict extraction to a single file (suffix match) or subtree (prefix match)
        if args['filename']:
            regex = re.compile(r'%s\Z' % (args['filename']))
        else:
            regex = re.compile(r'\A%s' % (args['path']))
        allnames = [m for m in unit_tar.getnames() if regex.search(m)]
        unit_tar.extractall(path=root_path,
                            members=[m for m in unit_tar.getmembers() if regex.search(m.name)])
    else:
        unit_tar.extractall(path=root_path)
        allnames = unit_tar.getnames()

    if args['update_fai']:
        # get only the file names
        files = [m for m in allnames if unit_tar.getmember(m).isfile()]
        full_listing = {}
        for fln in files:
            full_filename = fln.split('/')[-1]
            direct = fln.replace('/' + full_filename, '')
            (filename, compression) = miscutils.parse_fullname(full_filename,
                                                               miscutils.CU_PARSE_FILENAME | miscutils.CU_PARSE_COMPRESSION)
            full_listing[full_filename] = {'filename': filename,
                                           'compression': compression,
                                           'path': direct,
                                           'desfile_id': None,
                                           'archive': args['archive']}
        gtt = util.load_gtt_filename(full_listing.values())
        cur = util.cursor()
        cur.execute('select df.id, df.filename, df.compression from desfile df, %s gtt where gtt.filename=df.filename and gtt.compression=df.compression' % (gtt))
        results = cur.fetchall()
        desfile_ids = []
        for res in results:
            # key is filename+compression; the equi-join above never returns
            # NULL compressions, so res[2] is a str here
            full_listing[res[1] + res[2]]['desfile_id'] = res[0]
            desfile_ids.append(res[0])
        # find any files not registered in desfile
        bad_files = {}
        if len(desfile_ids) != len(full_listing):
            # BUGFIX: dict.iteritems() is Python 2 only; use items()
            for key, value in full_listing.items():
                if not value['desfile_id']:
                    bad_files[key] = value
            full_listing = {key: full_listing[key] for key in full_listing
                            if key not in bad_files}
        #gttid = util.conn.load_id_gtt(desfile_ids)
        # get files which are alread in file_archive_info
        #cur.execute('select desfile_id from file_archive_info fai, %s gtt where gtt.id=fai.desfile_id' % (gttid))
        #results = cur.fetchall()
        #loaded_ids = []
        #for res in results:
        #    loaded_ids.append(res[0])
        cur.prepare("merge into file_archive_info fai using dual on (fai.desfile_id=:desfile_id) when matched then update set path=:path,archive_name=:archive when not matched then insert (filename, archive_name, path, compression, desfile_id) values (:filename, :archive, :path, :compression, :desfile_id)")
        # BUGFIX: executemany needs the list of bind dicts; passing the dict
        # itself iterated over its string keys and could not bind by name
        cur.executemany(None, list(full_listing.values()))
        util.commit()
        if bad_files:
            # BUGFIX: Python 2 print statements -> print function; fixed typo
            print("WARNING: The following files were not added to FILE_ARCHIVE_INFO because they do not have entries")
            print("in DESFILE. They will need to be manually ingested with register_files.py")
            for key, value in bad_files.items():
                print(os.path.join(value['path'], key))
            print('')

    # DO CHECK
    if args['verify'] and args['archive']:
        print("Starting integrity check of files...")
        comp_args = {'dbh': util,
                     'des_services': args['des_services'],
                     'section': args['section'],
                     'archive': args['archive'],
                     'md5sum': True,
                     'verbose': args['verbose'],
                     'silent': False}
        if args['pfwid']:
            comp_args['pfwid'] = args['pfwid']
        elif args['reqnum']:
            comp_args['reqnum'] = args['reqnum']
            comp_args['unitname'] = args['unitname']
            comp_args['attnum'] = args['attnum']
        else:
            if args['filename']:
                fullpath = allnames[0]
                args['path'] = fullpath[:fullpath.rfind('/')]
            comp_args['relpath'] = args['path']
        cu.compare(**comp_args)
def gather_metadata_file(hdulist, fullname, metadata_defs, extra_info):
    """ gather metadata for a single file

        Parameters
        ----------
        hdulist : HDUList or None
            Already-open FITS HDU list; opened from fullname when None.
        fullname : str or None
            Path to the FITS file; taken from hdulist when None.
        metadata_defs : dict
            May contain 'wcl' (keys copied from extra_info) and 'headers'
            (hdu -> header keys to read).
        extra_info : dict
            Values for the wcl-sourced metadata keys.

        Returns
        -------
        dict
            Collected metadata, always including 'fullname'.
    """
    # extra_info is "dict" containing any info needed for wcl metadata
    # computes are already created and stored in hdu, key in headers list
    if hdulist is None:
        # NOTE(review): astropy.io.fits documents mode 'readonly', not 'r';
        # confirm this code path is exercised / the alias is accepted
        hdulist = fits.open(fullname, 'r')
    elif fullname is None:
        fullname = hdulist.filename()

    if miscutils.fwdebug_check(3, 'FM_METAUTILS_DEBUG'):
        miscutils.fwdebug_print(f"INFO: Beg file={fullname}")
    if miscutils.fwdebug_check(6, 'FM_METAUTILS_DEBUG'):
        miscutils.fwdebug_print(f"INFO: metadata_defs={metadata_defs}")
        miscutils.fwdebug_print(f"INFO: extra_info={extra_info}")

    metadata = {'fullname': fullname}

    if 'wcl' in metadata_defs:
        if miscutils.fwdebug_check(6, 'FM_METAUTILS_DEBUG'):
            miscutils.fwdebug_print(f"INFO: wcl={metadata_defs['wcl']}")
        wcllist = None
        if isinstance(metadata_defs['wcl'], str):
            wcllist = miscutils.fwsplit(metadata_defs['wcl'], ',')
        else:
            wcllist = metadata_defs['wcl']
        for wclkey in wcllist:
            metakey = wclkey.split('.')[-1]
            if metakey == 'fullname':
                metadata['fullname'] = fullname
            elif metakey == 'filename':
                metadata['filename'] = miscutils.parse_fullname(
                    fullname, miscutils.CU_PARSE_FILENAME)
            else:
                if miscutils.fwdebug_check(3, 'FM_METAUTILS_DEBUG'):
                    miscutils.fwdebug_print(f"INFO: wclkey={wclkey}")
                metadata[metakey] = extra_info[wclkey]

    if 'headers' in metadata_defs:
        if miscutils.fwdebug_check(3, 'FM_METAUTILS_DEBUG'):
            miscutils.fwdebug_print(
                f"INFO: headers={metadata_defs['headers']}")
        for hdu, keys in metadata_defs['headers'].items():
            if miscutils.fwdebug_check(6, 'FM_METAUTILS_DEBUG'):
                miscutils.fwdebug_print(f"INFO: hdu={hdu}, keys={keys}")
            keylist = None
            # BUGFIX: the old code tested isinstance(metadata_defs['wcl'], str)
            # here -- the wrong variable, which also raised KeyError whenever
            # 'headers' was given without a 'wcl' section
            if isinstance(keys, str):
                keylist = miscutils.fwsplit(keys, ',')
            else:
                keylist = keys
            for key in keylist:
                try:
                    metadata[key] = fitsutils.get_hdr_value(
                        hdulist, key.upper(), hdu)
                except KeyError:
                    # missing header keys are not fatal; just note them
                    if miscutils.fwdebug_check(3, 'FM_METAUTILS_DEBUG'):
                        miscutils.fwdebug_print(
                            f"INFO: didn't find key {key} in {hdu} header of file {fullname}"
                        )

    if miscutils.fwdebug_check(3, 'FM_METAUTILS_DEBUG'):
        miscutils.fwdebug_print("INFO: end")
    return metadata
def read_fullnames_from_listfile(listfile, linefmt, colstr):
    """ Read a list file returning fullnames from the list

        Parameters
        ----------
        listfile : str
            The file to read
        linefmt : str
            The format of the lines. Acceptable formats are

            * 'textcsv' - a csv style file
            * 'texttab' - a tab separated style file
            * 'testsp' - a space separated style file
        colstr : str
            A string representation of the column headers.

        Returns
        -------
        dict
            Dictionary of the file full names and general info.
    """
    if miscutils.fwdebug_check(3, 'INTGMISC_DEBUG'):
        miscutils.fwdebug_print('colstr=%s' % colstr)

    columns = convert_col_string_to_list(colstr, False)

    if miscutils.fwdebug_check(3, 'INTGMISC_DEBUG'):
        miscutils.fwdebug_print('columns=%s' % columns)

    # map column position -> file-section name for every '<sect>.fullname'
    # column; all other columns carry plain data and are skipped here
    fullnames = {}
    pos2fsect = {}
    for idx, colname in enumerate(columns):
        lowered = colname.lower()
        if lowered.endswith('.fullname'):
            sect = lowered[:-len('.fullname')]
            pos2fsect[idx] = sect
            fullnames[sect] = []

    if miscutils.fwdebug_check(3, 'INTGMISC_DEBUG'):
        miscutils.fwdebug_print('pos2fsect=%s' % pos2fsect)

    if linefmt in ('config', 'wcl'):
        miscutils.fwdie(
            'Error: wcl list format not currently supported (%s)' % listfile,
            1)
    else:
        separators = {'textcsv': ',', 'texttab': '\t', 'textsp': ' '}
        # parse mask is loop-invariant; compute it once
        parsemask = miscutils.CU_PARSE_PATH | miscutils.CU_PARSE_FILENAME | \
            miscutils.CU_PARSE_COMPRESSION
        with open(listfile, 'r') as listfh:
            for rawline in listfh:
                stripped = rawline.strip()

                # convert line into python list
                sep = separators.get(linefmt)
                if sep is None:
                    miscutils.fwdie('Error: unknown linefmt (%s)' % linefmt,
                                    1)
                lineinfo = miscutils.fwsplit(stripped, sep)

                # save each fullname in line, using the common routine to
                # parse the actual fullname (e.g., remove [0])
                for idx, sect in pos2fsect.items():
                    (path, filename,
                     compression) = miscutils.parse_fullname(lineinfo[idx],
                                                             parsemask)
                    fname = "%s/%s" % (path, filename)
                    if compression is not None:
                        fname += compression
                    fullnames[sect].append(fname)

    if miscutils.fwdebug_check(6, 'INTGMISC_DEBUG'):
        miscutils.fwdebug_print('fullnames = %s' % fullnames)
    return fullnames
def transfer_directory(self, relpath):
    """ Transfer a directory between two archives

        Parameters
        ----------
        relpath : str
            The directory to transfer

        Returns
        -------
        Dict of the transfer results, keyed by filename.
    """
    if miscutils.fwdebug_check(0, "ARCHIVE_TRANSFER_GLOBUSONLINE"):
        miscutils.fwdebug_print("\trelpath: %s" % relpath)

    srcpath = "%s/%s" % (self.src_archive_info['root'], relpath)
    dstpath = "%s/%s" % (self.dst_archive_info['root'], relpath)

    # locate the X509 proxy: config takes precedence over the environment
    credfile = self.config.get(X509_USER_PROXY, os.environ.get('X509_USER_PROXY'))
    if credfile is None:
        miscutils.fwdie(
            'Error: Cannot determine location of X509 proxy. Either set in config or environment.',
            1)

    proxy_valid_hrs = self.config.get(PROXY_VALID_HRS, 12)

    if GO_USER not in self.config:
        miscutils.fwdie('Error: Missing %s in config' % GO_USER, 1)

    client = globonline.DESGlobusOnline(self.src_archive_info,
                                        self.dst_archive_info, credfile,
                                        self.config[GO_USER],
                                        proxy_valid_hrs)
    _ = client.transfer_directory(srcpath, dstpath)

    # list the destination: every regular file that arrived is a success
    results = {}
    dstlisting = client.get_directory_listing(
        dstpath, self.dst_archive_info['endpoint'], True)
    for fullname, info in dstlisting.items():
        if info is None or info['type'] != 'file':
            continue
        shortname = miscutils.parse_fullname(fullname,
                                             miscutils.CU_PARSE_FILENAME)
        # include labels required by framework
        info['filesize'] = info['size']
        info['fullname'] = fullname
        results[shortname] = info

    # list the source: any regular file absent from the destination listing
    # is reported as a failed transfer
    srclisting = client.get_directory_listing(
        srcpath, self.src_archive_info['endpoint'], True)
    for fullname, info in srclisting.items():
        shortname = miscutils.parse_fullname(fullname,
                                             miscutils.CU_PARSE_FILENAME)
        if info is None or info['type'] != 'file' or shortname in results:
            continue
        info['filesize'] = info['size']
        info['fullname'] = fullname
        info['err'] = 'Unknown error'
        results[shortname] = info

    return results
def check_single_valid(keywords, fullname, verbose):  # should raise exception if not valid
    """ Check whether the given file is a valid raw file

        Parameters
        ----------
        keywords : dict
            Keywords to look for
        fullname : str
            The name of the file
        verbose : bool
            Whether or not to print out extra info to stdout

        Returns
        -------
        bool
            True when every check passes.

        Raises
        ------
        ValueError
            On a filename mismatch, unknown instrument, wrong HDU count,
            or missing required header keywords.
    """
    # check fits file
    hdulist = pyfits.open(fullname)
    try:
        prihdr = hdulist[0].header

        # check exposure has correct filename (sometimes get
        # NOAO-science-archive renamed exposures)
        correct_filename = prihdr['FILENAME']
        actual_filename = miscutils.parse_fullname(fullname,
                                                   miscutils.CU_PARSE_FILENAME)
        if actual_filename != correct_filename:
            raise ValueError('Error: invalid filename (%s)' % actual_filename)

        instrume = prihdr['INSTRUME'].lower()
        req_num_hdus = -1
        if instrume == 'decam':
            req_num_hdus = 71
        else:
            raise ValueError('Error: Unknown instrume (%s)' % instrume)

        # check # hdus
        num_hdus = len(hdulist)
        if num_hdus != req_num_hdus:
            raise ValueError('Error: Invalid number of hdus (%s)' % num_hdus)

        # check keywords in every HDU
        for hdunum in range(num_hdus):
            hdr = hdulist[hdunum].header
            (req, want, extra) = check_header_keywords(keywords, hdunum, hdr)

            if verbose > 1:
                # BUGFIX: Python 2 print statements -> print function
                if want:
                    print("HDU #%02d Missing requested keywords: %s" % (hdunum, want))
                if extra:
                    print("HDU #%02d Extra keywords: %s" % (hdunum, extra))

            if req:
                raise ValueError(
                    'Error: HDU #%02d Missing required keywords (%s)' %
                    (hdunum, req))
    finally:
        # BUGFIX: the HDU list was previously leaked on every raise path
        hdulist.close()

    return True
def get_file_archive_info_path(self, path, arname, compress_order=fmdefs.FM_PREFER_COMPRESSED):
    """ Get the archive info of a directory

        Parameters
        ----------
        path : str
            The path to probe
        arname : str
            Name of the archive to look in
        compress_order : list
            What order to look for the file in, compressed first or
            uncompressed first. Default is filemgmt_defs.FM_PREFER_COMPRESSED

        Returns
        -------
        dict
            The files and their info
    """
    # sanity checks
    if 'archive' not in self.config:
        miscutils.fwdie('Error: Missing archive section in config', 1)
    if arname not in self.config['archive']:
        miscutils.fwdie('Error: Invalid archive name (%s)' % arname, 1)
    if 'root' not in self.config['archive'][arname]:
        miscutils.fwdie('Error: Missing root in archive def (%s)' % self.config['archive'][arname], 1)
    if not isinstance(compress_order, list):
        miscutils.fwdie('Error: Invalid compress_order. It must be a list of compression extensions (including None)', 1)

    # walk archive to get all files, bucketed by compression extension
    fullnames = {}
    for p in compress_order:
        fullnames[p] = {}

    root = self.config['archive'][arname]['root']
    root = root.rstrip("/")  # canonicalize - remove trailing / to ensure

    list_by_name = {}
    for (dirpath, _, filenames) in os.walk(root + '/' + path):
        for fname in filenames:
            d = {}
            # parse mask 3: presumably CU_PARSE_FILENAME|CU_PARSE_COMPRESSION
            # -- TODO confirm against miscutils constants
            (d['filename'], d['compression']) = miscutils.parse_fullname(fname, 3)
            d['filesize'] = os.path.getsize("%s/%s" % (dirpath, fname))
            d['path'] = dirpath[len(root) + 1:]
            if d['compression'] is None:
                compext = ""
            else:
                compext = d['compression']
            d['rel_filename'] = "%s/%s%s" % (d['path'], d['filename'], compext)
            # BUGFIX: use setdefault so a file with a compression extension
            # outside compress_order no longer raises KeyError (it is simply
            # never selected in the preference loop below)
            fullnames.setdefault(d['compression'], {})[d['filename']] = d
            list_by_name[d['filename']] = True

    # BUGFIX: Python 2 print statements -> print function; .get guards
    # against compress_order lists that omit None or '.fz'
    print("uncompressed:", len(fullnames.get(None, {})))
    print("compressed:", len(fullnames.get('.fz', {})))

    # go through given list of filenames and find archive location and
    # compression, honoring the caller's compression preference order
    archiveinfo = {}
    for name in list_by_name:
        for p in compress_order:  # follow compression preference
            if name in fullnames[p]:
                archiveinfo[name] = fullnames[p][name]
                break

    print("archiveinfo = ", archiveinfo)
    return archiveinfo