def update_diskspace(dmfilestat, cached=None): """Update diskspace field in dmfilestat object""" try: # search both results directory and raw data directory search_dirs = [ dmfilestat.result.get_report_dir(), dmfilestat.result.experiment.expDir, ] if not cached: cached = dm_utils.get_walk_filelist( search_dirs, list_dir=dmfilestat.result.get_report_dir()) total_size = 0 # Create a list of files eligible to process # exclude onboard_results folder if thumbnail or if fullchip was reanalyzed from signal processing sigproc_results_dir = os.path.join(dmfilestat.result.get_report_dir(), "sigproc_results") exclude_onboard_results = dmfilestat.result.isThumbnail or ( "onboard_results" not in os.path.realpath(sigproc_results_dir)) for start_dir in search_dirs: to_process = [] if os.path.isdir(start_dir): to_process, _ = dm_utils._file_selector( start_dir, dmfilestat.dmfileset.include, dmfilestat.dmfileset.exclude, [], exclude_onboard_results, add_linked_sigproc=True, cached=cached, ) # process files in list for path in to_process[1:]: try: # logger.debug("%d %s %s" % (j, 'diskspace', path), extra = logid) if not os.path.islink(path): total_size += os.lstat(path)[6] except Exception as inst: if inst.errno == errno.ENOENT: pass else: errmsg = "update_diskspace %s" % (inst) logger.error(errmsg, extra=logid) diskspace = float(total_size) / (1024 * 1024) except: diskspace = None raise finally: dmfilestat.diskspace = diskspace dmfilestat.save() return diskspace
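# A minimal, self-contained sketch of the size-accumulation pattern used by
# update_diskspace() above: os.lstat(path)[6] is the tuple index of st_size,
# symlinks are skipped, and files that vanish between the directory walk and
# the stat call (ENOENT) are tolerated.  The tree_size_mb helper is
# illustrative, not part of this module.
import errno
import os


def tree_size_mb(start_dir):
    """Sum st_size of regular files under start_dir, in MiB."""
    total = 0
    for dirpath, _dirnames, filenames in os.walk(start_dir):
        for name in filenames:
            path = os.path.join(dirpath, name)
            try:
                if not os.path.islink(path):
                    total += os.lstat(path).st_size  # same value as os.lstat(path)[6]
            except OSError as err:
                # ENOENT means the file was removed mid-walk; anything else is real
                if err.errno != errno.ENOENT:
                    raise
    return total / (1024.0 * 1024.0)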
def copy_files_to_destination(source_dir, destination, dmfileset, cached_file_list, log, add_warning):
    to_process, to_keep = dm_utils._file_selector(
        source_dir, dmfileset.include, dmfileset.exclude, [], cached=cached_file_list)
    logger.info('[Data Import] Importing %d files from %s to %s' %
                (len(to_process), source_dir, destination))
    log('Copy files to destination: %d files, source=%s destination=%s' %
        (len(to_process), source_dir, destination))

    plugin_warnings = {}
    for filepath in to_process:
        log('%s' % filepath, flush=True)
        try:
            _copy_to_dir(filepath, source_dir, destination)
        except Exception:
            # log and ignore errors from plugin files
            if 'plugin_out' in filepath:
                plugin_name = filepath.split('plugin_out/')[1].split('_out')[0]
                plugin_warnings[plugin_name] = plugin_warnings.get(plugin_name, 0) + 1
                log(traceback.format_exc())
            else:
                raise

    for plugin, count in plugin_warnings.items():
        add_warning('Unable to copy %d files for plugin %s' % (count, plugin))

    if dmfileset.type == dmactions_types.OUT:
        # make sure we have plugin_out folder
        plugin_out = os.path.join(destination, 'plugin_out')
        if not os.path.isdir(plugin_out):
            oldmask = os.umask(0000)  # grant write permission to plugin user
            os.mkdir(plugin_out)
            os.umask(oldmask)

        # remove pdf folder, it may have incorrect permissions
        pdf_dir = os.path.join(destination, 'pdf')
        if os.path.exists(pdf_dir):
            shutil.rmtree(pdf_dir, ignore_errors=True)

    # for onboard results need to create sigproc_results link
    if dmfileset.type == dmactions_types.BASE:
        if os.path.exists(os.path.join(destination, 'onboard_results')):
            os.symlink(
                os.path.join(destination, 'onboard_results', 'sigproc_results'),
                os.path.join(destination, 'sigproc_results'))

    log('Copy files to destination %s done.' % dmfileset.type)
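# copy_files_to_destination() clears the umask before creating plugin_out so
# the directory comes out world-writable: os.mkdir applies its mode argument
# (default 0777) masked by the process umask.  A stand-alone sketch of that
# pattern; the helper name and path are only examples.
import os


def make_world_writable_dir(path):
    oldmask = os.umask(0)  # stop the umask from masking away group/other bits
    try:
        os.mkdir(path, 0o777)
    finally:
        os.umask(oldmask)  # restore the previous umask for the rest of the process

# usage: make_world_writable_dir('/tmp/plugin_out_example')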
def get_diskspace(source_dir, dmfileset, cached_file_list, add_warning):
    try:
        to_process, to_keep = dm_utils._file_selector(
            source_dir, dmfileset.include, dmfileset.exclude, [], cached=cached_file_list)
        total_size = 0
        for path in to_process:
            if not os.path.islink(path):
                total_size += os.lstat(path)[6]
        diskspace = float(total_size) / (1024 * 1024)
    except:
        logger.error(traceback.format_exc())
        add_warning('Error calculating diskspace for %s' % dmfileset.type)
        diskspace = None
    return diskspace
def update_diskspace(dmfilestat, cached=None):
    '''Update diskspace field in dmfilestat object'''
    try:
        # search both results directory and raw data directory
        search_dirs = [
            dmfilestat.result.get_report_dir(),
            dmfilestat.result.experiment.expDir
        ]

        if not cached:
            cached = dm_utils.get_walk_filelist(
                search_dirs, list_dir=dmfilestat.result.get_report_dir())

        total_size = 0

        # Create a list of files eligible to process
        is_thumbnail = dmfilestat.result.isThumbnail
        for start_dir in search_dirs:
            to_process = []
            if os.path.isdir(start_dir):
                to_process, _ = dm_utils._file_selector(
                    start_dir,
                    dmfilestat.dmfileset.include,
                    dmfilestat.dmfileset.exclude,
                    [],
                    is_thumbnail,
                    add_linked_sigproc=True,
                    cached=cached)

            # process files in list
            for path in to_process[1:]:
                try:
                    # logger.debug("%d %s %s" % (j, 'diskspace', path), extra=logid)
                    if not os.path.islink(path):
                        total_size += os.lstat(path)[6]
                except OSError as inst:
                    # OSError (not Exception) so inst.errno is guaranteed to exist
                    if inst.errno == errno.ENOENT:
                        pass
                    else:
                        errmsg = "update_diskspace %s" % (inst)
                        logger.error(errmsg, extra=logid)

        diskspace = float(total_size) / (1024 * 1024)
    except:
        diskspace = None
        raise
    finally:
        dmfilestat.diskspace = diskspace
        dmfilestat.save()
    return diskspace
def get_file_list(dmfilestat): """Return list of files selected by this DMFileStat record and list of files to not process. There are some cases in which the list of selected files contains files which should not be processed. Those are in the to_keep list.""" logger.debug("Function: %s()" % sys._getframe().f_code.co_name, extra=logid) to_process = [] to_keep = [] try: # search both results directory and raw data directory search_dirs = [dmfilestat.result.get_report_dir(), dmfilestat.result.experiment.expDir] cached_file_list = dm_utils.get_walk_filelist( search_dirs, list_dir=dmfilestat.result.get_report_dir()) except: # If this function has an error, this file set should be marked 'E' dmfilestat.setactionstate('E') logger.error(traceback.format_exc(), extra=logid) return (to_process, to_keep) try: # Determine if this file type is eligible to use a keep list kpatterns = _get_keeper_list(dmfilestat, '') # Create a list of files eligible to process is_thumbnail = dmfilestat.result.isThumbnail for start_dir in search_dirs: if os.path.isdir(start_dir): tmp_process, tmp_keep = dm_utils._file_selector(start_dir, dmfilestat.dmfileset.include, dmfilestat.dmfileset.exclude, kpatterns, is_thumbnail, cached=cached_file_list) to_process += tmp_process to_keep += tmp_keep else: logger.error(traceback.format_exc(), extra=logid) except: logger.error(traceback.format_exc(), extra=logid) return (to_process, to_keep)
def search_for_files(dmfilestats, reset, report):
    '''Look for files for the given DM category still in the filesystem.
    This is the long-lived function so we enable ctrl-c interrupt to exit
    the loop and still write the log file.
    '''
    tracking = []  # initialized outside try so the finally clause can always return it
    try:
        print("Ctrl-C to exit")
        num_dmfs = len(dmfilestats)
        for i, dmfs in enumerate(dmfilestats):
            sys.stdout.write("\r%05d/%05d %s" % (i + 1, num_dmfs, progress[i % 7]))
            sys.stdout.flush()
            to_process = []
            to_keep = []

            # For each dmfilestat object, check if files still exist in filesystem
            # 1. Do not rely on cache.filelist
            dirs = [dmfs.result.get_report_dir(), dmfs.result.experiment.expDir]
            cached_file_list = get_walk_filelist(dirs)  # walk once per dmfilestat, not per dir
            for start_dir in [d for d in dirs if os.path.isdir(d)]:
                tmp_process, tmp_keep = _file_selector(start_dir,
                                                       dmfs.dmfileset.include,
                                                       dmfs.dmfileset.exclude,
                                                       _get_keeper_list(dmfs, 'delete'),
                                                       dmfs.result.isThumbnail,
                                                       False,
                                                       cached=cached_file_list)
                to_process += tmp_process
                to_keep += tmp_keep
            orphans = list(set(to_process) - set(to_keep))

            logs = models.EventLog.objects.for_model(dmfs.result)

            # We only want to track those datasets with lots of files displaced.
            if len(orphans) > 10:
                # if dmfs.action_state in ['DD', 'AD']:  # Is it marked Deleted or Archived?
                if dmfs.action_state in ['DD']:  # Is it marked Deleted?
                    print "\nReport: %s" % (dmfs.result.resultsName)
                    print "Report Directory: %s" % dmfs.result.get_report_dir()
                    # parenthesize the conditional so it formats the whole string
                    print "Status: %s" % ('Deleted' if dmfs.action_state == 'DD' else 'Archived')
                    print "Category: %s" % dmfs.dmfileset.type
                    print "Raw Data Directory: %s" % dmfs.result.experiment.expDir
                    print "No. files: %d" % len(orphans)
                    print "Action Date: %s" % logs[len(logs) - 1].created
                    print "Action Log: %s" % logs[len(logs) - 1].text
                    tracking.append({
                        'report': dmfs.result.resultsName,
                        'report_dir': dmfs.result.get_report_dir(),
                        'state': 'Deleted' if dmfs.action_state == 'DD' else 'Archived',
                        'rawdatadir': dmfs.result.experiment.expDir,
                        'num_files': len(orphans),
                        'reset': reset,
                        'action_state': dmfs.action_state,
                        'action_date': '%s' % logs[len(logs) - 1].created,
                        'action_text': logs[len(logs) - 1].text,
                    })
                    if reset:
                        try:
                            print "Hiding the cached.filelist file"
                            cachefilename = os.path.join(dmfs.result.get_report_dir(), "cached.filelist")
                            if os.path.exists(cachefilename):
                                # os.unlink(cachefilename)
                                os.rename(cachefilename, cachefilename + ".hide")
                        except OSError:
                            print traceback.format_exc()
                        dmfs.action_state = "L" if dmfs.action_state == 'DD' else "SA"
                        dmfs.save()
                        print "Reset to %s: %s" % (dmfs.action_state, dmfs.result.resultsName)
                    if not report:
                        for entry in orphans:
                            print entry
            elif len(orphans) > 0:
                if not report:
                    print "\rLeft-overs Report: %s" % dmfs.result.resultsName
                    for entry in orphans:
                        print entry
        sys.stdout.write("\n ")
    except KeyboardInterrupt:
        pass
    except:
        print traceback.format_exc()
    finally:
        return tracking
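# search_for_files() references a module-level `progress` sequence for its
# one-line status spinner, which this excerpt does not define.  A plausible
# self-contained version (the glyph list is an assumption):
import sys
import time

progress = ['|', '/', '-', '\\', '|', '/', '-']


def demo_spinner(items):
    for i, _item in enumerate(items):
        sys.stdout.write("\r%05d/%05d %s" % (i + 1, len(items), progress[i % 7]))
        sys.stdout.flush()
        time.sleep(0.05)  # simulate per-item work
    sys.stdout.write("\n")

# usage: demo_spinner(range(20))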
def _get_file_list_dict(dmfilestat, action, user, user_comment, msg_banner):
    '''
    This function generates a list of files to process.
    '''
    logid = {'logid': "%s" % ('dmactions')}
    logger.debug("Function: %s()" % sys._getframe().f_code.co_name, extra=logid)

    if dmfilestat.isdeleted():
        errmsg = "The %s for %s are deleted" % (dmfilestat.dmfileset.type, dmfilestat.result.resultsName)
        logger.warn(errmsg, extra=logid)
        raise Exception(errmsg)
    elif dmfilestat.isarchived():
        if not os.path.exists(dmfilestat.archivepath):
            errmsg = "Cannot access backup location %s" % dmfilestat.archivepath
            logger.warn(errmsg, extra=logid)
            raise Exception(errmsg)
        else:
            # search archived directory
            search_dirs = [dmfilestat.archivepath]
    else:
        # search both results directory and raw data directory
        search_dirs = [dmfilestat.result.get_report_dir(), dmfilestat.result.experiment.expDir]

    # List of all files associated with the report
    cached_file_list = dm_utils.get_walk_filelist(
        search_dirs, list_dir=dmfilestat.result.get_report_dir())

    # Determine if this file type is eligible to use a keep list
    kpatterns = _get_keeper_list(dmfilestat, action)

    # Create a list of files eligible to process
    list_of_file_dict = []
    is_thumbnail = dmfilestat.result.isThumbnail
    add_linked_sigproc = not (action == DELETE or dmfilestat.dmfileset.type == dmactions_types.INTR)
    for start_dir in search_dirs:
        logger.debug("Searching: %s" % start_dir, extra=logid)
        to_process = []
        to_keep = []
        if os.path.isdir(start_dir):
            to_process, to_keep = dm_utils._file_selector(start_dir,
                                                          dmfilestat.dmfileset.include,
                                                          dmfilestat.dmfileset.exclude,
                                                          kpatterns,
                                                          is_thumbnail,
                                                          add_linked_sigproc,
                                                          cached=cached_file_list)
        logger.info("%d files to process at %s" %
                    (len(list(set(to_process) - set(to_keep))), start_dir), extra=logid)
        list_of_file_dict.append({
            'pk': dmfilestat.id,
            'action': action,
            'archivepath': dmfilestat.archivepath,
            'start_dir': start_dir,
            'to_process': to_process,
            'to_keep': to_keep,
            'total_cnt': len(list(set(to_process) - set(to_keep))),
            'processed_cnt': 0,
            'total_size': 0,
            'user': user,
            'user_comment': user_comment,
            'lockfile': '',
            'msg_banner': msg_banner,
        })

    return list_of_file_dict
def _process_fileset_task(dmfilestat, action, user, user_comment, lockfile, msg_banner):
    '''
    This function generates a list of files to process, then hands the list to a recursive
    celery task function.  The recursion continues until the list is empty.  The calling
    function exits immediately.
    '''
    logid = {'logid': "%s" % lockfile}
    logger.debug("Function: %s()" % sys._getframe().f_code.co_name, extra=logid)

    if dmfilestat.isdeleted():
        errmsg = "The %s for %s are deleted" % (dmfilestat.dmfileset.type, dmfilestat.result.resultsName)
        logger.warn(errmsg, extra=logid)
        raise Exception(errmsg)
    elif dmfilestat.isarchived():
        if not os.path.exists(dmfilestat.archivepath):
            errmsg = "Cannot access backup location %s" % dmfilestat.archivepath
            logger.warn(errmsg, extra=logid)
            raise Exception(errmsg)
        else:
            # search archived directory
            search_dirs = [dmfilestat.archivepath]
    else:
        # search both results directory and raw data directory
        search_dirs = [dmfilestat.result.get_report_dir(), dmfilestat.result.experiment.expDir]

    # Create a lock file here to block any other actions on this report (see TS-8411)
    lock_id = "%s_%s" % (dmfilestat.result.resultsName, dm_utils.slugify(dmfilestat.dmfileset.type))
    locallock = TaskLock(lock_id, timeout=60)  # short timeout in case lock release code doesn't get called

    if not locallock.lock():
        logger.warn("lock file exists: %s(%s)" % (lock_id, locallock.get()), extra=logid)
        # Release the task lock
        try:
            applock = TaskLock(lockfile)
            applock.unlock()
        except:
            logger.error(traceback.format_exc(), extra=logid)
        return

    logger.info("lock file created: %s(%s)" % (lock_id, locallock.get()), extra=logid)

    if action == ARCHIVE:
        dmfilestat.setactionstate('AG')
    elif action == DELETE:
        dmfilestat.setactionstate('DG')
    elif action == EXPORT:
        dmfilestat.setactionstate('EG')

    # List of all files associated with the report
    cached_file_list = dm_utils.get_walk_filelist(
        search_dirs, list_dir=dmfilestat.result.get_report_dir())

    # Determine if this file type is eligible to use a keep list
    kpatterns = _get_keeper_list(dmfilestat, action)

    # Create a list of files eligible to process
    list_of_file_dict = []
    is_thumbnail = dmfilestat.result.isThumbnail
    add_linked_sigproc = not (action == DELETE or dmfilestat.dmfileset.type == dmactions_types.INTR)
    for start_dir in search_dirs:
        logger.debug("Searching: %s" % start_dir, extra=logid)
        to_process = []
        to_keep = []
        if os.path.isdir(start_dir):
            to_process, to_keep = dm_utils._file_selector(start_dir,
                                                          dmfilestat.dmfileset.include,
                                                          dmfilestat.dmfileset.exclude,
                                                          kpatterns,
                                                          is_thumbnail,
                                                          add_linked_sigproc,
                                                          cached=cached_file_list)
        logger.info("%d files to process at %s" %
                    (len(list(set(to_process) - set(to_keep))), start_dir), extra=logid)
        list_of_file_dict.append({
            'pk': dmfilestat.id,
            'action': action,
            'archivepath': dmfilestat.archivepath,
            'start_dir': start_dir,
            'to_process': to_process,
            'to_keep': to_keep,
            'total_cnt': len(list(set(to_process) - set(to_keep))),
            'processed_cnt': 0,
            'total_size': 0,
            'user': user,
            'user_comment': user_comment,
            'lockfile': lockfile,
            'msg_banner': msg_banner,
        })

    try:
        pfilename = set_action_param_var(list_of_file_dict)
        # Call the recursive celery task function to process the list
        _process_task.delay(pfilename)
    except:
        logger.error("We got an error here, _process_fileset_task", extra=logid)
        raise
    finally:
        if locallock:
            locallock.unlock()

    return
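# _process_fileset_task() bails out early when the per-report lock is already
# held, and otherwise releases it in a finally block.  The same guard shape,
# shown with a minimal file-based lock; FileLock and guarded_action are
# illustrative stand-ins for TaskLock and the task body, not project code.
import os


class FileLock(object):
    """Advisory lock: the lock is held while the lock file exists."""

    def __init__(self, path):
        self.path = path

    def lock(self):
        try:
            # O_EXCL makes creation atomic: exactly one caller can win
            fd = os.open(self.path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        except OSError:
            return False  # another caller holds the lock
        os.close(fd)
        return True

    def unlock(self):
        try:
            os.remove(self.path)
        except OSError:
            pass  # already released


def guarded_action(lock_path, action):
    lock = FileLock(lock_path)
    if not lock.lock():
        return False  # early exit, mirroring the "lock file exists" branch above
    try:
        action()
    finally:
        lock.unlock()
    return True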