def update_dmfilestat_diskusage(resultpk):
    '''
    Task to update DMFileStat.diskspace for all associated with this resultpk
    This task is launched at the end of pipeline execution.
    NOTE: This can be a long-lived task
    '''
    logid = {'logid': "%s" % ('tasks')}
    try:
        result = Results.objects.get(pk=resultpk)
        search_dirs = [result.get_report_dir(), result.experiment.expDir]
        cached_file_list = dm_utils.get_walk_filelist(
            search_dirs, list_dir=result.get_report_dir(), save_list=True)
        for dmtype in FILESET_TYPES:
            dmfilestat = result.get_filestat(dmtype)
            dmfilestat_utils.update_diskspace(dmfilestat, cached=cached_file_list)
    except SoftTimeLimitExceeded:
        logger.warn("Time exceeded update_diskusage for (%d) %s"
                    % (resultpk, result.resultsName), extra=logid)

    try:
        disk_total = 0
        for dmfilestat in [result.get_filestat(dmtype) for dmtype in FILESET_TYPES]:
            if dmfilestat.dmfileset.type == dmactions_types.SIG:
                dmfilestat.result.experiment.diskusage = \
                    dmfilestat.diskspace if dmfilestat.diskspace is not None else 0
                dmfilestat.result.experiment.save()
            else:
                partial = dmfilestat.diskspace
                disk_total += int(partial) if partial is not None else 0
        result.diskusage = disk_total
        result.save()
        # See dmaction._update_related_objects() which also updates Exp & Results diskusage fields
    except Exception:
        logger.error(traceback.format_exc(), extra=logid)
        raise
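# Usage sketch: like _process_task further below, this function is presumably
# registered as a Celery task (the decorator does not appear in this excerpt), so
# end-of-pipeline code can enqueue the update asynchronously instead of blocking
# on the directory walk. Hypothetical helper, not part of the source:
def _queue_diskusage_update(result):
    try:
        # .delay() schedules the task on a Celery worker
        update_dmfilestat_diskusage.delay(result.pk)
    except AttributeError:
        # plain function in this build (assumption); run it synchronously
        update_dmfilestat_diskusage(result.pk)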
def update_dmfilestats_diskspace(dmfilestat):
    ''' Task to update DMFileStat.diskspace '''
    logid = {'logid': "%s" % ('tasks')}  # same logging context as the other tasks here
    search_dirs = [dmfilestat.result.get_report_dir(), dmfilestat.result.experiment.expDir]
    try:
        cached_file_list = dm_utils.get_walk_filelist(
            search_dirs, list_dir=dmfilestat.result.get_report_dir())
        dmfilestat_utils.update_diskspace(dmfilestat, cached=cached_file_list)
    except Exception:
        # logger.exception() records the active traceback itself
        logger.exception("update_dmfilestats_diskspace failed", extra=logid)
def update_diskspace(dmfilestat, cached=None):
    """Update diskspace field in dmfilestat object"""
    logid = {'logid': "%s" % ('dmfilestat_utils')}  # assumed module logging context
    diskspace = None
    try:
        # search both results directory and raw data directory
        search_dirs = [
            dmfilestat.result.get_report_dir(),
            dmfilestat.result.experiment.expDir,
        ]

        if not cached:
            cached = dm_utils.get_walk_filelist(
                search_dirs, list_dir=dmfilestat.result.get_report_dir())

        total_size = 0

        # Create a list of files eligible to process
        # exclude onboard_results folder if thumbnail or if fullchip was reanalyzed
        # from signal processing
        sigproc_results_dir = os.path.join(dmfilestat.result.get_report_dir(), "sigproc_results")
        exclude_onboard_results = dmfilestat.result.isThumbnail or (
            "onboard_results" not in os.path.realpath(sigproc_results_dir))

        for start_dir in search_dirs:
            to_process = []
            if os.path.isdir(start_dir):
                to_process, _ = dm_utils._file_selector(
                    start_dir,
                    dmfilestat.dmfileset.include,
                    dmfilestat.dmfileset.exclude,
                    [],
                    exclude_onboard_results,
                    add_linked_sigproc=True,
                    cached=cached,
                )

            # process files in list; the selector's first entry is skipped
            for path in to_process[1:]:
                try:
                    if not os.path.islink(path):
                        total_size += os.lstat(path).st_size
                except OSError as inst:
                    if inst.errno == errno.ENOENT:
                        # file disappeared between the walk and the stat; ignore
                        pass
                    else:
                        errmsg = "update_diskspace %s" % (inst)
                        logger.error(errmsg, extra=logid)

        diskspace = float(total_size) / (1024 * 1024)  # bytes to MB
    except Exception:
        diskspace = None
        raise
    finally:
        dmfilestat.diskspace = diskspace
        dmfilestat.save()
    return diskspace
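# Illustration (temporary, hypothetical paths): the exclude_onboard_results test
# above relies on sigproc_results being a symlink into onboard_results/ when the
# signal processing output came from on-instrument analysis; a fullchip reanalyzed
# from signal processing has a real sigproc_results directory, so realpath() lacks
# the substring and the folder is excluded.
def _demo_exclude_onboard_results():
    import os
    import shutil
    import tempfile

    report_dir = tempfile.mkdtemp()
    try:
        os.makedirs(os.path.join(report_dir, "onboard_results", "sigproc_results"))
        os.symlink(os.path.join(report_dir, "onboard_results", "sigproc_results"),
                   os.path.join(report_dir, "sigproc_results"))
        # the link resolves into onboard_results/, so the exclusion is NOT applied
        assert "onboard_results" in os.path.realpath(
            os.path.join(report_dir, "sigproc_results"))
    finally:
        shutil.rmtree(report_dir)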
def backfill_dmfilestats_diskspace():
    '''
    Backfill records with DMFileStat.diskspace = None, one at a time
    These could be older data sets or new ones where update_diskusage task failed
    '''
    logid = {'logid': "%s" % ('tasks')}
    dmfilestats = DMFileStat.objects.filter(
        diskspace=None, action_state='L', files_in_use='').order_by('-created')
    if dmfilestats.count() > 0:
        dmfilestat = dmfilestats[0]
        search_dirs = [dmfilestat.result.get_report_dir(), dmfilestat.result.experiment.expDir]
        try:
            cached_file_list = dm_utils.get_walk_filelist(
                search_dirs, list_dir=dmfilestat.result.get_report_dir(), save_list=True)
            dmfilestat_utils.update_diskspace(dmfilestat, cached=cached_file_list)
        except Exception:
            logger.error(traceback.format_exc(), extra=logid)
            raise
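# Usage sketch: the task intentionally backfills a single record per invocation
# (newest first), so it is presumably run on a schedule. A manual drain
# (hypothetical helper, not part of the source) could simply repeat it until no
# eligible records remain; a failing record raises and stops the loop:
def _drain_backfill():
    while DMFileStat.objects.filter(diskspace=None, action_state='L',
                                    files_in_use='').exists():
        backfill_dmfilestats_diskspace()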
def get_file_list(dmfilestat):
    """Return list of files selected by this DMFileStat record and list of files to not process.
    There are some cases in which the list of selected files contains files which should not be
    processed.  Those are in the to_keep list."""
    logid = {'logid': "%s" % ('dmactions')}  # same logging context as _get_file_list_dict
    logger.debug("Function: %s()" % sys._getframe().f_code.co_name, extra=logid)

    to_process = []
    to_keep = []
    try:
        # search both results directory and raw data directory
        search_dirs = [dmfilestat.result.get_report_dir(), dmfilestat.result.experiment.expDir]
        cached_file_list = dm_utils.get_walk_filelist(
            search_dirs, list_dir=dmfilestat.result.get_report_dir())
    except Exception:
        # If this function has an error, this file set should be marked 'E'
        dmfilestat.setactionstate('E')
        logger.error(traceback.format_exc(), extra=logid)
        return (to_process, to_keep)

    try:
        # Determine if this file type is eligible to use a keep list
        kpatterns = _get_keeper_list(dmfilestat, '')

        # Create a list of files eligible to process
        is_thumbnail = dmfilestat.result.isThumbnail
        for start_dir in search_dirs:
            if os.path.isdir(start_dir):
                tmp_process, tmp_keep = dm_utils._file_selector(
                    start_dir,
                    dmfilestat.dmfileset.include,
                    dmfilestat.dmfileset.exclude,
                    kpatterns,
                    is_thumbnail,
                    cached=cached_file_list)
                to_process += tmp_process
                to_keep += tmp_keep
            else:
                logger.error("Search directory does not exist: %s" % start_dir, extra=logid)
    except Exception:
        logger.error(traceback.format_exc(), extra=logid)

    return (to_process, to_keep)
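# Usage sketch: callers treat the two lists as candidates minus exceptions; the
# files actually eligible for an action are the set difference, just as
# search_for_files() below computes its orphan list. Hypothetical helper:
def _eligible_files(dmfilestat):
    to_process, to_keep = get_file_list(dmfilestat)
    return sorted(set(to_process) - set(to_keep))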
def search_for_files(dmfilestats, reset, report):
    '''Look for files for the given DM category still in the filesystem.
    This is the long-lived function so we enable ctrl-c interrupt to exit the loop
    and still write the log file.
    '''
    tracking = []
    try:
        print("Ctrl-C to exit")
        num_dmfs = len(dmfilestats)
        for i, dmfs in enumerate(dmfilestats):
            # 'progress' is assumed to be a module-level spinner sequence
            sys.stdout.write("\r%05d/%05d %s" % (i + 1, num_dmfs, progress[i % 7]))
            sys.stdout.flush()
            to_process = []
            to_keep = []

            # For each dmfilestat object, check if files still exist in filesystem
            # 1. Do not rely on cache.filelist
            dirs = [dmfs.result.get_report_dir(), dmfs.result.experiment.expDir]
            cached_file_list = get_walk_filelist(dirs)  # walk once, reuse for both dirs
            for start_dir in [d for d in dirs if os.path.isdir(d)]:
                tmp_process, tmp_keep = _file_selector(
                    start_dir,
                    dmfs.dmfileset.include,
                    dmfs.dmfileset.exclude,
                    _get_keeper_list(dmfs, 'delete'),
                    dmfs.result.isThumbnail,
                    False,
                    cached=cached_file_list)
                to_process += tmp_process
                to_keep += tmp_keep

            orphans = list(set(to_process) - set(to_keep))
            logs = models.EventLog.objects.for_model(dmfs.result)

            # We only want to track those datasets with lots of files displaced.
            if len(orphans) > 10:
                # if dmfs.action_state in ['DD', 'AD']:
                if dmfs.action_state in ['DD']:  # Is it marked Deleted?
                    state = 'Deleted' if dmfs.action_state == 'DD' else 'Archived'
                    print("\nReport: %s" % dmfs.result.resultsName)
                    print("Report Directory: %s" % dmfs.result.get_report_dir())
                    print("Status: %s" % state)
                    print("Category: %s" % dmfs.dmfileset.type)
                    print("Raw Data Directory: %s" % dmfs.result.experiment.expDir)
                    print("No. files: %d" % len(orphans))
                    print("Action Date: %s" % logs[len(logs) - 1].created)
                    print("Action Log: %s" % logs[len(logs) - 1].text)
                    tracking.append({
                        'report': dmfs.result.resultsName,
                        'report_dir': dmfs.result.get_report_dir(),
                        'state': state,
                        'rawdatadir': dmfs.result.experiment.expDir,
                        'num_files': len(orphans),
                        'reset': reset,
                        'action_state': dmfs.action_state,
                        'action_date': '%s' % logs[len(logs) - 1].created,
                        'action_text': logs[len(logs) - 1].text,
                    })
                    if reset:
                        try:
                            print("Hiding the cached.filelist file")
                            cachefilename = os.path.join(dmfs.result.get_report_dir(),
                                                         "cached.filelist")
                            if os.path.exists(cachefilename):
                                # os.unlink(cachefilename)
                                os.rename(cachefilename, cachefilename + ".hide")
                        except OSError:
                            print(traceback.format_exc())
                        dmfs.action_state = "L" if dmfs.action_state == 'DD' else "SA"
                        dmfs.save()
                        print("Reset to %s: %s" % (dmfs.action_state, dmfs.result.resultsName))
                    if not report:
                        for entry in orphans:
                            print(entry)
            elif len(orphans) > 0:
                if not report:
                    print("\rLeft-overs Report: %s" % dmfs.result.resultsName)
                    for entry in orphans:
                        print(entry)
        sys.stdout.write("\n ")
    except KeyboardInterrupt:
        pass
    except Exception:
        print(traceback.format_exc())
    finally:
        return tracking
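# Usage sketch (hypothetical wrapper and file name): the docstring's promise that
# the log is still written after Ctrl-C holds because search_for_files() returns
# the partial tracking list from its finally block; a caller can then persist it:
def _write_tracking_log(dmfilestats, reset=False, report=False,
                        logpath="search_for_files.json"):
    import json

    tracking = search_for_files(dmfilestats, reset, report)
    with open(logpath, "w") as fp:
        json.dump(tracking, fp, indent=2)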
def data_import(name, selected, username, copy_data=False, copy_report=True):
    '''
    Data import main task.
    Selected dict contains categories to import and path to their serialized json
    Log file can be used to display progress on webpage.
    Copy options:
        if copy_data=True copy Signal Processing or Basecalling Input files to local drive,
            otherwise mark these categories Archived
        if copy_report=True copy Output files to local drive,
            otherwise mark it Archived and copy/create report.pdf
    '''
    importing = ImportData(name, selected, username, copy_data, copy_report)
    importing.start()
    importing.log('Selected: %s, copy data: %s, copy report: %s.'
                  % (importing.selected_str, copy_data, copy_report))
    logger.info('[Data Import] (%s) Started import %s using %s, copy data: %s, copy report: %s.'
                % (name, importing.selected_str, importing.json_path, copy_data, copy_report))

    # create DB records
    try:
        objs = load_serialized_json(importing.json_path, importing.createResult,
                                    importing.log, importing.add_warning)
        result = objs.get('results', None)
        exp = objs['experiment']
        importing.update_destinations(result, exp)
    except Exception as e:
        msg = traceback.format_exc()
        importing.fail(e, msg)
        return

    if result:
        dmfilestats_to_import = result.dmfilestat_set.filter(dmfileset__type__in=selected.keys())
        # check if importing is allowed
        for dmfilestat in dmfilestats_to_import:
            if dmfilestat.in_process():
                error = "In Process: %s status is %s" % (
                    dmfilestat.dmfileset.type, dmfilestat.get_action_state_display())
                importing.fail(error, error)
        # set status
        dmfilestats_to_import.update(action_state='IG')
        result.status = 'Importing'
        result.save()
        EventLog.objects.add_entry(
            result, "Importing %s %s." % (name, importing.selected_str), username)

    # get list of files
    file_list = []
    if copy_data or copy_report:
        source_paths = set([c['src_path'] for c in importing.categories if c['copy_files']])
        file_list = get_walk_filelist(list(source_paths), list_dir=False, save_list=False)

    # calculate dmfilestat diskspace
    importing.update_diskspace(file_list, result)
    importing.log('Selected categories:' + json.dumps(importing.categories, indent=1))

    # destination validation
    try:
        validate_destination(importing.categories)
    except Exception as e:
        msg = traceback.format_exc()
        importing.fail(e, msg, result)

    # copy files to destination
    for category in importing.categories:
        dmtype = category['dmtype']
        source_dir = category['src_path']
        destination = category['dest_path']
        if result:
            dmfilestat = result.get_filestat(dmtype)
            dmfileset = dmfilestat.dmfileset
        else:
            dmfilestat = None
            dmfileset = DMFileSet.objects.get(version=RELVERSION, type=dmtype)

        # process files
        if category['copy_files']:
            importing.log('Start processing files for %s.' % dmtype)
            if not os.path.exists(source_dir):
                error = "Source path %s does not exist, exiting." % source_dir
                importing.fail(error, error, result)
                return
            try:
                copy_files_to_destination(source_dir, destination, dmfileset, file_list,
                                          importing.log, importing.add_warning)
            except Exception as e:
                msg = traceback.format_exc()
                importing.fail(e, msg, result)
                return
        elif dmtype == dmactions_types.OUT:
            # special case: importing Report as Archived (copy_report=False)
            try:
                generate_report_pdf(source_dir, result, dmfilestat,
                                    importing.log, importing.add_warning)
            except Exception:
                importing.add_warning('Failed to generate report pdf')
                importing.log(traceback.format_exc())

    # update database objects; DM state is Local if files copied, otherwise Archived
    importing.log('Updating location of imported files')
    if result:
        for category in importing.categories:
            dmfilestat = result.get_filestat(category['dmtype'])
            if category['copy_files']:
                dmfilestat.action_state = 'L'
                dmfilestat.created = timezone.now()
            else:
                # data files left on media, need to update dmfilestat to archived location
                dmfilestat.action_state = 'AD'
                dmfilestat.archivepath = category['src_path']
                dmfilestat.diskspace = category['diskspace']
            dmfilestat.save()
        result.status = 'Completed'
        result.save()
    elif dmactions_types.SIG in selected.keys() and not copy_data:
        # only Sigproc imported (no dmfilestats) and data files not copied
        exp.expDir = os.path.dirname(importing.json_path)
        exp.save()

    # finish up
    importing.finish(result, exp)
    logger.info('[Data Import] (%s) Done.' % name)
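# Usage sketch (hypothetical values): per the docstring, `selected` maps each DM
# category to the serialized json that describes it. Importing a report plus its
# raw signal data, archiving the data in place but copying the report locally,
# might look like:
def _example_import():
    selected = {
        dmactions_types.OUT: '/media/archive/MyRun/serialized_MyRun.json',
        dmactions_types.SIG: '/media/archive/MyRun/serialized_MyRun.json',
    }
    data_import('MyRun', selected, 'ionadmin', copy_data=False, copy_report=True)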
def _get_file_list_dict(dmfilestat, action, user, user_comment, msg_banner):
    '''
    This function generates a list of files to process.
    '''
    logid = {'logid': "%s" % ('dmactions')}
    logger.debug("Function: %s()" % sys._getframe().f_code.co_name, extra=logid)

    if dmfilestat.isdeleted():
        errmsg = "The %s for %s are deleted" % (dmfilestat.dmfileset.type,
                                                dmfilestat.result.resultsName)
        logger.warn(errmsg, extra=logid)
        raise Exception(errmsg)
    elif dmfilestat.isarchived():
        if not os.path.exists(dmfilestat.archivepath):
            errmsg = "Cannot access backup location %s" % dmfilestat.archivepath
            logger.warn(errmsg, extra=logid)
            raise Exception(errmsg)
        else:
            # search archived directory
            search_dirs = [dmfilestat.archivepath]
    else:
        # search both results directory and raw data directory
        search_dirs = [dmfilestat.result.get_report_dir(), dmfilestat.result.experiment.expDir]

    # List of all files associated with the report
    cached_file_list = dm_utils.get_walk_filelist(
        search_dirs, list_dir=dmfilestat.result.get_report_dir())

    # Determine if this file type is eligible to use a keep list
    kpatterns = _get_keeper_list(dmfilestat, action)

    # Create a list of files eligible to process
    list_of_file_dict = []
    is_thumbnail = dmfilestat.result.isThumbnail
    add_linked_sigproc = not (action == DELETE or
                              dmfilestat.dmfileset.type == dmactions_types.INTR)
    for start_dir in search_dirs:
        logger.debug("Searching: %s" % start_dir, extra=logid)
        to_process = []
        to_keep = []
        if os.path.isdir(start_dir):
            to_process, to_keep = dm_utils._file_selector(
                start_dir,
                dmfilestat.dmfileset.include,
                dmfilestat.dmfileset.exclude,
                kpatterns,
                is_thumbnail,
                add_linked_sigproc,
                cached=cached_file_list)
        logger.info("%d files to process at %s"
                    % (len(list(set(to_process) - set(to_keep))), start_dir), extra=logid)
        list_of_file_dict.append({
            'pk': dmfilestat.id,
            'action': action,
            'archivepath': dmfilestat.archivepath,
            'start_dir': start_dir,
            'to_process': to_process,
            'to_keep': to_keep,
            'total_cnt': len(list(set(to_process) - set(to_keep))),
            'processed_cnt': 0,
            'total_size': 0,
            'user': user,
            'user_comment': user_comment,
            'lockfile': '',
            'msg_banner': msg_banner,
        })
    return list_of_file_dict
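# Usage sketch: each dict describes one search root for a single DMFileStat action;
# a consumer works through 'to_process' while honoring 'to_keep' and updates the
# bookkeeping counters as it goes. A trivial example reader (hypothetical helper):
def _count_pending(list_of_file_dict):
    return sum(d['total_cnt'] - d['processed_cnt'] for d in list_of_file_dict)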
def _process_fileset_task(dmfilestat, action, user, user_comment, lockfile, msg_banner):
    '''
    This function generates a list of files to process, then hands the list to a
    recursive celery task function.  The recursion continues until the list is
    empty.  The calling function exits immediately.
    '''
    logid = {'logid': "%s" % (lockfile)}
    logger.debug("Function: %s()" % sys._getframe().f_code.co_name, extra=logid)

    if dmfilestat.isdeleted():
        errmsg = "The %s for %s are deleted" % (dmfilestat.dmfileset.type,
                                                dmfilestat.result.resultsName)
        logger.warn(errmsg, extra=logid)
        raise Exception(errmsg)
    elif dmfilestat.isarchived():
        if not os.path.exists(dmfilestat.archivepath):
            errmsg = "Cannot access backup location %s" % dmfilestat.archivepath
            logger.warn(errmsg, extra=logid)
            raise Exception(errmsg)
        else:
            # search archived directory
            search_dirs = [dmfilestat.archivepath]
    else:
        # search both results directory and raw data directory
        search_dirs = [dmfilestat.result.get_report_dir(), dmfilestat.result.experiment.expDir]

    # Create a lock file here to block any other actions on this report (see TS-8411)
    lock_id = "%s_%s" % (dmfilestat.result.resultsName,
                         dm_utils.slugify(dmfilestat.dmfileset.type))
    # short timeout in case lock release code doesn't get called
    locallock = TaskLock(lock_id, timeout=60)
    if not locallock.lock():
        logger.warn("lock file exists: %s(%s)" % (lock_id, locallock.get()), extra=logid)
        # Release the task lock
        try:
            applock = TaskLock(lockfile)
            applock.unlock()
        except Exception:
            logger.error(traceback.format_exc(), extra=logid)
        return

    logger.info("lock file created: %s(%s)" % (lock_id, locallock.get()), extra=logid)

    if action == ARCHIVE:
        dmfilestat.setactionstate('AG')
    elif action == DELETE:
        dmfilestat.setactionstate('DG')
    elif action == EXPORT:
        dmfilestat.setactionstate('EG')

    # List of all files associated with the report
    cached_file_list = dm_utils.get_walk_filelist(
        search_dirs, list_dir=dmfilestat.result.get_report_dir())

    # Determine if this file type is eligible to use a keep list
    kpatterns = _get_keeper_list(dmfilestat, action)

    # Create a list of files eligible to process
    list_of_file_dict = []
    is_thumbnail = dmfilestat.result.isThumbnail
    add_linked_sigproc = not (action == DELETE or
                              dmfilestat.dmfileset.type == dmactions_types.INTR)
    for start_dir in search_dirs:
        logger.debug("Searching: %s" % start_dir, extra=logid)
        to_process = []
        to_keep = []
        if os.path.isdir(start_dir):
            to_process, to_keep = dm_utils._file_selector(
                start_dir,
                dmfilestat.dmfileset.include,
                dmfilestat.dmfileset.exclude,
                kpatterns,
                is_thumbnail,
                add_linked_sigproc,
                cached=cached_file_list)
        logger.info("%d files to process at %s"
                    % (len(list(set(to_process) - set(to_keep))), start_dir), extra=logid)
        list_of_file_dict.append({
            'pk': dmfilestat.id,
            'action': action,
            'archivepath': dmfilestat.archivepath,
            'start_dir': start_dir,
            'to_process': to_process,
            'to_keep': to_keep,
            'total_cnt': len(list(set(to_process) - set(to_keep))),
            'processed_cnt': 0,
            'total_size': 0,
            'user': user,
            'user_comment': user_comment,
            'lockfile': lockfile,
            'msg_banner': msg_banner,
        })

    try:
        pfilename = set_action_param_var(list_of_file_dict)
        # Call the recursive celery task function to process the list
        _process_task.delay(pfilename)
    except Exception:
        logger.error("We got an error here, _process_fileset_task", extra=logid)
        raise
    finally:
        if locallock:
            locallock.unlock()
    return
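# Sketch of the locking convention used above, built only from the TaskLock methods
# that appear in this module (lock/get/unlock): take a per-report lock, bail out
# early if another action holds it, and release in a finally block so a failure
# cannot leave the report locked. Hypothetical helper, not part of the source:
def _with_report_lock(result_name, fileset_type, work):
    lock_id = "%s_%s" % (result_name, dm_utils.slugify(fileset_type))
    locallock = TaskLock(lock_id, timeout=60)
    if not locallock.lock():
        return False  # another action owns this report/category
    try:
        work()
    finally:
        locallock.unlock()
    return True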
def process_import(importing, copy_data, copy_report):
    # create DB records; errors propagate to the caller
    objs = load_serialized_json(
        importing.json_path,
        importing.createResult,
        importing.log,
        importing.add_warning,
    )
    result = objs.get("results", None)
    exp = objs["experiment"]
    importing.update_destinations(result, exp)

    if result:
        dmfilestats_to_import = result.dmfilestat_set.filter(
            dmfileset__type__in=importing.dmtypes)
        # check if importing is allowed
        for dmfilestat in dmfilestats_to_import:
            if dmfilestat.action_state in ["AG", "DG", "EG", "SA", "SE", "SD"]:
                raise Exception("Cannot import %s when data is in process: %s"
                                % (dmfilestat.dmfileset.type,
                                   dmfilestat.get_action_state_display()))
        # set status
        dmfilestats_to_import.update(action_state="IG")
        result.status = "Importing"
        result.save()
        EventLog.objects.add_entry(
            result,
            "Importing %s %s." % (importing.name, importing.selected_str),
            importing.user,
        )

    # get list of files
    file_list = []
    if copy_data or copy_report:
        source_paths = set([c["src_path"] for c in importing.categories if c["copy_files"]])
        file_list = get_walk_filelist(list(source_paths), list_dir=False, save_list=False)

    # calculate dmfilestat diskspace
    importing.update_diskspace(file_list)
    importing.log("Selected categories:" + json.dumps(importing.categories, indent=1))

    # destination validation; errors propagate to the caller
    validate_destination(importing.categories)

    # copy files to destination
    for category in importing.categories:
        dmtype = category["dmtype"]
        source_dir = category["src_path"]
        destination = category["dest_path"]
        if result:
            dmfilestat = result.get_filestat(dmtype)
            dmfileset = dmfilestat.dmfileset
        else:
            dmfilestat = None
            dmfileset = DMFileSet.objects.get(version=RELVERSION, type=dmtype)

        # process files
        if category["copy_files"]:
            importing.log("Start processing files for %s." % dmtype)
            if not os.path.exists(source_dir):
                raise Exception("Source path %s does not exist, exiting." % source_dir)
            copy_files_to_destination(
                source_dir,
                destination,
                dmfileset,
                file_list,
                importing.log,
                importing.add_warning,
            )

            # for OIA results need sigproc_results link
            if dmtype == dmactions_types.BASE and os.path.exists(
                    os.path.join(destination, "onboard_results")):
                if result:
                    sigproc_results_link = os.path.join(result.get_report_dir(),
                                                        "sigproc_results")
                else:
                    sigproc_results_link = os.path.join(destination, "sigproc_results")
                if not os.path.exists(sigproc_results_link):
                    os.symlink(
                        os.path.join(destination, "onboard_results", "sigproc_results"),
                        sigproc_results_link,
                    )
        elif dmtype == dmactions_types.OUT:
            # special case: importing Report as Archived (copy_report=False)
            try:
                generate_report_pdf(source_dir, result, dmfilestat,
                                    importing.log, importing.add_warning)
            except Exception:
                importing.add_warning("Failed to generate report pdf")
                importing.log(traceback.format_exc())

    # update database objects; DM state is Local if files copied, otherwise Archived
    importing.log("Updating location of imported files")
    if result:
        for category in importing.categories:
            dmfilestat = result.get_filestat(category["dmtype"])
            if category["copy_files"]:
                dmfilestat.action_state = "L"
                dmfilestat.created = timezone.now()
            else:
                # data files left on media, need to update dmfilestat to archived location
                dmfilestat.action_state = "AD"
                dmfilestat.archivepath = category["src_path"]
                dmfilestat.diskspace = category["diskspace"]
            dmfilestat.save()
        result.status = "Completed"
        result.save()

        # update all diskusage related database entries
        try:
            update_dmfilestat_diskusage(result.pk)
        except Exception:
            importing.log("Error updating diskusage")
            logger.error(traceback.format_exc())
    elif dmactions_types.SIG in importing.dmtypes:
        if copy_data:
            # if any results exist for this data set, need to update their dmfilestats
            DMFileStat.objects.filter(
                dmfileset__type=dmactions_types.SIG,
                result__experiment=exp).update(action_state="L")
        else:
            # only Sigproc imported (no dmfilestats) and data files not copied
            exp.expDir = os.path.dirname(importing.json_path)
            exp.save()