def check_days(site, days, config):
    """Pick the number of days to use as cleanup threshold for a site.

    The default threshold is read from ``config["cleanup"][site]["days"]``;
    a missing key is propagated to the caller as ``KeyError``.

    :param str site: site to be cleaned, used as key in the 'cleanup' config
    :param int days: days given on the command line, falsy when not given
    :param dict config: config file parsed and saved as dictionary
    :returns: the default when 'days' is falsy, 'days' when it is at least
              the default or when the user confirms a smaller value, and
              None when the user declines a smaller value
    """
    threshold = config["cleanup"][site]["days"]
    if not days:
        return threshold
    if days >= threshold:
        return days
    # a value below the default must be confirmed interactively
    question = (
        "Seems like given days({}) is less than the "
        " default({}), are you sure to proceed ?".format(days, threshold)
    )
    confirmed = misc.query_yes_no(question, default="no")
    return days if confirmed else None
def check_default(site, seconds, config):
    """Pick the time threshold (in seconds) to use for cleaning a site.

    The default number of days is read from
    ``config['cleanup']['milou'][site]['days']`` and converted with
    ``misc.to_seconds``; a ``KeyError`` raised while doing so is propagated
    to the caller.

    :param str site: site to be cleaned, used as key in the config
    :param int seconds: time given on the command line converted to seconds,
                        falsy when not given
    :param dict config: config file parsed and saved as dictionary
    :returns: the default (as seconds) when 'seconds' is falsy, 'seconds'
              when it is at least the default or when the user confirms a
              smaller value, and None when the user declines
    """
    threshold_days = config['cleanup']['milou'][site]['days']
    threshold_seconds = misc.to_seconds(days=threshold_days)
    if not seconds:
        return threshold_seconds
    if seconds >= threshold_seconds:
        return seconds
    # a value below the default must be confirmed interactively
    question = ("Seems like given time is less than the "
                " default({}) days, are you sure to proceed ?".format(threshold_days))
    if misc.query_yes_no(question, default="no"):
        return seconds
    return None
def check_default(site, seconds, config):
    """Return the cleanup threshold in seconds for the given site.

    Reads the default days from ``config['cleanup']['milou'][site]['days']``
    (a ``KeyError`` is passed on to the caller) and converts them with
    ``misc.to_seconds``. A value given on the command line that is smaller
    than the default is only accepted after an interactive confirmation.

    :param str site: site to be cleaned, used as key in the config
    :param int seconds: Days/hours converted as seconds, falsy if not given
    :param dict config: config file parsed and saved as dictionary
    :returns: seconds to use as threshold, or None if the user declined
    """
    days_by_default = config['cleanup']['milou'][site]['days']
    seconds_by_default = misc.to_seconds(days=days_by_default)

    if not seconds:
        chosen = seconds_by_default
    elif seconds >= seconds_by_default:
        chosen = seconds
    else:
        accepted = misc.query_yes_no("Seems like given time is less than the "
                                     " default({}) days, are you sure to proceed ?"
                                     .format(days_by_default), default="no")
        chosen = seconds if accepted else None
    return chosen
def check_days(site, days, config):
    """Return the number of days to use as cleanup threshold for a site.

    The default comes from ``config['cleanup'][site]['days']`` (a missing
    key is passed on to the caller as ``KeyError``). A value given on the
    command line that is smaller than the default is only accepted after
    an interactive confirmation.

    :param str site: site to be cleaned, used as key in the 'cleanup' config
    :param int days: number of days given on the command line, falsy if not given
    :param dict config: config file parsed and saved as dictionary
    :returns: days to use as threshold, or None if the user declined
    """
    days_by_default = config['cleanup'][site]['days']

    if not days:
        chosen = days_by_default
    elif days >= days_by_default:
        chosen = days
    else:
        accepted = misc.query_yes_no("Seems like given days({}) is less than the "
                                     " default({}), are you sure to proceed ?"
                                     .format(days, days_by_default), default="no")
        chosen = days if accepted else None
    return chosen
def cleanup_irma(days_fastq, days_analysis, only_fastq, only_analysis, clean_undetermined, status_db_config, exclude_projects, list_only, date, dry_run=False):
    """Remove fastq/analysis data for projects that have been closed more than given
    days (as days_fastq/days_analysis) from the given 'irma' cluster

    :param int days_fastq: Days to consider to remove fastq files for project
    :param int days_analysis: Days to consider to remove analysis data for project
    :param bool only_fastq: Remove only fastq files for closed projects
    :param bool only_analysis: Remove only analysis data for closed projects
    :param bool clean_undetermined: Only look for undetermined files in flowcells that
                                    have no uncleaned project left, offer to delete
                                    them and return
    :param status_db_config: Passed as 'conf' to statusdb.ProjectSummaryConnection
    :param str exclude_projects: Path to a file with one project name/id per line, or a
                                 comma separated string of project names/ids, that should
                                 be left out of the cleanup
    :param bool list_only: Print the compiled project list and exit
    :param str date: Date as 'YYYY-MM-DD', parsed and handed to get_closed_proj_info
                     (presumably the reference date for 'closed_days' - confirm)
    :param bool dry_run: Will summarize what is going to be done without really doing it,
                         no file is removed and nothing is marked as cleaned
    :raises SystemExit: When a config key is missing, the date is malformed, an excluded
                        project is unknown, or after printing the list for 'list_only'

    Example format for config file
    cleanup:
        irma:
            flowcell:
                ##this path is nothing but incoming directory, can given multiple paths
                root:
                    - path/to/flowcells_dir
                relative_project_source: Demultiplexing
                undet_file_pattern: "Undetermined_*.fastq.gz"
            ##this is path where projects are organized
            data_dir: path/to/data_dir
            analysis:
                ##directory where analysis are performed for projects
                root: path/to/analysis_dir
                #should be exactly same as the qc folder name and files wished to be removed
                files_to_remove:
                    piper_ngi:
                        - "*.bam"
    """
    try:
        config = CONFIG['cleanup']['irma']
        flowcell_dir_root = config['flowcell']['root']
        flowcell_project_source = config['flowcell']['relative_project_source']
        flowcell_undet_files = config['flowcell']['undet_file_pattern']
        data_dir = config['data_dir']
        analysis_dir = config['analysis']['root']
        analysis_data_to_remove = config['analysis']['files_to_remove']
        if date:
            date = datetime.strptime(date, '%Y-%m-%d')
    except KeyError as e:
        logger.error("Config file is missing the key {}, make sure it have all required information".format(str(e)))
        raise SystemExit
    except ValueError:
        logger.error("Date given with '--date' option is not in required format, see help for more info")
        raise SystemExit

    # make a connection for project db
    pcon = statusdb.ProjectSummaryConnection(conf=status_db_config)
    assert pcon, "Could not connect to project database in StatusDB"

    # make exclude project list if provided, blank entries and surrounding
    # whitespace are dropped so they are not reported as invalid projects
    exclude_list = []
    if exclude_projects:
        if os.path.isfile(exclude_projects):
            with open(exclude_projects, 'r') as in_file:
                exclude_list.extend([p.strip() for p in in_file if p.strip()])
        else:
            exclude_list.extend([p.strip() for p in exclude_projects.split(',') if p.strip()])
        # sanity check that the projects mentioned to exclude are valid
        known_ids = set(pcon.id_view.keys())
        known_names = set(pcon.name_view.keys())
        invalid_projects = [p for p in exclude_list if p not in known_ids and p not in known_names]
        if invalid_projects:
            logger.error("'--exclude_projects' was called with some invalid projects '{}', "
                         "provide valid project name/id".format(",".join(invalid_projects)))
            raise SystemExit

    # one pattern for project directories, shared by the undetermined and the
    # fastq branches so that both agree on what an uncleaned project is
    project_dir_re = re.compile(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$')

    def uncleaned_projects_in_fc():
        # relative to the flowcell directory, must be called inside filesystem.chdir
        return [d for d in os.listdir(flowcell_project_source)
                if project_dir_re.match(d) and
                not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]

    # compile list for project to delete
    project_clean_list, processed_projects = ({}, set())
    if not list_only and not clean_undetermined:
        logger.info("Building initial project list for removing data..")
    if only_fastq:
        logger.info("Option 'only_fastq' is given, so will not look for analysis data")
    elif only_analysis:
        logger.info("Option 'only_analysis' is given, so will not look for fastq data")

    if clean_undetermined:
        all_undet_files = []
        for flowcell_dir in flowcell_dir_root:
            for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    if not os.path.exists(flowcell_project_source):
                        logger.warn("Flowcell {} do not contain a '{}' direcotry".format(fc, flowcell_project_source))
                        continue
                    # undetermined files can go only when the flowcell has no project
                    # directory at all or every project directory is already cleaned
                    if uncleaned_projects_in_fc():
                        continue
                    fc_undet_files = glob(os.path.join(flowcell_project_source, flowcell_undet_files))
                    if fc_undet_files:
                        logger.info("All projects was cleaned for FC {}, found {} undeterminded files".format(fc, len(fc_undet_files)))
                        # make the paths absolute while still inside the flowcell directory
                        all_undet_files.extend(map(os.path.abspath, fc_undet_files))
        if all_undet_files:
            undet_size = _def_get_size_unit(sum(map(os.path.getsize, all_undet_files)))
            if misc.query_yes_no("In total found {} undetermined files which are {} in size, delete now ?".format(len(all_undet_files), undet_size), default="no"):
                if dry_run:
                    logger.info("Dry run, will not remove the undetermined files")
                else:
                    _remove_files(all_undet_files)
        return
    elif only_analysis:
        for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and
                    not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))]:
            proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True), date)
            if proj_info and proj_info['closed_days'] >= days_analysis:
                # move on if this project has to be excluded
                if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
                    continue
                analysis_data, analysis_size = collect_analysis_data_irma(pid, analysis_dir, analysis_data_to_remove)
                proj_info['analysis_to_remove'] = analysis_data
                proj_info['analysis_size'] = analysis_size
                proj_info['fastq_to_remove'] = "not_selected"
                proj_info['fastq_size'] = 0
                project_clean_list[proj_info['name']] = proj_info
    else:
        for flowcell_dir in flowcell_dir_root:
            for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    if not os.path.exists(flowcell_project_source):
                        logger.warn("Flowcell {} do not contain a '{}' direcotry".format(fc, flowcell_project_source))
                        continue
                    for _proj in uncleaned_projects_in_fc():
                        # directory names use '_' where the project name has its first '.'
                        proj = re.sub(r'_+', '.', _proj, 1)
                        # if a project is already processed no need of fetching it again from status db
                        if proj in processed_projects:
                            # if the project is closed more than threshold days collect the fastq files from FC
                            # no need of looking for analysis data as they would have been collected in the first time
                            if proj in project_clean_list and project_clean_list[proj]['closed_days'] >= days_fastq:
                                fc_fq_files, fq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj))
                                project_clean_list[proj]['fastq_to_remove']['flowcells'][fc] = fc_fq_files['flowcells'][fc]
                                project_clean_list[proj]['fastq_size'] += fq_size
                            continue
                        processed_projects.add(proj)
                        # by default assume all projects are not old enough for delete
                        fastq_data, analysis_data = ("young", "young")
                        fastq_size, analysis_size = (0, 0)
                        proj_info = get_closed_proj_info(proj, pcon.get_entry(proj), date)
                        if not proj_info:
                            continue
                        # move on if this project has to be excluded
                        if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
                            continue
                        # collect fastq files only if the project is old enough for it
                        if proj_info['closed_days'] >= days_fastq:
                            fastq_data, fastq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj),
                                                                             data_dir, proj_info['pid'])
                        if not only_fastq:
                            # if project is old enough for analysis data try to collect analysis files
                            if proj_info['closed_days'] >= days_analysis:
                                analysis_data, analysis_size = collect_analysis_data_irma(proj_info['pid'], analysis_dir, analysis_data_to_remove)
                            # if both fastq and analysis files are not old enough move on
                            if (analysis_data == fastq_data) or ((not analysis_data or analysis_data == "cleaned") and fastq_data == "young"):
                                continue
                        elif fastq_data == "young":
                            continue
                        else:
                            analysis_data = "not_selected"
                        proj_info['fastq_to_remove'] = fastq_data
                        proj_info['fastq_size'] = fastq_size
                        proj_info['analysis_to_remove'] = analysis_data
                        proj_info['analysis_size'] = analysis_size
                        project_clean_list[proj] = proj_info

    if not project_clean_list:
        logger.info("There are no projects to clean")
        return

    # list only the project and exit if 'list_only' option is selected
    if list_only:
        # header order follows the row order (name first, then id)
        print("Project Name\tProject ID\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size")
        for p_info in sorted(project_clean_list.values(), key=lambda d: d['closed_days'], reverse=True):
            print("\t".join([p_info['name'], p_info['pid'], p_info['bioinfo_responsible'],
                             str(p_info['closed_days']), p_info['closed_date'],
                             _def_get_size_unit(p_info['fastq_size']),
                             _def_get_size_unit(p_info['analysis_size'])]))
        raise SystemExit

    logger.info("Initial list is built with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
    if misc.query_yes_no("Interactively filter projects for cleanup ?", default="yes"):
        filtered_project = []
        total_projects = len(project_clean_list)
        # go through compiled project list and ask for each project
        for proj_count, (proj, info) in enumerate(project_clean_list.items(), 1):
            if not misc.query_yes_no("{}Delete files for this project ({}/{})".format(get_proj_meta_info(info, days_fastq),
                                     proj_count, total_projects), default="no"):
                logger.info("Will not remove files for project {}".format(proj))
                filtered_project.append(proj)
        # remove projects that were decided not to delete
        for proj in filtered_project:
            project_clean_list.pop(proj)
        logger.info("Removed {}/{} projects from initial list".format(len(filtered_project), total_projects))
        if not project_clean_list:
            logger.info("There are no projects to clean after filtering")
            return

    # always ask for a final confirmation before anything is deleted
    logger.info("Final list is created with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
    if not misc.query_yes_no("Proceed with cleanup ?", default="no"):
        logger.info("Aborting cleanup")
        return

    logger.info("Will start cleaning up project now")
    for proj, info in project_clean_list.items():
        # 'fastq_to_remove' is a dict only when fastq files were collected,
        # otherwise it holds a marker string such as "young"/"not_selected"
        fastq_info = info.get('fastq_to_remove')
        if fastq_info and isinstance(fastq_info, dict):
            logger.info("Cleaning fastq files for project {}".format(proj))
            fastq_fc = fastq_info.get('flowcells', {})
            removed_fc = []
            for fc, fc_info in fastq_fc.items():
                proj_fc_root = fc_info['proj_root']
                logger.info("Removing fastq files from {}".format(proj_fc_root))
                if not dry_run and _remove_files(fc_info['fq_files']):
                    logger.info("Removed fastq files from FC {} for project {}, marking it as cleaned".format(fc, proj))
                    _touch_cleaned(proj_fc_root)
                    removed_fc.append(fc)
            # never mark anything as cleaned during a dry run
            if not dry_run and len(fastq_fc) == len(removed_fc):
                try:
                    proj_data_root = fastq_info['proj_data']['proj_data_root']
                    logger.info("All flowcells cleaned for this project, marking it as cleaned in {}".format(proj_data_root))
                    _touch_cleaned(proj_data_root)
                except Exception as e:
                    # marking the data directory is best effort, report instead of hiding it
                    logger.warn("Could not mark the data directory of project {} as cleaned: {}".format(proj, e))

        analysis_info = info.get('analysis_to_remove')
        if analysis_info and isinstance(analysis_info, dict):
            proj_analysis_root = analysis_info['proj_analysis_root']
            logger.info("cleaning analysis data for project {}".format(proj))
            removed_qc = []
            for qc, files in analysis_info['analysis_files'].items():
                logger.info("Removing files of '{}' from {}".format(qc, proj_analysis_root))
                if not dry_run:
                    if _remove_files(files):
                        removed_qc.append(qc)
                    else:
                        logger.warn("Couldn't remove some files in qc directory '{}'".format(qc))
            for qc in removed_qc:
                analysis_info['analysis_files'].pop(qc)
            # mark as cleaned only when every qc directory was emptied
            if not dry_run and not analysis_info['analysis_files']:
                logger.info("Removed analysis data for project {}, marking it cleaned".format(proj))
                _touch_cleaned(proj_analysis_root)
def cleanup_irma(days_fastq, days_analysis, only_fastq, only_analysis, clean_undetermined, status_db_config, exclude_projects, list_only, date, dry_run=False):
    """Remove fastq/analysis data for projects that have been closed more than given
    days (as days_fastq/days_analysis) from the given 'irma' cluster

    :param int days_fastq: Days to consider to remove fastq files for project
    :param int days_analysis: Days to consider to remove analysis data for project
    :param bool only_fastq: Remove only fastq files for closed projects
    :param bool only_analysis: Remove only analysis data for closed projects
    :param bool clean_undetermined: Only look for undetermined files in flowcells that
                                    have no uncleaned project left, offer to delete
                                    them and return
    :param status_db_config: Passed as 'conf' to statusdb.ProjectSummaryConnection
    :param str exclude_projects: Path to a file with one project name/id per line, or a
                                 comma separated string of project names/ids, that should
                                 be left out of the cleanup
    :param bool list_only: Print the compiled project list and exit
    :param str date: Date as 'YYYY-MM-DD', parsed and handed to get_closed_proj_info
                     (presumably the reference date for 'closed_days' - confirm)
    :param bool dry_run: Will summarize what is going to be done without really doing it,
                         no file is removed and nothing is marked as cleaned
    :raises SystemExit: When a config key is missing, the date is malformed, an excluded
                        project is unknown, or after printing the list for 'list_only'

    Example format for config file
    cleanup:
        irma:
            flowcell:
                ##this path is nothing but incoming directory, can given multiple paths
                root:
                    - path/to/flowcells_dir
                relative_project_source: Demultiplexing
                undet_file_pattern: "Undetermined_*.fastq.gz"
            ##this is path where projects are organized
            data_dir: path/to/data_dir
            analysis:
                ##directory where analysis are performed for projects
                root: path/to/analysis_dir
                #should be exactly same as the qc folder name and files wished to be removed
                files_to_remove:
                    piper_ngi:
                        - "*.bam"
    """
    try:
        config = CONFIG['cleanup']['irma']
        flowcell_dir_root = config['flowcell']['root']
        flowcell_project_source = config['flowcell']['relative_project_source']
        flowcell_undet_files = config['flowcell']['undet_file_pattern']
        data_dir = config['data_dir']
        analysis_dir = config['analysis']['root']
        analysis_data_to_remove = config['analysis']['files_to_remove']
        if date:
            date = datetime.strptime(date, '%Y-%m-%d')
    except KeyError as e:
        logger.error("Config file is missing the key {}, make sure it have all required information".format(str(e)))
        raise SystemExit
    except ValueError:
        logger.error("Date given with '--date' option is not in required format, see help for more info")
        raise SystemExit

    # make a connection for project db
    pcon = statusdb.ProjectSummaryConnection(conf=status_db_config)
    assert pcon, "Could not connect to project database in StatusDB"

    # make exclude project list if provided, blank entries and surrounding
    # whitespace are dropped so they are not reported as invalid projects
    exclude_list = []
    if exclude_projects:
        if os.path.isfile(exclude_projects):
            with open(exclude_projects, 'r') as in_file:
                exclude_list.extend([p.strip() for p in in_file if p.strip()])
        else:
            exclude_list.extend([p.strip() for p in exclude_projects.split(',') if p.strip()])
        # sanity check that the projects mentioned to exclude are valid
        known_ids = set(pcon.id_view.keys())
        known_names = set(pcon.name_view.keys())
        invalid_projects = [p for p in exclude_list if p not in known_ids and p not in known_names]
        if invalid_projects:
            logger.error("'--exclude_projects' was called with some invalid projects '{}', "
                         "provide valid project name/id".format(",".join(invalid_projects)))
            raise SystemExit

    # one pattern for project directories, shared by the undetermined and the
    # fastq branches so that both agree on what an uncleaned project is
    project_dir_re = re.compile(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$')

    def uncleaned_projects_in_fc():
        # relative to the flowcell directory, must be called inside filesystem.chdir
        return [d for d in os.listdir(flowcell_project_source)
                if project_dir_re.match(d) and
                not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]

    # compile list for project to delete
    project_clean_list, processed_projects = ({}, set())
    if not list_only and not clean_undetermined:
        logger.info("Building initial project list for removing data..")
    if only_fastq:
        logger.info("Option 'only_fastq' is given, so will not look for analysis data")
    elif only_analysis:
        logger.info("Option 'only_analysis' is given, so will not look for fastq data")

    if clean_undetermined:
        all_undet_files = []
        for flowcell_dir in flowcell_dir_root:
            for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    if not os.path.exists(flowcell_project_source):
                        logger.warn("Flowcell {} do not contain a '{}' direcotry".format(fc, flowcell_project_source))
                        continue
                    # undetermined files can go only when the flowcell has no project
                    # directory at all or every project directory is already cleaned
                    if uncleaned_projects_in_fc():
                        continue
                    fc_undet_files = glob(os.path.join(flowcell_project_source, flowcell_undet_files))
                    if fc_undet_files:
                        logger.info("All projects was cleaned for FC {}, found {} undeterminded files".format(fc, len(fc_undet_files)))
                        # make the paths absolute while still inside the flowcell directory
                        all_undet_files.extend(map(os.path.abspath, fc_undet_files))
        if all_undet_files:
            undet_size = _def_get_size_unit(sum(map(os.path.getsize, all_undet_files)))
            if misc.query_yes_no("In total found {} undetermined files which are {} in size, delete now ?".format(len(all_undet_files), undet_size), default="no"):
                if dry_run:
                    logger.info("Dry run, will not remove the undetermined files")
                else:
                    _remove_files(all_undet_files)
        return
    elif only_analysis:
        for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and
                    not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))]:
            proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True), date)
            if proj_info and proj_info['closed_days'] >= days_analysis:
                # move on if this project has to be excluded
                if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
                    continue
                analysis_data, analysis_size = collect_analysis_data_irma(pid, analysis_dir, analysis_data_to_remove)
                proj_info['analysis_to_remove'] = analysis_data
                proj_info['analysis_size'] = analysis_size
                proj_info['fastq_to_remove'] = "not_selected"
                proj_info['fastq_size'] = 0
                project_clean_list[proj_info['name']] = proj_info
    else:
        for flowcell_dir in flowcell_dir_root:
            for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    if not os.path.exists(flowcell_project_source):
                        logger.warn("Flowcell {} do not contain a '{}' direcotry".format(fc, flowcell_project_source))
                        continue
                    for _proj in uncleaned_projects_in_fc():
                        # directory names use '_' where the project name has its first '.'
                        proj = re.sub(r'_+', '.', _proj, 1)
                        # if a project is already processed no need of fetching it again from status db
                        if proj in processed_projects:
                            # if the project is closed more than threshold days collect the fastq files from FC
                            # no need of looking for analysis data as they would have been collected in the first time
                            if proj in project_clean_list and project_clean_list[proj]['closed_days'] >= days_fastq:
                                fc_fq_files, fq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj))
                                project_clean_list[proj]['fastq_to_remove']['flowcells'][fc] = fc_fq_files['flowcells'][fc]
                                project_clean_list[proj]['fastq_size'] += fq_size
                            continue
                        processed_projects.add(proj)
                        # by default assume all projects are not old enough for delete
                        fastq_data, analysis_data = ("young", "young")
                        fastq_size, analysis_size = (0, 0)
                        proj_info = get_closed_proj_info(proj, pcon.get_entry(proj), date)
                        if not proj_info:
                            continue
                        # move on if this project has to be excluded
                        if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
                            continue
                        # collect fastq files only if the project is old enough for it
                        if proj_info['closed_days'] >= days_fastq:
                            fastq_data, fastq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj),
                                                                             data_dir, proj_info['pid'])
                        if not only_fastq:
                            # if project is old enough for analysis data try to collect analysis files
                            if proj_info['closed_days'] >= days_analysis:
                                analysis_data, analysis_size = collect_analysis_data_irma(proj_info['pid'], analysis_dir, analysis_data_to_remove)
                            # if both fastq and analysis files are not old enough move on
                            if (analysis_data == fastq_data) or ((not analysis_data or analysis_data == "cleaned") and fastq_data == "young"):
                                continue
                        elif fastq_data == "young":
                            continue
                        else:
                            analysis_data = "not_selected"
                        proj_info['fastq_to_remove'] = fastq_data
                        proj_info['fastq_size'] = fastq_size
                        proj_info['analysis_to_remove'] = analysis_data
                        proj_info['analysis_size'] = analysis_size
                        project_clean_list[proj] = proj_info

    if not project_clean_list:
        logger.info("There are no projects to clean")
        return

    # list only the project and exit if 'list_only' option is selected
    if list_only:
        # header order follows the row order (name first, then id)
        print("Project Name\tProject ID\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size")
        for p_info in sorted(project_clean_list.values(), key=lambda d: d['closed_days'], reverse=True):
            print("\t".join([p_info['name'], p_info['pid'], p_info['bioinfo_responsible'],
                             str(p_info['closed_days']), p_info['closed_date'],
                             _def_get_size_unit(p_info['fastq_size']),
                             _def_get_size_unit(p_info['analysis_size'])]))
        raise SystemExit

    logger.info("Initial list is built with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
    if misc.query_yes_no("Interactively filter projects for cleanup ?", default="yes"):
        filtered_project = []
        total_projects = len(project_clean_list)
        # go through compiled project list and ask for each project
        for proj_count, (proj, info) in enumerate(project_clean_list.items(), 1):
            if not misc.query_yes_no("{}Delete files for this project ({}/{})".format(get_proj_meta_info(info, days_fastq),
                                     proj_count, total_projects), default="no"):
                logger.info("Will not remove files for project {}".format(proj))
                filtered_project.append(proj)
        # remove projects that were decided not to delete
        for proj in filtered_project:
            project_clean_list.pop(proj)
        logger.info("Removed {}/{} projects from initial list".format(len(filtered_project), total_projects))
        if not project_clean_list:
            logger.info("There are no projects to clean after filtering")
            return

    # always ask for a final confirmation before anything is deleted
    logger.info("Final list is created with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
    if not misc.query_yes_no("Proceed with cleanup ?", default="no"):
        logger.info("Aborting cleanup")
        return

    logger.info("Will start cleaning up project now")
    for proj, info in project_clean_list.items():
        # 'fastq_to_remove' is a dict only when fastq files were collected,
        # otherwise it holds a marker string such as "young"/"not_selected"
        fastq_info = info.get('fastq_to_remove')
        if fastq_info and isinstance(fastq_info, dict):
            logger.info("Cleaning fastq files for project {}".format(proj))
            fastq_fc = fastq_info.get('flowcells', {})
            removed_fc = []
            for fc, fc_info in fastq_fc.items():
                proj_fc_root = fc_info['proj_root']
                logger.info("Removing fastq files from {}".format(proj_fc_root))
                if not dry_run and _remove_files(fc_info['fq_files']):
                    logger.info("Removed fastq files from FC {} for project {}, marking it as cleaned".format(fc, proj))
                    _touch_cleaned(proj_fc_root)
                    removed_fc.append(fc)
            # never mark anything as cleaned during a dry run
            if not dry_run and len(fastq_fc) == len(removed_fc):
                try:
                    proj_data_root = fastq_info['proj_data']['proj_data_root']
                    logger.info("All flowcells cleaned for this project, marking it as cleaned in {}".format(proj_data_root))
                    _touch_cleaned(proj_data_root)
                except Exception as e:
                    # marking the data directory is best effort, report instead of hiding it
                    logger.warn("Could not mark the data directory of project {} as cleaned: {}".format(proj, e))

        analysis_info = info.get('analysis_to_remove')
        if analysis_info and isinstance(analysis_info, dict):
            proj_analysis_root = analysis_info['proj_analysis_root']
            logger.info("cleaning analysis data for project {}".format(proj))
            removed_qc = []
            for qc, files in analysis_info['analysis_files'].items():
                logger.info("Removing files of '{}' from {}".format(qc, proj_analysis_root))
                if not dry_run:
                    if _remove_files(files):
                        removed_qc.append(qc)
                    else:
                        logger.warn("Couldn't remove some files in qc directory '{}'".format(qc))
            for qc in removed_qc:
                analysis_info['analysis_files'].pop(qc)
            # mark as cleaned only when every qc directory was emptied
            if not dry_run and not analysis_info['analysis_files']:
                logger.info("Removed analysis data for project {}, marking it cleaned".format(proj))
                _touch_cleaned(proj_analysis_root)
def cleanup_irma(days_fastq, days_analysis, only_fastq, only_analysis, status_db_config, dry_run=False):
    """Remove fastq/analysis data for projects that have been closed more than given
    days (as days_fastq/days_analysis) from the given 'irma' cluster

    :param int days_fastq: Days to consider to remove fastq files for project
    :param int days_analysis: Days to consider to remove analysis data for project
    :param bool only_fastq: Remove only fastq files for closed projects
    :param bool only_analysis: Remove only analysis data for closed projects
    :param status_db_config: Passed as 'conf' to statusdb.ProjectSummaryConnection
    :param bool dry_run: Will summarize what is going to be done without really doing it,
                         no file is removed and nothing is marked as cleaned
    :raises SystemExit: When the config file is missing a required key

    Example format for config file
    cleanup:
        irma:
            flowcell:
                ##this path is nothing but incoming directory, can given multiple paths
                root:
                    - path/to/flowcells_dir
                relative_project_source: Demultiplexing
            ##this is path where projects are organized
            data_dir: path/to/data_dir
            analysis:
                ##directory where analysis are performed for projects
                root: path/to/analysis_dir
                #should be exactly same as the qc folder name and files wished to be removed
                files_to_remove:
                    piper_ngi:
                        - "*.bam"
    """
    try:
        config = CONFIG['cleanup']['irma']
        flowcell_dir_root = config['flowcell']['root']
        flowcell_project_source = config['flowcell']['relative_project_source']
        data_dir = config['data_dir']
        analysis_dir = config['analysis']['root']
        analysis_data_to_remove = config['analysis']['files_to_remove']
    except KeyError as e:
        logger.error("Config file is missing the key {}, make sure it have all required information".format(str(e)))
        raise SystemExit

    # make a connection for project db
    pcon = statusdb.ProjectSummaryConnection(conf=status_db_config)
    assert pcon, "Could not connect to project database in StatusDB"

    # project directories in a flowcell; digits are accepted in the name part
    # so such projects are not silently left out of the cleanup
    project_dir_re = re.compile(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$')

    # compile list for project to delete
    project_clean_list, processed_projects = ({}, set())
    logger.info("Building initial project list for removing data..")
    if only_fastq:
        logger.info("Option 'only_fastq' is given, so will not look for analysis data")
    elif only_analysis:
        logger.info("Option 'only_analysis' is given, so will not look for fastq data")

    if only_analysis:
        for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and
                    not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))]:
            proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True))
            if proj_info and proj_info['closed_days'] >= days_analysis:
                analysis_data, analysis_size = collect_analysis_data_irma(pid, analysis_dir, analysis_data_to_remove)
                proj_info['analysis_to_remove'] = analysis_data
                proj_info['analysis_size'] = analysis_size
                proj_info['fastq_to_remove'] = "not_selected"
                proj_info['fastq_size'] = 0
                project_clean_list[proj_info['name']] = proj_info
    else:
        for flowcell_dir in flowcell_dir_root:
            for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    # a flowcell without the project source directory has nothing to
                    # collect, skip it instead of failing on os.listdir
                    if not os.path.exists(flowcell_project_source):
                        logger.warn("Flowcell {} does not contain a '{}' directory".format(fc, flowcell_project_source))
                        continue
                    projects_in_fc = [d for d in os.listdir(flowcell_project_source)
                                      if project_dir_re.match(d) and
                                      not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]
                    for _proj in projects_in_fc:
                        # directory names use '_' where the project name has its first '.'
                        proj = re.sub(r'_+', '.', _proj, 1)
                        # if a project is already processed no need of fetching it again from status db
                        if proj in processed_projects:
                            # if the project is closed more than threshold days collect the fastq files from FC
                            # no need of looking for analysis data as they would have been collected in the first time
                            if proj in project_clean_list and project_clean_list[proj]['closed_days'] >= days_fastq:
                                fc_fq_files, fq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj))
                                project_clean_list[proj]['fastq_to_remove']['flowcells'][fc] = fc_fq_files['flowcells'][fc]
                                project_clean_list[proj]['fastq_size'] += fq_size
                            continue
                        processed_projects.add(proj)
                        # by default assume all projects are not old enough for delete
                        fastq_data, analysis_data = ("young", "young")
                        fastq_size, analysis_size = (0, 0)
                        proj_info = get_closed_proj_info(proj, pcon.get_entry(proj))
                        if not proj_info:
                            continue
                        # collect fastq files only if the project is old enough for it
                        if proj_info['closed_days'] >= days_fastq:
                            fastq_data, fastq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj),
                                                                             data_dir, proj_info['pid'])
                        if not only_fastq:
                            # if project is old enough for analysis data try to collect analysis files
                            if proj_info['closed_days'] >= days_analysis:
                                analysis_data, analysis_size = collect_analysis_data_irma(proj_info['pid'], analysis_dir, analysis_data_to_remove)
                            # if both fastq and analysis files are not old enough move on
                            if (analysis_data == fastq_data) or ((not analysis_data or analysis_data == "cleaned") and fastq_data == "young"):
                                continue
                        elif fastq_data == "young":
                            continue
                        else:
                            analysis_data = "not_selected"
                        proj_info['fastq_to_remove'] = fastq_data
                        proj_info['fastq_size'] = fastq_size
                        proj_info['analysis_to_remove'] = analysis_data
                        proj_info['analysis_size'] = analysis_size
                        project_clean_list[proj] = proj_info

    if not project_clean_list:
        logger.info("There are no projects to clean")
        return

    logger.info("Initial list is built with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
    if misc.query_yes_no("Interactively filter projects for cleanup ?", default="yes"):
        filtered_project = []
        total_projects = len(project_clean_list)
        # go through compiled project list and ask for each project
        for proj_count, (proj, info) in enumerate(project_clean_list.items(), 1):
            if not misc.query_yes_no("{}Delete files for this project ({}/{})".format(get_proj_meta_info(info, days_fastq),
                                     proj_count, total_projects), default="no"):
                logger.info("Will not remove files for project {}".format(proj))
                filtered_project.append(proj)
        # remove projects that were decided not to delete
        for proj in filtered_project:
            project_clean_list.pop(proj)
        logger.info("Removed {}/{} projects from initial list".format(len(filtered_project), total_projects))
        if not project_clean_list:
            logger.info("There are no projects to clean after filtering")
            return

    # always ask for a final confirmation before anything is deleted
    logger.info("Final list is created with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
    if not misc.query_yes_no("Proceed with cleanup ?", default="no"):
        logger.info("Aborting cleanup")
        return

    logger.info("Will start cleaning up project now")
    for proj, info in project_clean_list.items():
        # 'fastq_to_remove' is a dict only when fastq files were collected,
        # otherwise it holds a marker string such as "young"/"not_selected"
        fastq_info = info.get('fastq_to_remove')
        if fastq_info and isinstance(fastq_info, dict):
            logger.info("Cleaning fastq files for project {}".format(proj))
            fastq_fc = fastq_info.get('flowcells', {})
            removed_fc = []
            for fc, fc_info in fastq_fc.items():
                proj_fc_root = fc_info['proj_root']
                logger.info("Removing fastq files from {}".format(proj_fc_root))
                if not dry_run and _remove_files(fc_info['fq_files']):
                    logger.info("Removed fastq files from FC {} for project {}, marking it as cleaned".format(fc, proj))
                    _touch_cleaned(proj_fc_root)
                    removed_fc.append(fc)
            # never mark anything as cleaned during a dry run
            if not dry_run and len(fastq_fc) == len(removed_fc):
                try:
                    proj_data_root = fastq_info['proj_data']['proj_data_root']
                    logger.info("All flowcells cleaned for this project, marking it as cleaned in {}".format(proj_data_root))
                    _touch_cleaned(proj_data_root)
                except Exception as e:
                    # marking the data directory is best effort, report instead of hiding it
                    logger.warn("Could not mark the data directory of project {} as cleaned: {}".format(proj, e))

        analysis_info = info.get('analysis_to_remove')
        if analysis_info and isinstance(analysis_info, dict):
            proj_analysis_root = analysis_info['proj_analysis_root']
            logger.info("cleaning analysis data for project {}".format(proj))
            removed_qc = []
            for qc, files in analysis_info['analysis_files'].items():
                logger.info("Removing files of '{}' from {}".format(qc, proj_analysis_root))
                if not dry_run:
                    if _remove_files(files):
                        removed_qc.append(qc)
                    else:
                        logger.warn("Couldn't remove some files in qc directory '{}'".format(qc))
            for qc in removed_qc:
                analysis_info['analysis_files'].pop(qc)
            # mark as cleaned only when every qc directory was emptied
            if not dry_run and not analysis_info['analysis_files']:
                logger.info("Removed analysis data for project {}, marking it cleaned".format(proj))
                _touch_cleaned(proj_analysis_root)
def test_query_yes_no_false(self, mock_raw_input):
    """Check that misc.query_yes_no gives a falsy result.

    The answer comes from the injected 'mock_raw_input', which is set up
    outside this method (presumably patched to answer 'no' - confirm).
    """
    self.assertFalse(misc.query_yes_no('Some question'))
def test_query_yes_no_true(self, mock_raw_input):
    """Check that misc.query_yes_no gives a truthy result.

    The answer comes from the injected 'mock_raw_input', which is set up
    outside this method (presumably patched to answer 'yes' - confirm).
    """
    self.assertTrue(misc.query_yes_no('Some question'))