def check_for_missing_values(self, data):
    """Scan a parsed config dict for missing keys and empty values.

    In the 'general' section an empty value is only a warning; in every
    other (dataset) section an empty value is an error, except for
    'barcode' and 'adaptor', which are allowed to be empty (warning only).

    Args:
        data: dict of section-name -> dict of key/value config entries.

    Returns:
        (error, warn): booleans; error is True only for fatal problems.
    """
    # NOTE: unused local 'missing_key' removed; the original's two
    # near-identical passes (general, then the rest) are merged - the
    # returned flags are identical, only log-message order differs.
    error = False
    warn = False
    for section in data:
        for k, v in data[section].items():
            if not k:
                logger.warning("(key: "+section+") key for: '"+v+"' is missing or corrupt - Continuing")
                warn = True
            if section == 'general':
                # In 'general', only an explicit empty string is flagged.
                if v == '':
                    logger.warning("(key: "+section+") value of: '"+k+"' is missing or corrupt - Continuing")
                    warn = True
            elif not v:
                if k in ('barcode', 'adaptor'):  # these could be empty
                    logger.warning("(key: "+section+") value of: '"+k+"' is missing or corrupt - Continuing")
                else:
                    logger.error("(key: "+section+") value of: '"+k+"' is missing or corrupt - Continuing")
                    error = True
    return (error, warn)
def check_for_missing_values(self, data):
    """Validate a parsed config dict for missing keys and empty values.

    Section 'general': problems are warnings only. Any other section:
    an empty value is an error unless the key is 'barcode' or 'adaptor'
    (those may legitimately be empty).

    Args:
        data: dict of section-name -> dict of key/value config entries.

    Returns:
        (error, warn): booleans; error is True only for fatal problems.
    """
    # Unused local 'missing_key' and dead commented-out sys.exit removed.
    error = False
    warn = False
    # First pass: the 'general' section only (problems are warnings).
    for item in data:
        if item == 'general':
            # .items() works on both Python 2 and 3; .iteritems() is 2-only.
            for k, v in data[item].items():
                if not k:
                    logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing")
                    warn = True
                if v == '':
                    logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                    warn = True
    # Second pass: every dataset section.
    for item in data:
        if item != 'general':
            for k, v in data[item].items():
                if not k:
                    logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing")
                    warn = True
                if not v:
                    if k in ('barcode', 'adaptor'):  # these could be empty
                        logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                    else:
                        logger.error("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                        error = True
    return (error, warn)
def check_and_make_dir(self, dir_name):
    """Create dir_name (including parents), tolerating an existing directory.

    EAFP: attempt the creation and inspect any failure rather than
    pre-checking existence (avoids the check-then-create race). Any
    OSError other than "path is already a directory" is re-raised.

    Args:
        dir_name: directory path to create.

    Returns:
        dir_name, unchanged (convenient for chaining).
    """
    # Dead commented-out interactive raw_input() confirmation removed.
    try:
        os.makedirs(dir_name)
    except OSError:
        if os.path.isdir(dir_name):
            logger.warning("\nDirectory %s already exists." % (dir_name))
        else:
            # There was an error on creation, so make sure we know about it
            raise
    return dir_name
def get_input_files(self):
    """List basenames of the regular entries directly inside input_dir.

    Subdirectories are skipped (no recursion). If input_dir is not a
    directory, a warning is logged and an empty list is returned.

    Returns:
        list of file basenames (glob/filesystem order).
    """
    files_list = []
    # Single-arg print() is valid on both Python 2 and 3.
    print(self.general_config_dict['input_dir'])
    if os.path.isdir(self.general_config_dict['input_dir']):
        for infile in glob.glob(os.path.join(self.general_config_dict['input_dir'], '*')):
            if not os.path.isdir(infile):
                files_list.append(os.path.basename(infile))
    else:
        # The original had "if fasta_file: pass" here referencing an
        # undefined name, which raised NameError and made this warning
        # unreachable; removed so the branch actually works.
        logger.warning("No input directory or directory permissions problem: "+self.general_config_dict['input_dir'])
    return files_list
def check_projects_and_datasets(self, data):
    """Warn about project/dataset names that already exist in the database.

    Collects every project and dataset name from the non-'general'
    sections and looks each up; existing names produce warnings only
    (the user is asked "is this okay?"), so the error flag is never
    set True by this check. Per project, at most three existing
    datasets are reported before giving up.

    Args:
        data: dict of section-name -> dict with 'project'/'dataset' keys.

    Returns:
        (error, warn): error is always False here; warn is True if any
        name already exists.
    """
    # TODO(review): host/db are hard-coded; the sibling copy of this
    # method uses self.get_my_conn() instead - consider consolidating.
    self.my_conn = MyConnection(host='newbpcdb2.jbpc-np.mbl.edu', db="env454")
    # Unused 'project_dataset' dict and commented-out debug prints removed.
    projects = {}
    datasets = {}
    error = False
    warn = False
    for item in data:
        if item != 'general':
            datasets[data[item]['dataset']] = data[item]['project']
            projects[data[item]['project']] = 1
    for p in projects:
        # NOTE(review): values are interpolated straight into SQL. Config
        # input is presumably trusted, but parameterized queries would be
        # safer if MyConnection supports them - confirm.
        my_sql = """SELECT project FROM project WHERE project = '%s'""" % (p)
        res = self.my_conn.execute_fetch_select(my_sql)
        if res:
            logger.warning("project '"+p+"' already exists in the database - is this okay?")
            warn = True
        else:
            logger.debug("project '"+p+"' is new")
        ds_found_count = 0
        for d in datasets:
            if datasets[d] == p:
                my_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (d)
                res = self.my_conn.execute_fetch_select(my_sql)
                if res:
                    ds_found_count += 1
                    if ds_found_count > 3:
                        logger.warning("\t\tPossibly more .... - Exiting after just three")
                        break
                    logger.warning("\tdataset '"+d+"' already exists in the database - is this okay?")
                    warn = True
                else:
                    logger.debug("\tdataset '"+d+"' is new")
    logger.debug("\tDataset Count: "+str(len(datasets)))
    return (error, warn)
def check_projects_and_datasets(self, data):
    """Report whether configured project/dataset names already exist.

    Every project and dataset name from the non-'general' sections is
    looked up in the database. Existing names only produce warnings
    ("is this okay?"), so the error flag stays False; per project at
    most three existing datasets are reported before stopping.

    Args:
        data: dict of section-name -> dict with 'project'/'dataset' keys.

    Returns:
        (error, warn): error is always False for this check.
    """
    self.get_my_conn()
    sections = [data[key] for key in data if key != 'general']
    datasets = {sec['dataset']: sec['project'] for sec in sections}
    projects = {sec['project']: 1 for sec in sections}
    error = False
    warn = False
    for proj in projects:
        query = """SELECT project FROM project WHERE project = '%s'""" % (proj)
        if self.my_conn.execute_fetch_select(query):
            logger.warning("project '"+proj+"' already exists in the database - is this okay?")
            warn = True
        else:
            logger.debug("project '"+proj+"' is new")
        hits = 0
        for ds_name, owner in datasets.items():
            if owner != proj:
                continue
            query = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (ds_name)
            if self.my_conn.execute_fetch_select(query):
                hits += 1
                if hits > 3:
                    logger.warning("\t\tPossibly more .... - Exiting after just three")
                    break
                logger.warning("\tdataset '"+ds_name+"' already exists in the database - is this okay?")
                warn = True
            else:
                logger.debug("\tdataset '"+ds_name+"' is new")
    logger.debug("\tDataset Count: "+str(len(datasets)))
    return (error, warn)
def gather_files_per_key(self, key):
    """Collect per-key GAST file paths plus the dataset sequence count.

    Builds a dict of input and to-be-created output file paths under the
    key's gast directory, then counts sequences with 'grep -c >' on the
    appropriate fasta file.

    Args:
        key: dataset key naming the subdirectory under global_gast_dir.

    Returns:
        (file_collector, dataset_count, out_gast_dir) where
        dataset_count is grep's stripped stdout, or 0 if grep failed.
    """
    file_collector = {}
    out_gast_dir = os.path.join(self.global_gast_dir, key)  # directory
    file_collector['gast_concat_file'] = os.path.join(out_gast_dir, 'gast_concat')
    file_collector['tagtax_file'] = os.path.join(out_gast_dir, 'tagtax_terse')
    if not os.path.exists(file_collector['gast_concat_file']):
        logger.warning("Could not find gast_concat_file file: "+file_collector['gast_concat_file'])
    if not os.path.exists(file_collector['tagtax_file']):
        logger.warning("Could not find tagtax_file file: "+file_collector['tagtax_file'])

    if self.runobj.vamps_user_upload:
        file_collector['unique_file'] = os.path.join(out_gast_dir, 'unique.fa')
        file_collector['original_fa_file'] = os.path.join(out_gast_dir, 'fasta.fa')
        if self.runobj.fasta_file:
            grep_cmd = ['grep', '-c', '>', self.runobj.fasta_file]
        else:
            grep_cmd = ['grep', '-c', '>', file_collector['unique_file']]
    else:
        if self.runobj.platform == 'illumina':
            # NOTE(review): 'dirs' is not defined in this method - presumably
            # a module-level object, or it should be self.dirs; confirm.
            reads_dir = dirs.check_dir(dirs.reads_overlap_dir)
            file_prefix = self.runobj.samples[key].file_prefix
            file_collector['unique_file'] = os.path.join(reads_dir, file_prefix+"-PERFECT_reads.fa.unique")
            # ANNA What is the correct file here:
            file_collector['original_fa_file'] = os.path.join(reads_dir, file_prefix+"-PERFECT_reads.fa.unique")
            grep_cmd = ['grep', '-c', '>', file_collector['unique_file']]
        elif self.runobj.platform == '454':
            # 454 path sets neither unique_file nor grep_cmd; the lookup
            # below will fail for it (pre-existing behavior - TODO fix).
            pass
        else:
            sys.exit("no usable platform found")

    if not os.path.exists(file_collector['unique_file']):
        logger.error("Could not find unique_file: "+file_collector['unique_file'])

    # The dataset_count should come from the non-unique file, but if we
    # don't have that we must use the uniques.
    try:
        dataset_count = subprocess.check_output(grep_cmd).strip()
    except (subprocess.CalledProcessError, OSError):
        # Was a bare 'except:'; narrowed so unrelated bugs still surface.
        dataset_count = 0
    # str()-concatenation prints identically on Python 2 and 3.
    print(str(key) + " : Sequence Count " + str(dataset_count))

    # Output files to be created downstream:
    file_collector['taxes_file'] = os.path.join(out_gast_dir, 'vamps_data_cube_uploads.txt')
    file_collector['summed_taxes_file'] = os.path.join(out_gast_dir, 'vamps_junk_data_cube_pipe.txt')
    file_collector['distinct_taxes_file'] = os.path.join(out_gast_dir, 'vamps_taxonomy_pipe.txt')
    file_collector['sequences_file'] = os.path.join(out_gast_dir, 'vamps_sequences_pipe.txt')
    file_collector['export_file'] = os.path.join(out_gast_dir, 'vamps_export_pipe.txt')
    file_collector['projects_datasets_file'] = os.path.join(out_gast_dir, 'vamps_projects_datasets_pipe.txt')
    file_collector['project_info_file'] = os.path.join(out_gast_dir, 'vamps_projects_info_pipe.txt')
    return (file_collector, dataset_count, out_gast_dir)
def validate_illumina_ini(self, analysis_dir):
    """Validate the ini-type config file (may have been converted from csv).

    The csv headers are checked earlier. Runs every per-section check,
    then sanity-checks the input directory/files. Exits the process on
    fatal problems; returns a warning banner (or '') otherwise.

    Args:
        analysis_dir: directory where the run's .ini file lives.

    Returns:
        msg: warning banner string, or '' if validation was clean.
    """
    # Single-arg print() is valid on both Python 2 and 3.
    print("Validating ini type Config File (may have been converted from csv)")
    new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini')
    print("New ini file location: " + new_ini_file)
    msg = ''
    error = False
    warn = False
    # configPath here is the new configPath
    self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath'])

    # Deduplicated: the original repeated this error/warn latch six times.
    # (Unused 'return_code' local also removed.)
    checks = (
        self.check_for_missing_values,
        self.check_for_datasets,
        self.check_domain_suite_region,
        self.check_project_name,
        self.check_dataset_name,
        self.check_projects_and_datasets,
    )
    for check in checks:
        (error_code, warn_code) = check(self.data_object)
        if error_code:
            error = True
        if warn_code:
            warn = True

    general = self.data_object['general']
    if 'input_dir' not in general and 'input_files' not in general:
        logger.warning("No input directory and no input files")
        warn = True
    elif not os.path.isdir(general.get('input_dir', '')):
        # .get() so a config with input_files but no input_dir reports a
        # clean error instead of the original KeyError crash.
        logger.error("That is not a directory: " + str(general.get('input_dir')))
        error = True
    elif general.get('input_file_format') == 'fastq' and general.get('platform') == 'illumina':
        file_exists = False
        for dirname, dirnames, filenames in os.walk(general['input_dir']):
            if any(os.path.isfile(os.path.join(dirname, fn)) for fn in filenames):
                file_exists = True
                break  # original only broke the inner loop and kept walking
        if not file_exists:
            logger.error("There are no files found in the input directory: " + general['input_dir'])
            error = True
    elif 'input_dir' in general and general['input_dir'] and ('input_files' not in general or not general['input_files']):
        logger.error("There are no files found in the input directory: " + general['input_dir'])
        error = True

    if error:
        sys.exit("""\n\t\033[91mTHERE WERE SEVERE PROBLEMS WITH THE CSV and/or CONFIG FILE - EXITING
        PLEASE CORRECT THEM AND START OVER.\033[0m\n
        To view the errors add ' --loglevel info' to the command line.\n""")
    elif warn:
        msg = """\n\t\033[93mTHERE WERE NON-FATAL PROBLEMS WITH THE CSV and/or CONFIG FILE THAT MAY OR MAY NOT CAUSE PROBLEMS.\033[0m\n
        To view the warnings add ' --loglevel warning' to the command line.\n"""
        print("\033[92mCSV File Passed Validation! (with warnings)\033[0m")
    else:
        print("\033[92mCSV File Passed Validation!\033[0m")
    return msg
def validate_illumina_ini(self, analysis_dir):
    """Validate the ini-type config file (may have been converted from csv).

    The csv headers are checked earlier. Runs every per-section check,
    then sanity-checks the input directory/files. Exits the process on
    fatal problems; returns a warning banner (or '') otherwise.

    Args:
        analysis_dir: directory where the run's .ini file lives.

    Returns:
        msg: warning banner string, or '' if validation was clean.
    """
    print("Validating ini type Config File (may have been converted from csv)")
    new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini')
    print("New ini file location: "+new_ini_file)
    msg = ''
    error = False
    warn = False
    # configPath here is the new configPath
    self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath'])

    # Deduplicated: the original repeated this error/warn latch six times.
    # (Unused 'return_code' local also removed.)
    checks = (
        self.check_for_missing_values,
        self.check_for_datasets,
        self.check_domain_suite_region,
        self.check_project_name,
        self.check_dataset_name,
        self.check_projects_and_datasets,
    )
    for check in checks:
        (error_code, warn_code) = check(self.data_object)
        if error_code:
            error = True
        if warn_code:
            warn = True

    general = self.data_object['general']
    if 'input_dir' not in general and 'input_files' not in general:
        logger.warning("No input directory and no input files")
        warn = True
    elif not os.path.isdir(general.get('input_dir', '')):
        # .get() so a config with input_files but no input_dir reports a
        # clean error instead of the original KeyError crash.
        logger.error("That is not a directory: " + str(general.get('input_dir')))
        error = True
    elif general.get('input_file_format') == 'fastq' and general.get('platform') in C.illumina_list:
        file_exists = False
        for dirname, dirnames, filenames in os.walk(general['input_dir']):
            if any(os.path.isfile(os.path.join(dirname, fn)) for fn in filenames):
                file_exists = True
                break  # original only broke the inner loop and kept walking
    if not file_exists:
            logger.error("There are no files found in the input directory: " + general['input_dir'])
            error = True
    elif 'input_dir' in general and general['input_dir'] and ('input_files' not in general or not general['input_files']):
        logger.error("There are no files found in the input directory: " + general['input_dir'])
        error = True

    if error:
        sys.exit("""\n\t\033[91mTHERE WERE SEVERE PROBLEMS WITH THE CSV and/or CONFIG FILE - EXITING
        PLEASE CORRECT THEM AND START OVER.\033[0m\n
        To view the errors add ' --loglevel info' to the command line.\n""")
    elif warn:
        msg = """\n\t\033[93mTHERE WERE NON-FATAL PROBLEMS WITH THE CSV and/or CONFIG FILE THAT MAY OR MAY NOT CAUSE PROBLEMS.\033[0m\n
        To view the warnings add ' --loglevel warning' to the command line.\n"""
        print("\033[92mCSV File Passed Validation! (with warnings)\033[0m")
    else:
        print("\033[92mCSV File Passed Validation!\033[0m")
    return msg