def get_fasta_file_names(self):
    """Return the full paths of the uniqued fasta files under ``self.fasta_dir``.

    ``PipelneUtils.get_all_files`` is used as a mapping from a full file path
    to a pair whose item 0 is the file base and item 1 the extension
    (presumably the ``os.path.splitext`` result -- confirm in PipelneUtils).

    A path is kept when its extension is ``.unique`` and its base either ends
    with the ``fa`` dot-component or has ``FILTERED`` as its last
    ``_``-separated component.
    """
    def is_wanted(base, extension):
        # Anything that was not uniqued is ignored outright.
        if extension != ".unique":
            return False
        if base.split(".")[-1].strip() == "fa":
            return True
        return base.split("_")[-1] == "FILTERED"

    all_files = PipelneUtils().get_all_files(self.fasta_dir)
    return [path for path in all_files
            if is_wanted(all_files[path][0], all_files[path][1])]
def get_fasta_file_names(self):
    """Return (and print) the ``*.fa.unique`` file paths under ``self.in_file_path``.

    ``PipelneUtils.get_all_files`` is used as a mapping from a full file path
    to a pair whose item 0 is the file base and item 1 the extension
    (presumably the ``os.path.splitext`` result -- confirm in PipelneUtils).
    """
    all_files = PipelneUtils().get_all_files(self.in_file_path)
    selected = []
    for path in all_files:
        base, extension = all_files[path][0], all_files[path][1]
        if extension != ".unique":
            continue
        if base.split(".")[-1].strip() != "fa":
            continue
        # Single-argument form prints the same text under Python 2 and 3.
        print(path)
        selected.append(path)
    return selected
def __init__(self, command_line_args=None, configuration_dictionary=None):
    """Store the run inputs and the reference tables taken from ``C``.

    ``command_line_args`` and ``configuration_dictionary`` are stored as
    given, without validation.
    """
    # Inputs handed over by the caller.
    self.args = command_line_args
    self.general_config_dict = configuration_dictionary

    # Reference data copied from the constants module.
    self.known_header_list = C.csv_header_list
    self.pipeline_run_items = C.pipeline_run_items
    self.primer_suites = self.convert_primer_suites(C.primer_suites)
    self.dna_regions = C.dna_regions

    # Containers filled in later; 'general' starts out as an empty section.
    self.data_object = {'general': {}}
    self.warn_msg = """\n\tThe config File seems to be okay. If the items above look correct then press 'c' to continue the pipeline\n"""
    self.res_headers = []
    self.env = {}
    self.utils = PipelneUtils()
def __init__(self, host="bpcweb7", db="test"):
    """Open a MySQL connection and cursor, logging the requested target.

    Connection options are read from ``~/.my.cnf`` on port 3306.  When
    ``self.utils.is_local()`` is true the host is forced to ``127.0.0.1``;
    in that case ``db == "env454"`` switches to port 3308 with
    ``~/.my.cnf_server``, and any other database name is replaced by
    ``test_env454``.

    Raises:
        MySQLdb.Error: re-raised after being logged through ``print_both``.
    """
    self.utils = PipelneUtils()
    self.conn = None
    self.cursor = None
    self.rows = 0
    self.new_id = None
    self.lastrowid = None
    try:
        separator = "=" * 40
        self.utils.print_both(separator)
        self.utils.print_both("host = " + str(host) + ", db = " + str(db))
        self.utils.print_both(separator)

        read_default_file = os.path.expanduser("~/.my.cnf")
        port_env = 3306
        if self.utils.is_local():
            host = "127.0.0.1"
            # NOTE(review): the original indentation was lost; this "else"
            # is read as belonging to the db check, i.e. the test database
            # is only substituted on a local machine -- confirm.
            if db == "env454":
                port_env = 3308
                read_default_file = os.path.expanduser("~/.my.cnf_server")
            else:
                db = "test_env454"

        self.conn = MySQLdb.connect(host=host, db=db,
                                    read_default_file=read_default_file,
                                    port=port_env)
        self.cursor = self.conn.cursor()
    except MySQLdb.Error as e:
        # Not every MySQLdb error carries an (errno, message) pair; fall
        # back to the plain text so the logging itself cannot raise and
        # hide the real database error.
        try:
            detail = "Error %d: %s" % (e.args[0], e.args[1])
        except (IndexError, TypeError):
            detail = "Error: %s" % (e,)
        self.utils.print_both(detail)
        raise
def __init__(self, runobj = None):
    """Set up suffixes, directories and usearch settings for chimera checking.

    For a VAMPS user upload the input directory is the trimming directory
    and ``self.idx_keys`` is read from the "new_lane_keys" entry of the
    trim-status JSON file; otherwise the input directory is the
    reads-overlap directory.  ``runobj`` must provide at least ``run_keys``,
    ``run``, ``platform`` and ``vamps_user_upload``.
    """
    self.utils = PipelneUtils()
    self.runobj = runobj
    self.run_keys = self.runobj.run_keys
    self.rundate = self.runobj.run
    # Optional attribute: the cluster is used unless runobj says otherwise.
    self.use_cluster = getattr(self.runobj, "use_cluster", True)

    self.chg_suffix = ".chg"
    self.chimeras_suffix = ".chimeras"
    self.ref_suffix = ".db"
    self.denovo_suffix = ".txt"
    self.nonchimeric_suffix = "." + C.nonchimeric_suffix  # ".nonchimeric.fa"
    self.chimeric_suffix = ".chimeric.fa"
    self.base_suffix = "unique" + self.chimeras_suffix

    # A missing or empty lane name both collapse to ''.
    lane_name = getattr(self.runobj, "lane_name", "") or ""

    is_user_upload = self.runobj.vamps_user_upload
    if is_user_upload:
        site = self.runobj.site
        dir_prefix = self.runobj.user + '_' + self.runobj.run
    else:
        site = ''
        dir_prefix = self.runobj.run
    self.dirs = Dirs(is_user_upload, dir_prefix, self.runobj.platform,
                     lane_name = lane_name, site = site)

    if is_user_upload:
        # Close the status file deterministically instead of leaking it.
        with open(self.runobj.trim_status_file_name, "r") as status_file:
            trim_status = json.load(status_file)
        self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
        self.indir = self.dirs.check_dir(self.dirs.trimming_dir)
    else:
        self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
    self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

    self.usearch_cmd = C.usearch6_cmd
    self.refdb = C.chimera_checking_refdb_6
    self.its_refdb = C.chimera_checking_its_refdb_6
    self.input_file_names = self.make_chimera_input_illumina_file_names()
def get_fasta_file_names(self):
    """Return the fasta files under ``self.fasta_dir`` that are ready for upload.

    A file is selected when its full path ends with
    ``self.nonchimeric_suffix`` or, failing that, with
    ``self.fa_unique_suffix``.  Each selected path is printed, and
    ``self.suffix_used`` is set to the suffix that matched (so it ends up
    holding the suffix of the last selected file).

    The original body had a bare ``next`` in the first branch; that is a
    no-op expression, not a ``continue``, so it was dropped without changing
    behaviour.
    """
    # Order matters: the nonchimeric suffix takes precedence.
    accepted_suffixes = (self.nonchimeric_suffix, self.fa_unique_suffix)
    all_files = PipelneUtils().get_all_files(self.fasta_dir)
    fa_files = []
    for full_name in all_files:
        for suffix in accepted_suffixes:
            if full_name.endswith(suffix):
                fa_files.append(full_name)
                print(full_name)
                self.suffix_used = suffix
                break
    return fa_files
def __init__(self, runobj = None):
    """Prepare directories, the database connection and suffixes for the upload.

    The connection always targets 'newbpcdb2.jbpc-np.mbl.edu' / "env454".
    The file named by ``self.dirs.unique_file_counts`` is deleted here.
    """
    self.utils = PipelneUtils()
    self.runobj = runobj
    self.rundate = self.runobj.run
    self.use_cluster = 1
    self.unique_fasta_files = []

    user_upload = self.runobj.vamps_user_upload
    if user_upload:
        site = self.runobj.site
        dir_prefix = self.runobj.user + '_' + self.runobj.run
    else:
        site, dir_prefix = '', self.runobj.run
    # An empty or None lane name is passed on as ''.
    lane_name = self.runobj.lane_name or ''

    self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform,
                     lane_name = lane_name, site = site)
    self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
    self.fasta_dir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
    self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)

    # NOTE(review): both values are read but never used -- the connection
    # below is hard-coded.  Confirm whether they were meant to be passed on.
    host_name = runobj.database_host
    database_name = runobj.database_name

    self.filenames = []
    self.my_conn = MyConnection(host = 'newbpcdb2.jbpc-np.mbl.edu', db="env454")
    self.sequence_table_name = "sequence_ill"
    self.sequence_field_name = "sequence_comp"
    self.my_csv = None

    # Start from a clean counts file.
    self.unique_file_counts = self.dirs.unique_file_counts
    self.dirs.delete_file(self.unique_file_counts)

    self.seq_id_dict = {}
    self.tax_id_dict = {}
    self.run_id = None

    self.nonchimeric_suffix = "." + C.nonchimeric_suffix  # ".nonchimeric.fa"
    self.fa_unique_suffix = ".fa." + C.unique_suffix  # .fa.unique
    self.v6_unique_suffix = "MERGED_V6_PRIMERS_REMOVED." + C.unique_suffix
    self.suff_list = [self.nonchimeric_suffix, self.fa_unique_suffix, self.v6_unique_suffix]
    self.suffix_used = ""
def __init__(self, runobj = None):
    """Set up suffixes, directories and usearch settings for chimera checking.

    For a VAMPS user upload the SGE environment variables are exported, the
    input directory is the trimming directory, ``self.idx_keys`` is read
    from the "new_lane_keys" entry of the trim-status JSON file, and the
    analysis and gast directories are checked as well.  Otherwise the input
    directory is the reads-overlap directory.
    """
    self.utils = PipelneUtils()
    self.runobj = runobj
    self.run_keys = self.runobj.run_keys
    self.rundate = self.runobj.run
    # Optional attribute: the cluster is used unless runobj says otherwise.
    self.use_cluster = getattr(self.runobj, "use_cluster", True)

    self.chg_suffix = ".chg"
    self.chimeras_suffix = ".chimeras"
    self.ref_suffix = ".db"
    self.denovo_suffix = ".txt"
    self.nonchimeric_suffix = "." + C.nonchimeric_suffix  # ".nonchimeric.fa"
    self.chimeric_suffix = ".chimeric.fa"
    self.base_suffix = "unique" + self.chimeras_suffix

    # A missing or empty lane name both collapse to ''.
    lane_name = getattr(self.runobj, "lane_name", "") or ""

    is_user_upload = self.runobj.vamps_user_upload
    if is_user_upload:
        # Make the grid-engine tools reachable for jobs started from here.
        os.environ['SGE_ROOT'] = '/opt/sge'
        os.environ['SGE_CELL'] = 'grendel'
        os.environ['PATH'] = '/opt/sge/bin/lx24-amd64:' + os.environ['PATH']
        site = self.runobj.site
        dir_prefix = self.runobj.user + '_' + self.runobj.run
    else:
        site = ''
        dir_prefix = self.runobj.run
    self.dirs = Dirs(is_user_upload, dir_prefix, self.runobj.platform,
                     lane_name = lane_name, site = site)

    if is_user_upload:
        # Close the status file deterministically instead of leaking it.
        with open(self.runobj.trim_status_file_name, "r") as status_file:
            trim_status = json.load(status_file)
        self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
        self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
        self.indir = self.dirs.check_dir(self.dirs.trimming_dir)
        self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)
    else:
        self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

    self.usearch_cmd = C.usearch6_cmd
    self.refdb = C.chimera_checking_refdb_6
    self.its_refdb = C.chimera_checking_its_refdb_6
    self.input_file_names = self.make_chimera_input_illumina_file_names()
def __init__(self, runobj):
    """Remember the run object and resolve the input/output directories.

    ``out_file_path`` is the checked analysis directory and ``results_path``
    the checked reads-overlap directory, both taken from a ``Dirs`` object
    built from the run's upload flag, directory prefix, platform, lane name
    and site.
    """
    self.utils = PipelneUtils()
    self.runobj = runobj
    self.out_files = {}
    self.id_dataset_idx = {}
    self.in_file_path = self.runobj.input_dir

    user_upload = self.runobj.vamps_user_upload
    if user_upload:
        # User uploads live under "<user>_<run>" on a specific site.
        site = self.runobj.site
        dir_prefix = self.runobj.user + '_' + self.runobj.run
    else:
        site, dir_prefix = '', self.runobj.run
    # An empty or None lane name is passed on as ''.
    lane_name = self.runobj.lane_name or ''

    self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform,
                     lane_name = lane_name, site = site)
    self.out_file_path = self.dirs.check_dir(self.dirs.analysis_dir)
    self.results_path = self.dirs.check_dir(self.dirs.reads_overlap_dir)
    self.platform = self.runobj.platform
class MyConnection: """ Connection to env454 Takes parameters from ~/.my.cnf, default host = "vampsdev", db="test" if different use my_conn = MyConnection(host, db) """ def __init__(self, host="bpcweb7", db="test"): # , read_default_file=os.path.expanduser("~/.my.cnf"), port = 3306 self.utils = PipelneUtils() self.conn = None self.cursor = None self.rows = 0 self.new_id = None self.lastrowid = None try: self.utils.print_both("=" * 40) self.utils.print_both("host = " + str(host) + ", db = " + str(db)) self.utils.print_both("=" * 40) read_default_file = os.path.expanduser("~/.my.cnf") port_env = 3306 if self.utils.is_local(): host = "127.0.0.1" if db == "env454": port_env = 3308 read_default_file = os.path.expanduser("~/.my.cnf_server") else: db = "test_env454" self.conn = MySQLdb.connect(host = host, db = db, read_default_file = read_default_file, port = port_env) self.cursor = self.conn.cursor() # self.escape = self.conn.escape() except MySQLdb.Error, e: self.utils.print_both("Error %d: %s" % (e.args[0], e.args[1])) raise except: # catch everything
def __init__(self, runobj = None):
    """Set up suffixes, directories and usearch settings for chimera checking.

    For a VAMPS user upload the input directory is the trimming directory
    and ``self.idx_keys`` is read from the "new_lane_keys" entry of the
    trim-status JSON file; otherwise the input directory is the
    reads-overlap directory.
    """
    self.utils = PipelneUtils()
    self.runobj = runobj
    self.run_keys = self.runobj.run_keys
    self.rundate = self.runobj.run

    self.chg_suffix = ".chg"
    self.chimeras_suffix = ".chimeras"
    self.ref_suffix = ".db"
    self.denovo_suffix = ".txt"
    self.nonchimeric_suffix = "." + C.nonchimeric_suffix  # ".nonchimeric.fa"
    self.chimeric_suffix = ".chimeric.fa"
    self.base_suffix = "unique" + self.chimeras_suffix

    # A missing or empty lane name both collapse to ''.
    lane_name = getattr(self.runobj, "lane_name", "") or ""

    is_user_upload = self.runobj.vamps_user_upload
    if is_user_upload:
        site = self.runobj.site
        dir_prefix = self.runobj.user + '_' + self.runobj.run
    else:
        site = ''
        dir_prefix = self.runobj.run
    self.dirs = Dirs(is_user_upload, dir_prefix, self.runobj.platform,
                     lane_name = lane_name, site = site)

    if is_user_upload:
        # Close the status file deterministically instead of leaking it.
        with open(self.runobj.trim_status_file_name, "r") as status_file:
            trim_status = json.load(status_file)
        self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
        self.indir = self.dirs.check_dir(self.dirs.trimming_dir)
    else:
        self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
    self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

    self.usearch_cmd = C.usearch6_cmd
    self.refdb = C.chimera_checking_refdb_6
    self.its_refdb = C.chimera_checking_its_refdb_6
    self.input_file_names = self.make_chimera_input_illumina_file_names()
def __init__(self, runobj):
    """Remember the run object and resolve the input/output directories.

    ``out_file_path`` is the checked analysis directory and ``results_path``
    the checked reads-overlap directory of the run's ``Dirs`` object.
    """
    self.utils = PipelneUtils()
    self.runobj = runobj
    self.out_files = {}
    self.id_dataset_idx = {}
    self.in_file_path = self.runobj.input_dir

    if self.runobj.vamps_user_upload:
        # User uploads live under "<user>_<run>" on a specific site.
        site = self.runobj.site
        dir_prefix = self.runobj.user + '_' + self.runobj.run
    else:
        site, dir_prefix = '', self.runobj.run

    # An empty or None lane name is passed on as ''.
    lane_name = self.runobj.lane_name or ''

    self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform,
                     lane_name = lane_name, site = site)
    self.out_file_path = self.dirs.check_dir(self.dirs.analysis_dir)
    self.results_path = self.dirs.check_dir(self.dirs.reads_overlap_dir)
from pipeline.get_ini import readCSV
from pipeline.pipelinelogging import logger
from pipeline.utils import Dirs, PipelneUtils
import IlluminaUtils.lib.fastalib as fastalib

# Import MySQLdb with a hint on how to load it when it is missing.
# A failed import raises ImportError; at that point the name MySQLdb is not
# bound, so the handler cannot be written as "except MySQLdb.Error" (that
# would raise NameError and the hint below would never be shown).
try:
    import MySQLdb
except ImportError as e:
    message = """
MySQLdb ERROR
To load the correct module, try running these commands before running the pipeline:

source /xraid/bioware/Modules/etc/profile.modules
module load bioware
"""
    # print_both is called on an instance, as everywhere else in this file.
    import_utils = PipelneUtils()
    import_utils.print_both(message)
    import_utils.print_both("Error: %s" % (e,))
    raise
except Exception:
    # Anything else going wrong while importing: report the type, re-raise.
    import_utils = PipelneUtils()
    import_utils.print_both("Unexpected:")
    import_utils.print_both(sys.exc_info()[0])
    raise
class IlluminaFiles:
    """
    Split CASAVA fastq files per dataset and drive the read-overlap tools.

    0) from run create all dataset_lines names files in output dir
    1) split fastq files from casava into files with dataset_names
    2) create ini files
    3) process them through Meren's script
    4) result - files dataset_lane-PERFECT_reads.fa.unique with frequencies - to process with env454upload()
    """

    def __init__(self, runobj):
        """Remember the run object and resolve the input/output directories."""
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.out_files = {}
        self.id_dataset_idx = {}
        self.in_file_path = self.runobj.input_dir

        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        # An empty or None lane name is passed on as ''.
        lane_name = self.runobj.lane_name or ''

        dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
        self.dirs = dirs
        self.out_file_path = dirs.check_dir(dirs.analysis_dir)
        self.results_path = dirs.check_dir(dirs.reads_overlap_dir)

    def split_files(self, compressed = False):
        """
        Distribute the reads of the input fastq files over per-dataset files
        and write one ini file per dataset.

        TODO: *) fastq_file_names method to collect all file_names with full path or directories_names (see get_all_files()?)
        """
        (in_files_r1, in_files_r2) = self.get_fastq_file_names(self.in_file_path)
        if in_files_r1:
            self.read1(in_files_r1, compressed)
            self.read2(in_files_r2, compressed)
            self.create_inis()
        else:
            self.utils.print_both("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.")
        # NOTE(review): the original indentation was lost; the output files
        # are assumed to be closed on both paths -- confirm.
        self.close_dataset_files()

    def open_dataset_files(self):
        """Open one fastq output per sample and read direction, plus "unknown"."""
        sample_names = self.runobj.samples.keys()
        # All R1 files are opened first, then all R2 files.
        file_name_base = [name + "_R1" for name in sample_names] + [name + "_R2" for name in sample_names]
        for f_name in file_name_base:
            output_file = os.path.join(self.out_file_path, f_name + ".fastq")
            self.out_files[f_name] = fq.FastQOutput(output_file)
        self.out_files["unknown"] = fq.FastQOutput(os.path.join(self.out_file_path, "unknown" + ".fastq"))

    def close_dataset_files(self):
        """Close every output opened by open_dataset_files()."""
        for out_file in self.out_files.values():
            out_file.close()

    def call_sh_script(self, script_name_w_path, where_to_run):
        """Make the script executable, then run it with bash (local) or qsub.

        Any failure is reported through print_both and re-raised.
        """
        try:
            call(['chmod', '0774', script_name_w_path])
            if self.utils.is_local():
                self.utils.print_both("call(['qsub', script_name_w_path], cwd=(where_to_run))")
                call(['bash', script_name_w_path], cwd=(where_to_run))
            else:
                call(['qsub', script_name_w_path], cwd=(where_to_run))
        except Exception:
            self.utils.print_both("Problems with script_name = %s or qsub" % (script_name_w_path))
            raise

    def _submit_job_array(self, command_line, files_dir, file_list, chmod_after = False):
        """Write the job-array script for command_line into files_dir, run it there and return the script's base name."""
        script_file_name = self.create_job_array_script(command_line, files_dir, file_list)
        script_file_name_full = os.path.join(files_dir, script_file_name)
        self.call_sh_script(script_file_name_full, files_dir)
        if chmod_after:
            self.utils.print_both("self.dirs.chmod_all(%s)" % (files_dir))
            self.dirs.chmod_all(files_dir)
        return script_file_name

    def merge_perfect(self):
        """Run the perfect-overlap merge over every ini file; return the script name.

        From the notes kept with the original code:
        iu-merge-pairs anna.ini --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0
        Each flag is critical. marker-gene-stringent looks complete overlaps,
        retain-only-overlap gets rid of adapters, max-num-mismatches retains
        only perfect overlaps. This generates the test_MERGED file with all
        complete overlaps without any mismatches. But it has all the primers;
        they are removed afterwards by trim_primers_perfect().
        """
        self.utils.print_both("merge perfect V6 reads:")
        program_name = C.perfect_overlap_cmd
        if self.utils.is_local():
            program_name = C.perfect_overlap_cmd_local
        add_arg = " --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0"
        command_line = program_name + add_arg
        file_list = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        return self._submit_job_array(command_line, self.dirs.analysis_dir, file_list)

    def trim_primers_perfect(self):
        """Run the primer trimmer over every "_MERGED" file; return the script name.

        "--archaea" is added when any configured primer suite starts with
        "archaeal" (case-insensitive).
        """
        self.utils.print_both("trim primers from perfect V6 reads:")
        merged_file_names = self.dirs.get_all_files_by_ext(self.dirs.reads_overlap_dir, "_MERGED")
        primer_suite = self.get_config_values('primer_suite')
        add_arg = ""
        if any(suite.lower().startswith("Archaeal".lower()) for suite in primer_suite):
            add_arg += " --archaea"
        program_name = C.trim_primers_cmd + add_arg
        return self._submit_job_array(program_name, self.dirs.reads_overlap_dir, merged_file_names)

    def partial_overlap_reads_cluster(self):
        """Run the partial-overlap merge over every ini file as a job array.

        "--marker-gene-stringent" is added when any config section has
        dna_region "ITS1".  Returns the script name.
        """
        self.utils.print_both("Extract partial_overlap V4V5 reads:")
        program_name = C.partial_overlap_cmd
        if self.utils.is_local():
            program_name = C.partial_overlap_cmd_local
        dna_region = self.get_config_values('dna_region')
        if "ITS1" in list(dna_region):
            add_arg = "--marker-gene-stringent"
        else:
            add_arg = ""
        command_line = program_name + " --enforce-Q30-check " + add_arg
        file_list = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        return self._submit_job_array(command_line, self.dirs.analysis_dir, file_list, chmod_after = True)

    def partial_overlap_reads(self):
        """Run the partial-overlap merge sample by sample, without a job array."""
        self.utils.print_both("Extract partial_overlap V4V5 reads:")
        for idx_key in self.runobj.samples.keys():
            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            program_name = C.partial_overlap_cmd
            if self.utils.is_local():
                program_name = C.partial_overlap_cmd_local
            try:
                if self.runobj.samples[idx_key].dna_region == "ITS1":
                    call([program_name, "--enforce-Q30-check", "--marker-gene-stringent", ini_file_name])
                else:
                    call([program_name, "--enforce-Q30-check", ini_file_name])
            except Exception:
                message = traceback.format_exc()
                self.utils.print_both(message)
                self.utils.print_both("Problems with program_name = %s" % (program_name))
                raise

    def get_config_values(self, key):
        """Return the set of values stored under key in the configPath sections."""
        return set(section[key] for section in self.runobj.configPath.values() if key in section)

    def make_users_email(self):
        """Return "<login name>@mbl.edu" for the user running the pipeline."""
        return getpass.getuser() + "@mbl.edu"

    def create_job_array_script(self, command_line, dir_to_run, files_list):
        """Write an SGE job-array script that runs command_line once per file.

        The script is named after the base name of the command, the run and
        the lane name, and is written into dir_to_run.  Returns the script's
        base name (not the full path).
        """
        files_string = " ".join(files_list)
        files_list_size = len(files_list)
        command_file_name = os.path.basename(command_line.split(" ")[0])
        script_file_name = command_file_name + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        log_file_name = script_file_name + ".sge_script.sh.log"
        email_mbl = self.make_users_email()
        # NOTE(review): the line layout of this template was reconstructed
        # from a whitespace-collapsed copy -- compare with a generated script.
        text = (
                '''#!/bin/bash

#$ -cwd
#$ -S /bin/bash
#$ -N %s
# Giving the name of the output log file
#$ -o %s
# Combining output/error messages into one file
#$ -j y
# Send mail to these users
#$ -M %s
# Send mail at job end; -m eas sends on end, abort, suspend.
#$ -m eas
#$ -t 1-%s
# Now the script will iterate %s times.

  file_list=(%s)

  i=$(expr $SGE_TASK_ID - 1)
#   echo "i = $i"
  # . /etc/profile.d/modules.sh
  # . /xraid/bioware/bioware-loader.sh
  . /xraid/bioware/Modules/etc/profile.modules
  module load bioware

  echo "%s ${file_list[$i]}"
  %s ${file_list[$i]}
''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line, command_line)
                )
        self.open_write_close(script_file_name_full, text)
        return script_file_name

    def filter_mismatches_cluster(self, max_mismatch = 3):
        """Run the mismatch filter over every "_MERGED" file as a job array.

        max_mismatch is only reported in the log line; it is not passed on to
        the command.  Returns the script name.
        """
        self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch))
        command_line = C.filter_mismatch_cmd
        if self.utils.is_local():
            command_line = C.filter_mismatch_cmd_local
        files_dir = self.dirs.reads_overlap_dir
        file_list = self.dirs.get_all_files_by_ext(files_dir, "_MERGED")
        return self._submit_job_array(command_line, files_dir, file_list, chmod_after = True)

    def filter_mismatches(self, max_mismatch = 3):
        """Run the mismatch filter on each file whose base ends with "_MERGED".

        max_mismatch is only reported in the log line; it is not passed on to
        the command.
        """
        self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch))
        files = self.dirs.get_all_files()
        for full_name in files:
            if files[full_name][0].endswith('_MERGED'):
                program_name = C.filter_mismatch_cmd
                if self.utils.is_local():
                    program_name = C.filter_mismatch_cmd_local
                call([program_name, full_name])

    def uniq_fa_cluster(self):
        """Run the fasta uniquing command as a job array; return the script name.

        Input files are looked up by C.filtered_suffix first, then ".fa",
        then "MERGED_V6_PRIMERS_REMOVED"; the first non-empty list is used.
        """
        self.utils.print_both("Uniqueing fasta files")
        command_line = C.fastaunique_cmd
        if self.utils.is_local():
            command_line = C.fastaunique_cmd_local
        files_dir = self.dirs.reads_overlap_dir
        file_list = self.dirs.get_all_files_by_ext(files_dir, C.filtered_suffix)
        if len(file_list) == 0:
            file_list = self.dirs.get_all_files_by_ext(files_dir, ".fa")
        if len(file_list) == 0:
            file_list = self.dirs.get_all_files_by_ext(files_dir, "MERGED_V6_PRIMERS_REMOVED")
        return self._submit_job_array(command_line, files_dir, file_list, chmod_after = True)

    def uniq_fa(self):
        """Run the fasta uniquing command on each ".fa" or filtered file."""
        self.utils.print_both("Uniqueing fasta files")
        files = self.dirs.get_all_files()
        for full_name in files:
            if files[full_name][1] == ".fa" or files[full_name][0].endswith(C.filtered_suffix):
                program_name = C.fastaunique_cmd
                if self.utils.is_local():
                    program_name = C.fastaunique_cmd_local
                call([program_name, full_name])

    def get_primers(self):
        """Return {sample key: (proximal_primer, distal_primer)} for all samples.

        NOTE(review): when a sample's primer suite is not in C.primers_dict
        an error is logged and the sample gets the primers left over from the
        previous sample (or empty strings for the first one) -- confirm that
        this carry-over is intended.
        """
        proximal_primer = ""
        distal_primer = ""
        primers = {}
        for idx_key in self.runobj.samples.keys():
            primer_suite = self.runobj.samples[idx_key].primer_suite.lower()
            if primer_suite in C.primers_dict:
                proximal_primer = C.primers_dict[primer_suite]["proximal_primer"]
                distal_primer = C.primers_dict[primer_suite]["distal_primer"]
            else:
                self.utils.print_both("ERROR! Something wrong with the primer suite name: %s. NB: For v6mod it suppose to be 'Archaeal V6mod Suite'\n" % (primer_suite))
            primers[idx_key] = (proximal_primer, distal_primer)
        return primers

    def create_inis(self):
        """Write "<sample key>.ini" into the analysis directory for every sample.

        Unless runobj.do_perfect is set, a [prefixes] section built from the
        run key and the sample's primers is appended.
        """
        # The primers are the same for every iteration: look them up once,
        # and only when a [prefixes] section is actually needed.
        primers = None
        for idx_key in self.runobj.samples.keys():
            # todo: check if works w/o NNNN when there is a proper csv
            run_key = idx_key.split('_')[1].replace("N", ".")
            email = self.runobj.samples[idx_key].email
            # NOTE(review): the line layout of this template was reconstructed
            # from a whitespace-collapsed copy -- compare with a generated ini.
            text = """[general]
project_name = %s
researcher_email = %s
input_directory = %s
output_directory = %s

[files]
pair_1 = %s
pair_2 = %s
""" % (idx_key, email, self.out_file_path, self.results_path, idx_key + "_R1.fastq", idx_key + "_R2.fastq")

            # That's for partial overlap (v4v5 miseq illumina)
            if not self.runobj.do_perfect:
                if primers is None:
                    primers = self.get_primers()
                text += """
# following section is optional
[prefixes]
pair_1_prefix = ^""" + run_key + primers[idx_key][0] + "\npair_2_prefix = ^" + primers[idx_key][1]

            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            self.open_write_close(ini_file_name, text)

    def open_write_close(self, script_file_name, text):
        """Write text to script_file_name, replacing any previous content."""
        # The context manager closes the file even when the write fails.
        with open(script_file_name, "w") as out_file:
            out_file.write(text)

    def get_fastq_file_names(self, f_input_file_path):
        """Walk f_input_file_path and return (R1 paths, R2 paths).

        Only file names accepted by get_correct_file_names() are considered;
        those with neither "_R1_" nor "_R2_" after the first character are
        reported on stderr.
        """
        in_files_r1 = []
        in_files_r2 = []
        # TODO: exclude dir with new created files from the loop
        for dirname, dirnames, filenames in os.walk(f_input_file_path):
            correct_file_names = self.get_correct_file_names(filenames)
            for filename in sorted(correct_file_names):
                # find() > 0 on purpose: a marker at position 0 does not count.
                if filename.find('_R1_') > 0:
                    in_files_r1.append(os.path.join(dirname, filename))
                elif filename.find('_R2_') > 0:
                    in_files_r2.append(os.path.join(dirname, filename))
                else:
                    sys.stderr.write("No read number in the file name: %s\n" % filename)
        self.utils.print_both("FFF0: in_files_r1 %s\n, in_files_r2 %s" % (in_files_r1, in_files_r2))
        return (in_files_r1, in_files_r2)

    def get_correct_file_names(self, filenames):
        """Return the set of file names whose index starts one of the run keys."""
        correct_file_names = set()
        for file1 in filenames:
            index_sequence = self.get_index(file1)
            if any(run_key.startswith(index_sequence) for run_key in self.runobj.run_keys):
                correct_file_names.add(file1)
        return correct_file_names

    def read1(self, files_r1, compressed):
        """
        loop through the fastq_file_names
        1) e.pair_no = 1, find run_key -> dataset name
        2) collect the relevant part of id
        """
        # The run keys do not change while reading, so decide once whether
        # they carry the "NNNN" prefix.
        has_ns = any("NNNN" in s for s in self.runobj.run_keys)
        for file_r1 in files_r1:
            self.utils.print_both("====\nFFF1: file %s" % file_r1)
            f_input = fq.FastQSource(file_r1, compressed)
            index_sequence = self.get_index(file_r1)
            while f_input.next(trim_to = C.trimming_length):
                e = f_input.entry
                ini_run_key = index_sequence + "_" + self.get_run_key(e.sequence, has_ns) + "_" + e.lane_number
                if int(e.pair_no) == 1:
                    dataset_file_name_base_r1 = ini_run_key + "_R1"
                    if dataset_file_name_base_r1 in self.out_files:
                        self.out_files[dataset_file_name_base_r1].store_entry(e)
                        # TODO: make a method:
                        # Build the header the matching R2 read will have.
                        short_id1 = e.header_line.split()[0]
                        short_id2 = ":".join(e.header_line.split()[1].split(":")[1:])
                        id2 = short_id1 + " 2:" + short_id2
                        self.id_dataset_idx[id2] = ini_run_key
                    else:
                        # NOTE(review): the original indentation was lost;
                        # this branch is read as "R1 read without a known
                        # dataset" -- confirm it does not belong to the
                        # pair_no check instead.
                        self.out_files["unknown"].store_entry(e)

    def get_run_key(self, e_sequence, has_ns = True):
        """Return "NNNN" + e_sequence[4:9] when has_ns is true, else e_sequence[0:5]."""
        if has_ns:
            return "NNNN" + e_sequence[4:9]
        return e_sequence[0:5]

    def remove_end_ns_strip(self, e_sequence):
        """Return e_sequence without its trailing "N" characters."""
        return e_sequence.rstrip('N')

    def read2(self, files_r2, compressed):
        """3) e.pair_no = 2, find id from 2), assign dataset_name"""
        for file_r2 in files_r2:
            self.utils.print_both("FFF2: file %s" % file_r2)
            f_input = fq.FastQSource(file_r2, compressed)
            while f_input.next(trim_to = C.trimming_length):
                e = f_input.entry
                if (int(e.pair_no) == 2) and (e.header_line in self.id_dataset_idx):
                    file_name = self.id_dataset_idx[e.header_line] + "_R2"
                    self.out_files[file_name].store_entry(e)
                else:
                    self.out_files["unknown"].store_entry(e)

    def get_index(self, file_r1):
        """Return the index part of a fastq file name.

        The index is the first "_"-separated part of the base name, or the
        second part when the first one starts with "IDX".
        """
        file_name_parts = os.path.basename(file_r1).split("_")
        if file_name_parts[0].startswith("IDX"):
            return file_name_parts[1]
        return file_name_parts[0]
class IlluminaFiles:
    """
    Split Illumina fastq files into per-dataset files and run the overlap tools.

    0) from run create all dataset_lines names files in output dir
    1) split fastq files from casava into files with dataset_names
    2) create ini files
    3) process them through Meren's script
    4) result - files dataset_lane-PERFECT_reads.fa.unique with frequencies - to process with env454upload()
    """

    def __init__(self, runobj):
        """Remember the run object and resolve the analysis / overlap directories."""
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.out_files = {}          # output name ("<key>_R1", "<key>_R2", "unknown") -> fastq writer
        self.id_dataset_idx = {}     # expected read-2 header line -> dataset key found in read 1
        self.in_file_path = self.runobj.input_dir

        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        if self.runobj.lane_name:
            lane_name = self.runobj.lane_name
        else:
            lane_name = ''

        dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
        self.out_file_path = dirs.check_dir(dirs.analysis_dir)
        self.results_path = dirs.check_dir(dirs.reads_overlap_dir)

    def split_files(self, compressed = False):
        """
        Distribute the reads of every input fastq file into per-dataset files
        and write one ini file per dataset.

        The output files must already be open (see open_dataset_files()); they
        are closed here even when reading fails.

        TODO: *) fastq_file_names method to collect all file_names with full path or directories_names (see get_all_files()?)
        """
        (in_files_r1, in_files_r2) = self.get_fastq_file_names(self.in_file_path)
        try:
            self.read1(in_files_r1, compressed)
            self.read2(in_files_r2, compressed)
            self.create_inis()
        finally:
            self.close_dataset_files()

    def open_dataset_files(self):
        """Open one R1 and one R2 fastq writer per sample key, plus the "unknown" writer."""
        sample_keys = list(self.runobj.samples.keys())
        file_name_base = [key + "_R1" for key in sample_keys] + [key + "_R2" for key in sample_keys]
        for f_name in file_name_base:
            output_file = os.path.join(self.out_file_path, f_name + ".fastq")
            self.out_files[f_name] = fq.FastQOutput(output_file)
        self.out_files["unknown"] = fq.FastQOutput(os.path.join(self.out_file_path, "unknown" + ".fastq"))

    def close_dataset_files(self):
        """Close every writer stored in self.out_files."""
        for o_file in self.out_files.values():
            o_file.close()

    def get_all_files(self):
        """
        Walk self.out_file_path recursively.

        Returns a dict: full file name -> (name without extension, extension),
        as produced by os.path.splitext().
        """
        files = {}
        for dirname, dirnames, filenames in os.walk(self.out_file_path):
            for file_name in filenames:
                full_name = os.path.join(dirname, file_name)
                files[full_name] = os.path.splitext(full_name)
        return files

    def perfect_reads(self):
        """Run the perfect-overlap program on the ini file of every sample."""
        print("Extract perfect V6 reads:")
        program_name = C.perfect_overlap_cmd
        if self.utils.is_local():
            program_name = C.perfect_overlap_cmd_local
        for idx_key in self.runobj.samples.keys():
            file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            try:
                if self.runobj.samples[idx_key].primer_suite.startswith('Archaeal'):
                    call([program_name, file_name, "--archaea"])
                else:
                    call([program_name, file_name])
            except Exception:
                print("Problems with program_name = %s, file_name = %s" % (program_name, file_name))
                raise

    def partial_overlap_reads(self):
        """Run the partial-overlap program on the ini file of every sample."""
        print("Extract partial_overlap V4V5 reads:")
        program_name = C.partial_overlap_cmd
        if self.utils.is_local():
            program_name = C.partial_overlap_cmd_local
        for idx_key in self.runobj.samples.keys():
            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            call([program_name, "--fast-merge", "--compute-qual-dicts", ini_file_name, idx_key])

    def filter_mismatches(self, max_mismatch = 3):
        """
        Run the mismatch filter on every file whose base name ends with "_MERGED".

        max_mismatch is only reported in the progress message; it is not
        passed to the external program.
        """
        print("Filter mismatches if more then %s" % (max_mismatch))
        program_name = C.filter_mismatch_cmd
        if self.utils.is_local():
            program_name = C.filter_mismatch_cmd_local
        files = self.get_all_files()
        for full_name in files:
            if files[full_name][0].endswith('_MERGED'):
                # The flag and its value have to be separate argv entries:
                # call() runs without a shell, so "--output <path>" given as
                # one string reaches the program as a single odd argument.
                call([program_name, full_name, "--output", full_name + "_FILTERED"])

    def uniq_fa(self):
        """Run the fasta uniquing program on every ".fa" and "_MERGED_FILTERED" file."""
        print("Uniqueing fasta files")
        program_name = C.fastaunique_cmd
        if self.utils.is_local():
            program_name = C.fastaunique_cmd_local
        files = self.get_all_files()
        for full_name in files:
            if files[full_name][1] == ".fa" or files[full_name][0].endswith('_MERGED_FILTERED'):
                call([program_name, full_name])

    def create_inis(self):
        """Write "<sample key>.ini" into self.out_file_path for every sample."""
        for idx_key in self.runobj.samples.keys():
            # Second "_"-separated part of the key, with every N turned into a regex "."
            run_key = idx_key.split('_')[1].replace("N", ".")
            email = self.runobj.samples[idx_key].email
            text = """[general]
project_name = %s
researcher_email = %s
input_directory = %s
output_directory = %s

[files]
pair_1 = %s
pair_2 = %s
""" % (idx_key, email, self.out_file_path, self.results_path, idx_key + "_R1.fastq", idx_key + "_R2.fastq")

            # That's for v4v5 miseq illumina
            if not self.runobj.do_perfect:
                text += """
# following section is optional
[prefixes]
pair_1_prefix = ^""" + run_key + """CCAGCAGC[C,T]GCGGTAA.
pair_2_prefix = ^CCGTC[A,T]ATT[C,T].TTT[G,A]A.T
"""
            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            self.open_write_close(ini_file_name, text)

    def open_write_close(self, ini_file_name, text):
        """Write text to ini_file_name, replacing any previous content."""
        # "with" closes the handle even if write() raises.
        with open(ini_file_name, "w") as ini_file:
            ini_file.write(text)

    def get_fastq_file_names(self, f_input_file_path):
        """
        Collect fastq file names below f_input_file_path.

        Returns (read 1 files, read 2 files) as two lists of full paths. A file
        belongs to a list when "_R1_" / "_R2_" occurs in its name after the
        first character; any other file is reported on stderr and skipped.
        """
        in_files_r1 = []
        in_files_r2 = []
        # TODO: exclude dir with new created files from the loop
        for dirname, dirnames, filenames in os.walk(f_input_file_path):
            for filename in filenames:
                if filename.find('_R1_') > 0:
                    in_files_r1.append(os.path.join(dirname, filename))
                elif filename.find('_R2_') > 0:
                    in_files_r2.append(os.path.join(dirname, filename))
                else:
                    sys.stderr.write("No read number in the file name: %s\n" % filename)
        return (in_files_r1, in_files_r2)

    def _read2_header_for(self, header_line):
        """Return the header line expected on the read-2 mate of a read-1 entry."""
        header_parts = header_line.split()
        short_id1 = header_parts[0]
        # Drop the leading read number of the second field, keep the rest.
        short_id2 = ":".join(header_parts[1].split(":")[1:])
        return short_id1 + " 2:" + short_id2

    def read1(self, files_r1, compressed):
        """
        loop through the fastq_file_names
            1) e.pair_no = 1, find run_key -> dataset name
            2) collect the relevant part of id
        """
        for file_r1 in files_r1:
            print("FFF1: file %s" % file_r1)
            index_sequence = self.get_index(file_r1)
            f_input = fq.FastQSource(file_r1, compressed)
            while f_input.next():
                e = f_input.entry
                ini_run_key = index_sequence + "_" + "NNNN" + e.sequence[4:9] + "_" + e.lane_number
                # Membership is tested on the dicts themselves; building a
                # keys() list for every read is a linear scan under Python 2.
                if ini_run_key in self.runobj.samples and int(e.pair_no) == 1:
                    dataset_file_name_base_r1 = ini_run_key + "_R1"
                    if dataset_file_name_base_r1 in self.out_files:
                        self.out_files[dataset_file_name_base_r1].store_entry(e)
                        self.id_dataset_idx[self._read2_header_for(e.header_line)] = ini_run_key
                else:
                    self.out_files["unknown"].store_entry(e)

    def read2(self, files_r2, compressed):
        "3) e.pair_no = 2, find id from 2), assign dataset_name"
        for file_r2 in files_r2:
            print("FFF2: file %s" % file_r2)
            f_input = fq.FastQSource(file_r2, compressed)
            while f_input.next():
                e = f_input.entry
                if (int(e.pair_no) == 2) and (e.header_line in self.id_dataset_idx):
                    file_name = self.id_dataset_idx[e.header_line] + "_R2"
                    self.out_files[file_name].store_entry(e)
                else:
                    self.out_files["unknown"].store_entry(e)

    def get_index(self, file_r1):
        """Return the index part of a fastq file name (its first "_"-separated field)."""
        file_name_parts = os.path.basename(file_r1).split("_")
        # if the file name starts with "IDX, then actual idx will be next.
        index = file_name_parts[0]
        if file_name_parts[0].startswith("IDX"):
            index = file_name_parts[1]
        return index
def __init__(self, runobj=None):
    """
    Set up suffixes, directories and external commands for chimera checking.

    runobj must provide at least run_keys, run, vamps_user_upload and
    platform; use_cluster and lane_name are optional. Although the
    parameter defaults to None, the attributes are read immediately, so
    None raises AttributeError.
    """
    self.utils = PipelneUtils()
    self.runobj = runobj
    self.run_keys = self.runobj.run_keys
    self.rundate = self.runobj.run
    # A run object without use_cluster means "use the cluster".
    self.use_cluster = getattr(self.runobj, 'use_cluster', True)

    self.chg_suffix = ".chg"
    self.chimeras_suffix = ".chimeras"
    self.ref_suffix = ".db"
    self.denovo_suffix = ".txt"
    self.nonchimeric_suffix = "." + C.nonchimeric_suffix  # ".nonchimeric.fa"
    self.chimeric_suffix = ".chimeric.fa"
    self.base_suffix = "unique" + self.chimeras_suffix
    self.cluster_slots = {
        "grendel": [12, 8],
        "cricket": [40],
        "cluster5": [32]
    }

    # A missing or empty lane name both become ''.
    lane_name = getattr(self.runobj, 'lane_name', None) or ''

    if self.runobj.vamps_user_upload:
        # Make the SGE tools reachable for the processes started from here.
        os.environ['SGE_ROOT'] = '/opt/sge'
        os.environ['SGE_CELL'] = 'grendel'
        path = os.environ['PATH']
        os.environ['PATH'] = '/opt/sge/bin/lx24-amd64:' + path
        site = self.runobj.site
        dir_prefix = self.runobj.user + '_' + self.runobj.run
        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name=lane_name, site=site)
        # Read inside "with" so the status file is closed straight away.
        with open(self.runobj.trim_status_file_name, "r") as trim_status_file:
            trim_status = json.loads(trim_status_file.read())
        self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
        self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
        self.indir = self.dirs.check_dir(self.dirs.trimming_dir)
        self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)
    else:
        site = ''
        dir_prefix = self.runobj.run
        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name=lane_name, site=site)
        self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

    # self.usearch_cmd = C.usearch_cmd
    self.usearch_cmd = C.usearch6_cmd
    if self.utils.is_local():
        self.usearch_cmd = C.usearch6_cmd_local
    # self.abskew = C.chimera_checking_abskew
    self.refdb = C.chimera_checking_refdb
    if self.utils.is_local():
        # NOTE(review): only refdb_local is set here; self.refdb keeps the
        # non-local value - confirm that callers pick the right one.
        self.refdb_local = C.chimera_checking_refdb_local
    self.its_refdb = C.chimera_checking_its_refdb
    self.input_file_names = self.make_chimera_input_illumina_file_names()
class IlluminaFiles:
    """
    Split Illumina fastq files into per-dataset files and run the overlap
    tools, either directly or as SGE job arrays.

    0) from run create all dataset_lines names files in output dir
    1) split fastq files from casava into files with dataset_names
    2) create ini files
    3) process them through Meren's script
    4) result - files dataset_lane-PERFECT_reads.fa.unique with frequencies - to process with env454upload()
    """

    def __init__(self, runobj):
        """Remember the run object and resolve the analysis / overlap directories."""
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.out_files = {}          # output name ("<key>_R1", "<key>_R2", "unknown") -> fastq writer
        self.id_dataset_idx = {}     # expected read-2 header line -> dataset key found in read 1
        self.in_file_path = self.runobj.input_dir

        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        if self.runobj.lane_name:
            lane_name = self.runobj.lane_name
        else:
            lane_name = ''

        dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
        self.dirs = dirs
        self.out_file_path = dirs.check_dir(dirs.analysis_dir)
        self.results_path = dirs.check_dir(dirs.reads_overlap_dir)

    def split_files(self, compressed = False):
        """
        Distribute the reads of every input fastq file into per-dataset files
        and write one ini file per dataset.

        The output files must already be open (see open_dataset_files()); they
        are closed here even when reading fails.

        TODO: *) fastq_file_names method to collect all file_names with full path or directories_names (see get_all_files()?)
        """
        (in_files_r1, in_files_r2) = self.get_fastq_file_names(self.in_file_path)
        try:
            self.read1(in_files_r1, compressed)
            self.read2(in_files_r2, compressed)
            self.create_inis()
        finally:
            self.close_dataset_files()

    def open_dataset_files(self):
        """Open one R1 and one R2 fastq writer per sample key, plus the "unknown" writer."""
        sample_keys = list(self.runobj.samples.keys())
        file_name_base = [key + "_R1" for key in sample_keys] + [key + "_R2" for key in sample_keys]
        for f_name in file_name_base:
            output_file = os.path.join(self.out_file_path, f_name + ".fastq")
            self.out_files[f_name] = fq.FastQOutput(output_file)
        self.out_files["unknown"] = fq.FastQOutput(os.path.join(self.out_file_path, "unknown" + ".fastq"))

    def close_dataset_files(self):
        """Close every writer stored in self.out_files."""
        for o_file in self.out_files.values():
            o_file.close()

    def get_all_files(self):
        """
        Walk self.out_file_path recursively.

        Returns a dict: full file name -> (name without extension, extension),
        as produced by os.path.splitext().
        """
        files = {}
        for dirname, dirnames, filenames in os.walk(self.out_file_path):
            for file_name in filenames:
                full_name = os.path.join(dirname, file_name)
                files[full_name] = os.path.splitext(full_name)
        return files

    def perfect_reads(self):
        """Run the perfect-overlap program on the ini file of every sample."""
        print("Extract perfect V6 reads:")
        program_name = C.perfect_overlap_cmd
        if self.utils.is_local():
            program_name = C.perfect_overlap_cmd_local
        for idx_key in self.runobj.samples.keys():
            file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            try:
                if self.runobj.samples[idx_key].primer_suite.startswith('Archaeal'):
                    call([program_name, file_name, "--archaea"])
                else:
                    call([program_name, file_name])
            except Exception:
                print("Problems with program_name = %s, file_name = %s" % (program_name, file_name))
                raise

    def call_sh_script(self, script_name_w_path, where_to_run):
        """Make the script group-writable/executable (0774) and submit it with qsub from where_to_run."""
        try:
            call(['chmod', '0774', script_name_w_path])
            call(['qsub', script_name_w_path], cwd=(where_to_run))
        except Exception:
            print("Problems with script_name = %s" % (script_name_w_path))
            raise

    def _submit_job_array(self, command_line, files_dir, file_list):
        """Write the job-array script for command_line into files_dir, submit it, return its name."""
        script_file_name = self.create_job_array_script(command_line, files_dir, file_list)
        script_file_name_full = os.path.join(files_dir, script_file_name)
        self.call_sh_script(script_file_name_full, files_dir)
        return script_file_name

    def perfect_reads_cluster(self):
        """Submit the perfect-overlap program as a job array over all ini files; return the script name."""
        print("Extract perfect V6 reads:")
        program_name = C.perfect_overlap_cmd
        if self.utils.is_local():
            program_name = C.perfect_overlap_cmd_local
        primer_suite = self.get_config_values('primer_suite')
        if any("Archaeal" in s for s in primer_suite):
            add_arg = " --archaea"
        else:
            add_arg = ""
        command_line = program_name + add_arg
        file_list = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        return self._submit_job_array(command_line, self.dirs.analysis_dir, file_list)

    def partial_overlap_reads_cluster(self):
        """Submit the partial-overlap program as a job array over all ini files; return the script name."""
        print("Extract partial_overlap V4V5 reads:")
        program_name = C.partial_overlap_cmd
        if self.utils.is_local():
            program_name = C.partial_overlap_cmd_local
        dna_region = self.get_config_values('dna_region')
        if ("ITS1" in list(dna_region)):
            add_arg = "--marker-gene-stringent"
        else:
            add_arg = ""
        command_line = program_name + " --enforce-Q30-check " + add_arg
        file_list = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        return self._submit_job_array(command_line, self.dirs.analysis_dir, file_list)

    def partial_overlap_reads(self):
        """Run the partial-overlap program directly on the ini file of every sample."""
        print("Extract partial_overlap V4V5 reads:")
        program_name = C.partial_overlap_cmd
        if self.utils.is_local():
            program_name = C.partial_overlap_cmd_local
        for idx_key in self.runobj.samples.keys():
            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            try:
                if (self.runobj.samples[idx_key].dna_region == "ITS1"):
                    call([program_name, "--enforce-Q30-check", "--marker-gene-stringent", ini_file_name])
                else:
                    call([program_name, "--enforce-Q30-check", ini_file_name])
            except Exception:
                print(traceback.format_exc())
                print("Problems with program_name = %s" % (program_name))
                raise

    def get_config_values(self, key):
        """Return the set of values stored under key in the sections of runobj.configPath that have it."""
        return set([section[key] for section in self.runobj.configPath.values() if key in section])

    def make_users_email(self):
        """Return "<current login name>@mbl.edu"."""
        username = getpass.getuser()
        return username + "@mbl.edu"

    def create_job_array_script(self, command_line, dir_to_run, files_list):
        """
        Write an SGE job-array script that runs command_line once per file.

        command_line -- program plus fixed arguments; the file name is appended
        dir_to_run   -- directory the script is written to
        files_list   -- file names; task i of the array handles files_list[i - 1]

        Returns the script file name (without directory).
        """
        files_string = " ".join(files_list)
        files_list_size = len(files_list)
        command_file_name = os.path.basename(command_line.split(" ")[0])
        # lane_name may be unset; __init__ treats that as an empty string too.
        lane_name = self.runobj.lane_name or ''
        script_file_name = command_file_name + "_" + self.runobj.run + "_" + lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        log_file_name = script_file_name + ".sge_script.sh.log"
        email_mbl = self.make_users_email()
        text = (
            '''#!/bin/bash

#$ -cwd
#$ -S /bin/bash
#$ -N %s
# Giving the name of the output log file
#$ -o %s
# Combining output/error messages into one file
#$ -j y
# Send mail to these users
#$ -M %s
# Send mail at job end; -m eas sends on end, abort, suspend.
#$ -m eas
#$ -t 1-%s
# Now the script will iterate %s times.

file_list=(%s)

i=$(expr $SGE_TASK_ID - 1)
# echo "i = $i"

source ~/.bashrc
module load bioware

echo "%s ${file_list[$i]}"
%s ${file_list[$i]}
''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line, command_line)
        )
        self.open_write_close(script_file_name_full, text)
        return script_file_name

    def filter_mismatches_cluster(self, max_mismatch = 3):
        """
        Submit the mismatch filter as a job array over the "_MERGED" files.

        max_mismatch is only reported in the progress message; it is not
        passed to the external program. Returns the script name.
        """
        print("Filter mismatches if more then %s" % (max_mismatch))
        command_line = C.filter_mismatch_cmd
        if self.utils.is_local():
            command_line = C.filter_mismatch_cmd_local
        files_dir = self.dirs.reads_overlap_dir
        file_list = self.dirs.get_all_files_by_ext(files_dir, "_MERGED")
        return self._submit_job_array(command_line, files_dir, file_list)

    def filter_mismatches(self, max_mismatch = 3):
        """
        Run the mismatch filter directly on every file whose base name ends
        with "_MERGED". max_mismatch is only reported, not passed on.
        """
        print("Filter mismatches if more then %s" % (max_mismatch))
        program_name = C.filter_mismatch_cmd
        if self.utils.is_local():
            program_name = C.filter_mismatch_cmd_local
        files = self.get_all_files()
        for full_name in files:
            if files[full_name][0].endswith('_MERGED'):
                call([program_name, full_name])

    def uniq_fa_cluster(self):
        """
        Submit the fasta uniquing program as a job array over the filtered
        files, or over the ".fa" files when there are no filtered ones.
        Returns the script name.
        """
        print("Uniqueing fasta files")
        command_line = C.fastaunique_cmd
        if self.utils.is_local():
            command_line = C.fastaunique_cmd_local
        files_dir = self.dirs.reads_overlap_dir
        file_list = self.dirs.get_all_files_by_ext(files_dir, C.filtered_suffix)
        if len(file_list) == 0:
            file_list = self.dirs.get_all_files_by_ext(files_dir, ".fa")
        return self._submit_job_array(command_line, files_dir, file_list)

    def uniq_fa(self):
        """Run the fasta uniquing program directly on every ".fa" and filtered file."""
        print("Uniqueing fasta files")
        program_name = C.fastaunique_cmd
        if self.utils.is_local():
            program_name = C.fastaunique_cmd_local
        files = self.get_all_files()
        for full_name in files:
            if files[full_name][1] == ".fa" or files[full_name][0].endswith(C.filtered_suffix):
                call([program_name, full_name])

    def get_primers(self):
        """
        Return a dict: sample key -> (proximal primer, distal primer), looked
        up in C.primers_dict by the sample's primer suite.

        An unknown suite is reported and mapped to ("", "").
        """
        primers = {}
        for idx_key in self.runobj.samples.keys():
            primer_suite = self.runobj.samples[idx_key].primer_suite
            # Start from empty primers for every sample, so an unknown suite
            # cannot inherit the primers of whichever sample came before it.
            proximal_primer = ""
            distal_primer = ""
            if primer_suite in C.primers_dict:
                proximal_primer = C.primers_dict[primer_suite]["proximal_primer"]
                distal_primer = C.primers_dict[primer_suite]["distal_primer"]
            else:
                print("ERROR! Something wrong with the primer suite name: %s. NB: For v6mod it suppose to be 'Archaeal V6mod Suite'" % (primer_suite))
            primers[idx_key] = (proximal_primer, distal_primer)
        return primers

    def create_inis(self):
        """Write "<sample key>.ini" into self.out_file_path for every sample."""
        primers = self.get_primers()
        for idx_key in self.runobj.samples.keys():
            # Second "_"-separated part of the key, with every N turned into a regex "."
            run_key = idx_key.split('_')[1].replace("N", ".")
            email = self.runobj.samples[idx_key].email
            text = """[general]
project_name = %s
researcher_email = %s
input_directory = %s
output_directory = %s

[files]
pair_1 = %s
pair_2 = %s
""" % (idx_key, email, self.out_file_path, self.results_path, idx_key + "_R1.fastq", idx_key + "_R2.fastq")

            # That's for v4v5 miseq illumina
            if not self.runobj.do_perfect:
                text += """
# following section is optional
[prefixes]
pair_1_prefix = ^""" + run_key + primers[idx_key][0] + "\npair_2_prefix = ^" + primers[idx_key][1]

            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            self.open_write_close(ini_file_name, text)

    def open_write_close(self, script_file_name, text):
        """Write text to script_file_name, replacing any previous content."""
        # "with" closes the handle even if write() raises.
        with open(script_file_name, "w") as out_file:
            out_file.write(text)

    def get_fastq_file_names(self, f_input_file_path):
        """
        Collect fastq file names below f_input_file_path.

        Returns (read 1 files, read 2 files) as two lists of full paths. A file
        belongs to a list when "_R1_" / "_R2_" occurs in its name after the
        first character; any other file is reported on stderr and skipped.
        """
        in_files_r1 = []
        in_files_r2 = []
        # TODO: exclude dir with new created files from the loop
        for dirname, dirnames, filenames in os.walk(f_input_file_path):
            for filename in filenames:
                if filename.find('_R1_') > 0:
                    in_files_r1.append(os.path.join(dirname, filename))
                elif filename.find('_R2_') > 0:
                    in_files_r2.append(os.path.join(dirname, filename))
                else:
                    sys.stderr.write("No read number in the file name: %s\n" % filename)
        return (in_files_r1, in_files_r2)

    def _read2_header_for(self, header_line):
        """Return the header line expected on the read-2 mate of a read-1 entry."""
        header_parts = header_line.split()
        short_id1 = header_parts[0]
        # Drop the leading read number of the second field, keep the rest.
        short_id2 = ":".join(header_parts[1].split(":")[1:])
        return short_id1 + " 2:" + short_id2

    def read1(self, files_r1, compressed):
        """
        loop through the fastq_file_names
            1) e.pair_no = 1, find run_key -> dataset name
            2) collect the relevant part of id
        """
        for file_r1 in files_r1:
            print("FFF1: file %s" % file_r1)
            index_sequence = self.get_index(file_r1)
            f_input = fq.FastQSource(file_r1, compressed)
            while f_input.next():
                e = f_input.entry
                ini_run_key = index_sequence + "_" + "NNNN" + e.sequence[4:9] + "_" + e.lane_number
                # Membership is tested on the dicts themselves; building a
                # keys() list for every read is a linear scan under Python 2.
                if ini_run_key in self.runobj.samples and int(e.pair_no) == 1:
                    dataset_file_name_base_r1 = ini_run_key + "_R1"
                    if dataset_file_name_base_r1 in self.out_files:
                        self.out_files[dataset_file_name_base_r1].store_entry(e)
                        self.id_dataset_idx[self._read2_header_for(e.header_line)] = ini_run_key
                else:
                    self.out_files["unknown"].store_entry(e)

    def read2(self, files_r2, compressed):
        "3) e.pair_no = 2, find id from 2), assign dataset_name"
        for file_r2 in files_r2:
            print("FFF2: file %s" % file_r2)
            f_input = fq.FastQSource(file_r2, compressed)
            while f_input.next():
                e = f_input.entry
                if (int(e.pair_no) == 2) and (e.header_line in self.id_dataset_idx):
                    file_name = self.id_dataset_idx[e.header_line] + "_R2"
                    self.out_files[file_name].store_entry(e)
                else:
                    self.out_files["unknown"].store_entry(e)

    def get_index(self, file_r1):
        """Return the index part of a fastq file name (its first "_"-separated field)."""
        file_name_parts = os.path.basename(file_r1).split("_")
        # if the file name starts with "IDX, then actual idx will be next.
        index = file_name_parts[0]
        if file_name_parts[0].startswith("IDX"):
            index = file_name_parts[1]
        return index
class IlluminaFiles: """ 0) from run create all dataset_lines names files in output dir 1) split fastq files from casava into files with dataset_names 2) create ini files 3) process them through Meren's script 4) result - files dataset_lane-PERFECT_reads.fa.unique with frequencies - to process with env454upload() """ def __init__(self, runobj): self.utils = PipelneUtils() self.runobj = runobj self.out_files = {} self.id_dataset_idx = {} self.in_file_path = self.runobj.input_dir if self.runobj.vamps_user_upload: site = self.runobj.site dir_prefix=self.runobj.user+'_'+self.runobj.run else: site = '' dir_prefix = self.runobj.run if self.runobj.lane_name: lane_name = self.runobj.lane_name else: lane_name = '' dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.dirs = dirs self.out_file_path = dirs.check_dir(dirs.analysis_dir) self.results_path = dirs.check_dir(dirs.reads_overlap_dir) self.platform = self.runobj.platform def split_files(self, compressed = False): """ TODO: *) fastq_file_names method to collect all file_names with full path or directories_names (see get_all_files()?) """ # logger.debug("compressed = %s" % compressed) # compressed = ast.literal_eval(compressed) (in_files_r1, in_files_r2) = self.get_fastq_file_names(self.in_file_path) # correct_file_names = self.get_correct_file_names(in_files_r1) if (len(in_files_r1) > 0): self.read1(in_files_r1, compressed) self.read2(in_files_r2, compressed) self.create_inis() else: # logger.debug("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.") # logger.debug("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.") self.utils.print_both("ERROR: There is something wrong with fastq file names. 
Please check if they start with correct indexes.") self.close_dataset_files() # self.perfect_reads() # self.uniq_fa() def open_dataset_files(self): file_name_base = [i + "_R1" for i in self.runobj.samples.keys()] + [i + "_R2" for i in self.runobj.samples.keys()] for f_name in file_name_base: output_file = os.path.join(self.out_file_path, f_name + ".fastq") self.out_files[f_name] = fq.FastQOutput(output_file) self.out_files["unknown"] = fq.FastQOutput(os.path.join(self.out_file_path, "unknown" + ".fastq")) def close_dataset_files(self): [o_file[1].close() for o_file in self.out_files.items()] return # def perfect_reads(self): # self.utils.print_both("Extract perfect V6 reads:") # for idx_key in self.runobj.samples.keys(): # file_name = os.path.join(self.out_file_path, idx_key + ".ini") # program_name = C.perfect_overlap_cmd # if self.utils.is_local(): # program_name = C.perfect_overlap_cmd_local # try: # if self.runobj.samples[idx_key].primer_suite.lower().startswith('archaeal'): # call([program_name, file_name, "--archaea"]) # else: # call([program_name, file_name]) # except: # self.utils.print_both("Problems with program_name = %s, file_name = %s" % (program_name, file_name)) # raise # # TODO: use from util def call_sh_script(self, script_name_w_path, where_to_run): try: call(['chmod', '0774', script_name_w_path]) if self.utils.is_local(): self.utils.print_both("call(['qsub', script_name_w_path], cwd=(where_to_run))") call(['bash', script_name_w_path], cwd=(where_to_run)) else: call(['qsub', script_name_w_path], cwd=(where_to_run)) # pass except: self.utils.print_both("Problems with script_name = %s or qsub" % (script_name_w_path)) raise # todo: combine and DRY with partial - it's the same command, different arguments def merge_perfect(self): self.utils.print_both("merge perfect V6 reads:") program_name = C.perfect_overlap_cmd if self.utils.is_local(): program_name = C.perfect_overlap_cmd_local add_arg = " --marker-gene-stringent --retain-only-overlap 
--max-num-mismatches 0" command_line = program_name + add_arg file_list = self.dirs.get_all_files_by_ext(self.out_file_path, "ini") script_file_name = self.create_job_array_script(command_line, self.dirs.analysis_dir, file_list) script_file_name_full = os.path.join(self.dirs.analysis_dir, script_file_name) self.call_sh_script(script_file_name_full, self.dirs.analysis_dir) return script_file_name def trim_primers_perfect(self): self.utils.print_both("trim primers from perfect V6 reads:") merged_file_names = self.dirs.get_all_files_by_ext(self.dirs.reads_overlap_dir, "_MERGED") primer_suite = self.get_config_values('primer_suite') add_arg = "" if any([s.lower().startswith("archaeal") for s in primer_suite]): add_arg += " --archaea" program_name = C.trim_primers_cmd + add_arg script_file_name = self.create_job_array_script(program_name, self.dirs.reads_overlap_dir, merged_file_names) script_file_name_full = os.path.join(self.dirs.reads_overlap_dir, script_file_name) self.call_sh_script(script_file_name_full, self.dirs.reads_overlap_dir) return script_file_name """ def perfect_reads_cluster(self): ''' iu-merge-pairs anna.ini --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0 Each flag is critical. marker-gene-stringent looks complete overlaps, retain-only-overlap gets rid of adapters, max-num-mismatches retains only perfect overlaps. This generates the test_MERGED file with all complete overlaps without any mismatches. But it has all the primers. 
Then we process this file with the new and shiny iu-analyze-v6-complete-overlaps script: iu-trim-V6-primers test_MERGED ''' self.utils.print_both("Extract perfect V6 reads:") script_file_name = self.merge_perfect() trim_script_file_name = self.trim_primers_perfect() return (script_file_name, trim_script_file_name) """ def partial_overlap_reads_cluster(self): self.utils.print_both("Extract partial_overlap reads (from partial_overlap_reads_cluster):") program_name = C.partial_overlap_cmd if self.utils.is_local(): program_name = C.partial_overlap_cmd_local dna_region = self.get_config_values('dna_region') if set(C.marker_gene_stringent_regions) & set(list(dna_region)): add_arg = "--marker-gene-stringent" else: add_arg = "" # TODO: this part is the same in perfect overlap - move into a method command_line = program_name + " --enforce-Q30-check " + add_arg file_list = self.dirs.get_all_files_by_ext(self.out_file_path, "ini") script_file_name = self.create_job_array_script(command_line, self.dirs.analysis_dir, file_list) script_file_name_full = os.path.join(self.dirs.analysis_dir, script_file_name) self.call_sh_script(script_file_name_full, self.dirs.analysis_dir) self.utils.print_both("self.dirs.chmod_all(%s)" % (self.dirs.analysis_dir)) self.dirs.chmod_all(self.dirs.analysis_dir) return script_file_name def partial_overlap_reads(self): self.utils.print_both("Extract partial_overlap reads (from partial_overlap_reads):") for idx_key in self.runobj.samples.keys(): ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini") program_name = C.partial_overlap_cmd if self.utils.is_local(): program_name = C.partial_overlap_cmd_local try: if set(C.marker_gene_stringent_regions) & set(list(self.runobj.samples[idx_key].dna_region)): # if (self.runobj.samples[idx_key].dna_region == "ITS1"): call([program_name, "--enforce-Q30-check", "--marker-gene-stringent", ini_file_name]) else: call([program_name, "--enforce-Q30-check", ini_file_name]) # call([program_name, 
ini_file_name]) # call([program_name, ini_file_name, idx_key]) # call([program_name, "--fast-merge", ini_file_name, idx_key]) except Exception: # except Exception, err: message = traceback.format_exc() self.utils.print_both(message) #or # logger.debug(sys.exc_info()[0]) self.utils.print_both("Problems with program_name = %s" % (program_name)) raise # logger.debug("HERE: program_name = " % (program_name)) # call([program_name, "--fast-merge", "--compute-qual-dicts", ini_file_name, idx_key]) def get_config_values(self, key): config_path_data = [v for k, v in self.runobj.configPath.items()] return set([a[key] for a in config_path_data if key in a.keys()]) # TODO: use from util def make_users_email(self): username = getpass.getuser() return username + "@mbl.edu" # TODO: use from util # Removed by Hilary's request: # # Send mail at job end (e); -m as sends abort, suspend. # #$ -m as def create_job_array_script(self, command_line, dir_to_run, files_list): files_string = " ".join(files_list) files_list_size = len(files_list) command_file_name = os.path.basename(command_line.split(" ")[0]) script_file_name = command_file_name + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh" script_file_name_full = os.path.join(dir_to_run, script_file_name) log_file_name = script_file_name + ".sge_script.sh.log" # email_mbl = self.make_users_email() email_mbl = C.email_mbl text = ( '''#!/bin/bash #$ -cwd #$ -S /bin/bash #$ -N %s # Giving the name of the output log file #$ -o %s # Combining output/error messages into one file #$ -j y # Send mail to these users #$ -M %s #$ -t 1-%s # Now the script will iterate %s times. file_list=(%s) i=$(expr $SGE_TASK_ID - 1) # echo "i = $i" # . /etc/profile.d/modules.sh # . /xraid/bioware/bioware-loader.sh shopt -s expand_aliases # It will expand aliases that are loaded via modules . 
/xraid/bioware/Modules/etc/profile.modules module load bioware echo "%s ${file_list[$i]}" %s ${file_list[$i]} ''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line, command_line) # ''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line) ) self.open_write_close(script_file_name_full, text) return script_file_name def filter_mismatches_cluster(self, max_mismatch = 3): self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch)) command_line = C.filter_mismatch_cmd if self.utils.is_local(): command_line = C.filter_mismatch_cmd_local files_dir = self.dirs.reads_overlap_dir file_list = self.dirs.get_all_files_by_ext(files_dir, "_MERGED") script_file_name = self.create_job_array_script(command_line, files_dir, file_list) script_file_name_full = os.path.join(files_dir, script_file_name) self.utils.call_sh_script(script_file_name_full, files_dir) self.utils.print_both("self.dirs.chmod_all(%s)" % (files_dir)) self.dirs.chmod_all(files_dir) return script_file_name def filter_mismatches(self, max_mismatch = 3): self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch)) n = 0 files = self.dirs.get_all_files() for full_name in files.keys(): if files[full_name][0].endswith('_MERGED'): n +=1 # logger.debug("%s fasta file: %s" % (n, full_name)) program_name = C.filter_mismatch_cmd if self.utils.is_local(): program_name = C.filter_mismatch_cmd_local call([program_name, full_name]) def uniq_fa_cluster(self): self.utils.print_both("Uniqueing fasta files") command_line = C.fastaunique_cmd if self.utils.is_local(): command_line = C.fastaunique_cmd_local files_dir = self.dirs.reads_overlap_dir file_list = self.dirs.get_all_files_by_ext(files_dir, C.filtered_suffix) if len(file_list) == 0: file_list = self.dirs.get_all_files_by_ext(files_dir, ".fa") if len(file_list) == 0: file_list = self.dirs.get_all_files_by_ext(files_dir, 
"MERGED_V6_PRIMERS_REMOVED") script_file_name = self.create_job_array_script(command_line, files_dir, file_list) script_file_name_full = os.path.join(files_dir, script_file_name) self.call_sh_script(script_file_name_full, files_dir) self.utils.print_both("self.dirs.chmod_all(%s)" % (files_dir)) self.dirs.chmod_all(files_dir) return script_file_name def uniq_fa(self): n = 0 self.utils.print_both("Uniqueing fasta files") files = self.dirs.get_all_files() for full_name in files.keys(): # if files[full_name][1] == ".fa" or files[full_name][0].endswith('_MERGED_FILTERED'): if files[full_name][1] == ".fa" or files[full_name][0].endswith(C.filtered_suffix): n +=1 program_name = C.fastaunique_cmd if self.utils.is_local(): program_name = C.fastaunique_cmd_local call([program_name, full_name]) def get_primers(self): proximal_primer = "" distal_primer = "" primers = {} for idx_key in self.runobj.samples.keys(): primer_suite = self.runobj.samples[idx_key].primer_suite.lower() if primer_suite in C.primers_dict: proximal_primer = C.primers_dict[primer_suite]["proximal_primer"] distal_primer = C.primers_dict[primer_suite]["distal_primer"] else: self.utils.print_both("ERROR! Something wrong with the primer suite name: %s. 
NB: For v6mod it suppose to be 'Archaeal V6mod Suite'\n" % (primer_suite)) primers[idx_key] = (proximal_primer, distal_primer) return primers def create_inis(self): for idx_key in self.runobj.samples.keys(): run_key = idx_key.split('_')[1].replace("N", "."); "todo: check if works w/o NNNN when there is a proper csv" email = self.runobj.samples[idx_key].email text = """[general] project_name = %s researcher_email = %s input_directory = %s output_directory = %s [files] pair_1 = %s pair_2 = %s """ % (idx_key, email, self.out_file_path, self.results_path, idx_key + "_R1.fastq", idx_key + "_R2.fastq") "That's for parital overlap (v4v5 and hapto miseq illumina)" if not self.runobj.do_perfect: primers = self.get_primers() # logger.debug("run_key = %s, idx_key = %s, primers[idx_key][0], primers[idx_key][1] = %s" (run_key, idx_key, primers[idx_key][0], primers[idx_key][1])) text += """ # following section is optional [prefixes] pair_1_prefix = ^""" + run_key + primers[idx_key][0] + "\npair_2_prefix = ^" + primers[idx_key][1] ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini") self.open_write_close(ini_file_name, text) # TODO: use from utils def open_write_close(self, script_file_name, text): ini_file = open(script_file_name, "w") ini_file.write(text) ini_file.close() def get_fastq_file_names(self, f_input_file_path): in_files_r1 = [] in_files_r2 = [] "TODO: exclude dir with new created files from the loop" for dirname, dirnames, filenames in os.walk(f_input_file_path): correct_file_names = self.get_correct_file_names(filenames) for filename in sorted(list(correct_file_names)): if filename.find('_R1_') > 0: in_files_r1.append(os.path.join(dirname, filename)) elif filename.find('_R2_') > 0: in_files_r2.append(os.path.join(dirname, filename)) else: sys.stderr.write("No read number in the file name: %s\n" % filename) self.utils.print_both("FFF0: in_files_r1 %s\n, in_files_r2 %s" % (in_files_r1, in_files_r2)) return (in_files_r1, in_files_r2) def 
get_correct_file_names(self, filenames): correct_file_names = []; for file1 in filenames: index_sequence = self.get_index(file1) # self.runobj.run_keys # good_run_key_lane_names = [x for x in self.runobj.run_keys if x.startswith(index_sequence)] if len(good_run_key_lane_names) > 0: correct_file_names.append(file1) return set(correct_file_names) def get_run_key(self, e_sequence, has_ns = "True"): if has_ns: return ("NNNN" + e_sequence[4:9]) else: return e_sequence[0:5] def get_ini_run_key(self, index_sequence, e): has_ns = any("NNNN" in s for s in self.runobj.run_keys) lane_number = e.lane_number if self.platform == "nextseq": lane_number = "1" return index_sequence + "_" + self.get_run_key(e.sequence, has_ns) + "_" + lane_number def read1(self, files_r1, compressed): """ loop through the fastq_file_names 1) e.pair_no = 1, find run_key -> dataset name 2) collect the relevant part of id """ for file_r1 in files_r1: self.utils.print_both("====\nFFF1: file %s" % file_r1) f_input = fq.FastQSource(file_r1, compressed) index_sequence = self.get_index(file_r1) while f_input.next(trim_to = C.trimming_length): # while f_input.next(trim_to = C.trimming_length[self.platform]): e = f_input.entry # todo: a fork with or without NNNN, add an argument # ini_run_key = index_sequence + "_" + "NNNN" + e.sequence[4:9] + "_" + e.lane_number # lane_number = e.lane_number # if self.platform == "nextseq": # lane_number = "1" # ini_run_key = index_sequence + "_" + self.get_run_key(e.sequence, has_ns) + "_" + lane_number ini_run_key = self.get_ini_run_key(index_sequence, e) if int(e.pair_no) == 1: dataset_file_name_base_r1 = ini_run_key + "_R1" if (dataset_file_name_base_r1 in self.out_files.keys()): self.out_files[dataset_file_name_base_r1].store_entry(e) "TODO: make a method:" short_id1 = e.header_line.split()[0] short_id2 = ":".join(e.header_line.split()[1].split(":")[1:]) id2 = short_id1 + " 2:" + short_id2 self.id_dataset_idx[id2] = ini_run_key else: 
self.out_files["unknown"].store_entry(e) # def truncate_seq(self, seq): # return seq[:C.trimming_length] def remove_end_ns_strip(self, e_sequence): if e_sequence.endswith('N'): return e_sequence.rstrip('N') else: return e_sequence def read2(self, files_r2, compressed): "3) e.pair_no = 2, find id from 2), assign dataset_name" for file_r2 in files_r2: self.utils.print_both("FFF2: file %s" % file_r2) f_input = fq.FastQSource(file_r2, compressed) while f_input.next(trim_to = C.trimming_length): e = f_input.entry if (int(e.pair_no) == 2) and (e.header_line in self.id_dataset_idx): file_name = self.id_dataset_idx[e.header_line] + "_R2" self.out_files[file_name].store_entry(e) else: self.out_files["unknown"].store_entry(e) def get_index(self, file_r1): file_name_parts = os.path.basename(file_r1).split("_") # if the file name starts with "IDX, then actual idx will be next. index = file_name_parts[0] if file_name_parts[0].startswith("IDX"): index = file_name_parts[1] return index
class Chimera:
    """
    Chimera checking of uniqued Illumina fasta files with usearch (UCHIME).

    The class rewrites "frequency:N" abundance tags in fasta headers to the
    ";size=N" form, builds and launches usearch commands (de novo or
    against a reference db) and converts the tags back in the result files.
    """

    def __init__(self, runobj = None):
        """
        Collect suffixes, directories and usearch settings from the run object.

        runobj -- run object; the attributes read here are run_keys, run,
                  use_cluster (optional), lane_name (optional),
                  vamps_user_upload, platform and, for VAMPS user uploads,
                  site, user and trim_status_file_name.
        """
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.run_keys = self.runobj.run_keys
        self.rundate = self.runobj.run
        # Run objects without "use_cluster" default to running on the cluster.
        self.use_cluster = getattr(self.runobj, "use_cluster", True)

        self.chg_suffix = ".chg"
        self.chimeras_suffix = ".chimeras"
        self.ref_suffix = ".db"
        self.denovo_suffix = ".txt"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix  # ".nonchimeric.fa"
        self.chimeric_suffix = ".chimeric.fa"
        self.base_suffix = "unique" + self.chimeras_suffix

        # A missing or empty lane name is treated as ''.
        lane_name = getattr(self.runobj, "lane_name", '') or ''

        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)

        if self.runobj.vamps_user_upload:
            # Close the status file explicitly instead of leaking the handle.
            with open(self.runobj.trim_status_file_name, "r") as status_file:
                trim_status = json.loads(status_file.read())
            self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
            self.indir = self.dirs.check_dir(self.dirs.trimming_dir)
        else:
            self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

        self.usearch_cmd = C.usearch6_cmd
        self.refdb = C.chimera_checking_refdb_6
        self.its_refdb = C.chimera_checking_its_refdb_6
        self.input_file_names = self.make_chimera_input_illumina_file_names()

    def make_chimera_input_illumina_file_names(self):
        """Return {run key: "<key>_<filtered suffix>.unique"} for files present in indir."""
        input_file_names = {}
        for idx_key in self.run_keys:
            file_name = idx_key + "_" + C.filtered_suffix + ".unique"
            if os.path.exists(os.path.join(self.indir, file_name)):
                input_file_names[idx_key] = file_name
        return input_file_names

    def get_current_dirname(self, in_or_out = ""):
        """Return indir when in_or_out is "" and outdir for any other value."""
        if in_or_out == "":
            return self.indir
        return self.outdir

    def is_chimera_check_file(self, filename):
        """Tell whether filename ends with one of the chimera result suffixes."""
        return filename.endswith((self.chimeras_suffix + self.denovo_suffix,
                                  self.chimeras_suffix + self.ref_suffix,
                                  self.chimeric_suffix,
                                  self.nonchimeric_suffix))

    def get_current_filenames(self, cur_dirname):
        """Return the input file names for indir, the result file names for outdir, else []."""
        cur_file_names = []
        if cur_dirname == self.indir:
            cur_file_names = self.input_file_names.values()
        elif cur_dirname == self.outdir:
            cur_file_names = self.get_chimera_file_names(self.outdir)
        return cur_file_names

    def get_chimera_file_names(self, cur_dirname):
        """
        Return the chimera result file names found directly in cur_dirname.

        Only the top level is inspected: callers join the returned names onto
        cur_dirname, so names from sub-directories would give wrong paths.
        Returns [] when the directory does not exist.
        """
        # The first item yielded by os.walk describes cur_dirname itself.
        for _dirname, _dirnames, filenames in os.walk(cur_dirname):
            return [filename for filename in filenames if self.is_chimera_check_file(filename)]
        return []

    def read_file(self, source_name):
        """Return all lines of source_name as a list."""
        with open(source_name, "r") as sources:
            return sources.readlines()

    def illumina_sed(self, lines, target_name, regex, replace, uppercase):
        """
        Write lines to target_name, editing fasta headers and sequences.

        Header lines (starting with ">") get regex replaced by `replace`;
        all other lines are upper-cased when `uppercase` is true and copied
        unchanged otherwise.
        """
        with open(target_name, "w") as target:
            for line in lines:
                if line.startswith(">"):
                    line1 = regex.sub(replace, line)
                elif uppercase:
                    line1 = line.upper()
                else:
                    line1 = line
                target.write(line1)

    def call_illumina_sed(self, from_to):
        """
        Convert abundance tags in fasta headers in one of two directions.

        from_to -- "from_frequency_to_size": read each input file in indir and
                   write "<file>.chg" with ";size=" tags and upper-cased
                   sequences; exits the program if there are no input files.
                   "from_size_to_frequency": rewrite each result file in
                   outdir in place with "frequency:" tags.

        Raises ValueError for any other from_to value.
        """
        if from_to not in ("from_frequency_to_size", "from_size_to_frequency"):
            raise ValueError("from_to should be \"from_frequency_to_size\" or \"from_size_to_frequency\", got %r" % (from_to,))

        sed_from_to = namedtuple('sed_from_to', 'find, replace, cur_dirname, cur_file_names, change_from_suffix, change_to_suffix, uppercase')
        if from_to == "from_frequency_to_size":
            params = sed_from_to(
                find = "frequency:",
                replace = ";size=",
                cur_dirname = self.indir,
                cur_file_names = self.get_current_filenames(self.indir),
                change_from_suffix = "",
                change_to_suffix = self.chg_suffix,
                uppercase = True
            )
        else:
            params = sed_from_to(
                find = ";size=",
                replace = "frequency:",
                cur_dirname = self.outdir,
                cur_file_names = self.get_chimera_file_names(self.outdir),
                change_from_suffix = "",
                change_to_suffix = "",
                uppercase = False
            )

        regex = re.compile(r"%s" % params.find)

        if (not params.cur_file_names) and (from_to == "from_frequency_to_size"):
            self.utils.print_both('ERROR: Did not find uniqued files (".unique") in %s, please check if the previous step has finished. Exiting.\n' % self.indir)
            sys.exit()

        for cur_file_name in params.cur_file_names:
            file_name = os.path.join(params.cur_dirname, cur_file_name)
            source_name = file_name + params.change_from_suffix
            target_name = file_name + params.change_to_suffix
            # Read everything first: source and target can be the same file.
            lines = self.read_file(source_name)
            self.illumina_sed(lines, target_name, regex, params.replace, params.uppercase)

    def illumina_freq_to_size_in_chg(self):
        """
        Write "<input file>.chg" copies with ";size=" tags and upper-cased sequences.

        TODO: not used? Same conversion as
        call_illumina_sed("from_frequency_to_size"), without the
        exit-on-missing-input check.
        """
        regex1 = re.compile(r"%s" % "frequency:")
        cur_dirname = self.get_current_dirname()
        for cur_file_name in self.get_current_filenames(cur_dirname):
            file_name = os.path.join(cur_dirname, cur_file_name)
            lines = self.read_file(file_name)
            self.illumina_sed(lines, file_name + self.chg_suffix, regex1, ";size=", True)

    def illumina_size_to_freq_in_chimer(self):
        """Replace ";size=" with "frequency:" on every line of each result file in outdir, in place."""
        regex1 = re.compile(r"%s" % ";size=")
        for file_chim in self.get_chimera_file_names(self.outdir):
            file_chim_path = os.path.join(self.outdir, file_chim)
            # The file is read completely before it is reopened for writing.
            lines = self.read_file(file_chim_path)
            with open(file_chim_path, "w") as target:
                for line in lines:
                    target.write(regex1.sub("frequency:", line))

    def illumina_rm_size_files(self):
        """Delete the temporary "<input file>.chg" files from indir, if present."""
        for idx_key in self.input_file_names:
            file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix)
            if os.path.exists(file_name):
                os.remove(file_name)

    def check_if_cluster_is_done(self, time_before):
        """
        Return True when qstat lists no line containing time_before and "usearch".

        Any failure to run or parse the qstat pipeline is reported and
        re-raised.
        """
        cluster_done = False
        check_qstat_cmd_line = "qstat | grep \"%s\" | grep usearch | wc -l" % time_before
        self.utils.print_both("check_qstat_cmd_line = %s" % check_qstat_cmd_line)
        try:
            p = subprocess.Popen(check_qstat_cmd_line, stdout=subprocess.PIPE, shell=True)
            (output, err) = p.communicate()
            num_proc = int(output)
            self.utils.print_both("qstat is running %s 'usearch' processes" % num_proc)
            if (num_proc == 0):
                cluster_done = True
        except Exception:
            self.utils.print_both("Chimera checking can be done only on a cluster.")
            raise
        return cluster_done

    def create_chimera_cmd(self, input_file_name, output_file_name, ref_or_novo, ref_db = ""):
        """
        Build the usearch UCHIME command line for one input file.

        http://www.drive5.com/usearch/manual/uchime_denovo.html
        Input is estimated amplicons with integer abundances specified
        using ";size=N".

        input_file_name  -- fasta file to check
        output_file_name -- base name; ".chimeras.txt" (denovo) or
                            ".chimeras.db" (ref) is appended for -uchimeout,
                            and ".chimeric.fa" on top of that for -chimeras
        ref_or_novo      -- "denovo" or "ref"; any other value is reported via
                            print_both and the command is still assembled
                            without a uchime mode option
        ref_db           -- reference database, used in "ref" mode only

        Returns the command as a single string.
        """
        uchime_cmd_append = ""
        db_cmd_append = ""
        dir_cmd_append = ""

        if (ref_or_novo == "denovo"):
            uchime_cmd_append = " -uchime_denovo "
            output_file_name = output_file_name + self.chimeras_suffix + self.denovo_suffix
        elif (ref_or_novo == "ref"):
            uchime_cmd_append = " -uchime_ref "
            output_file_name = output_file_name + self.chimeras_suffix + self.ref_suffix
            db_cmd_append = " -db " + ref_db
            dir_cmd_append = " -strand plus"
        else:
            self.utils.print_both("Incorrect method, should be \"denovo\" or \"ref\"")
        self.utils.print_both("output_file_name = %s" % output_file_name)

        uchime_cmd = C.clusterize_cmd
        uchime_cmd += " "
        uchime_cmd += self.usearch_cmd
        uchime_cmd += uchime_cmd_append + input_file_name
        uchime_cmd += db_cmd_append
        uchime_cmd += " -uchimeout " + output_file_name
        # If separate nonchimeric files are needed for denovo and db, add
        # " -nonchimeras " + output_file_name + self.nonchimeric_suffix here.
        uchime_cmd += " -chimeras " + (output_file_name + self.chimeric_suffix)
        uchime_cmd += dir_cmd_append
        uchime_cmd += " -notrunclabels"
        return uchime_cmd

    def get_ref_db(self, dna_region):
        """Return the ITS reference db for region "ITS" (any case), else the standard one."""
        if dna_region.upper() == 'ITS':
            logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
            return self.its_refdb
        logger.debug("using standard refdb: " + self.refdb)
        return self.refdb

    def chimera_checking(self, ref_or_novo):
        """
        Launch one usearch command per input file whose region needs checking.

        ref_or_novo -- "denovo" or "ref", passed on to create_chimera_cmd.

        Returns ('NOREGION', 'No regions found that need checking', '') when
        no sample region is in C.regions_to_chimera_check, otherwise the
        string "The usearch commands were created". An OSError from starting
        a command is reported and re-raised.
        """
        chimera_region_found = False
        # Popen handles per run key; they are started but not waited on here.
        output = {}

        for idx_key in self.input_file_names:
            input_file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix)
            output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key])
            dna_region = self.runobj.samples[idx_key].dna_region

            if dna_region not in C.regions_to_chimera_check:
                logger.debug('region not checked: ' + dna_region)
                continue
            chimera_region_found = True

            ref_db = self.get_ref_db(dna_region)
            uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db)
            self.utils.print_both("\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd))

            try:
                logger.info("chimera checking command: " + str(uchime_cmd))
                output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            except OSError as e:
                self.utils.print_both("Problems with this command: %s" % (uchime_cmd))
                is_local = self.utils.is_local()
                sys.stderr.write("Execution of %s failed: %s\n" % (uchime_cmd, e))
                if not is_local:
                    self.utils.print_both("Execution of %s failed: %s" % (uchime_cmd, e))
                raise

        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')
        return ("The usearch commands were created")
class dbUpload:
    """
    db upload methods

    Uploads sequences and per-run sequence counts from uniqued fasta files
    and reads gast result files.

    NOTE(review): every query is built by string interpolation. The values
    come from pipeline files and config, so this is left as is; switch to
    parameterized queries if MyConnection supports them.
    """
    Name = "dbUpload"
    # TODO: add tests and test case
    # TODO: change hardcoded values to args:
    #       self.sequence_table_name = "sequence_ill",
    #       self.sequence_field_name = "sequence_comp"
    # TODO: generalize all bulk uploads and all inserts? to not copy and paste
    # TODO: add refssu_id
    # TODO: change csv validaton for new fields
    # Order:
    #     put_run_info
    #     insert_seq()
    #     insert_pdr_info()
    #     gast
    #     insert_taxonomy()
    #     insert_sequence_uniq_info_ill()

    def __init__(self, runobj = None):
        """
        Set up directories, the database connection and file suffixes.

        runobj -- run object; the attributes read here are run,
                  vamps_user_upload, platform, lane_name, database_host,
                  database_name and, for VAMPS user uploads, site and user.
        """
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.rundate = self.runobj.run
        self.use_cluster = 1
        self.unique_fasta_files = []

        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        if self.runobj.lane_name:
            lane_name = self.runobj.lane_name
        else:
            lane_name = ''

        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)

        self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
        self.fasta_dir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)

        # NOTE(review): read from the run object but not used; the
        # connection below is hard-coded. Confirm which one is intended.
        host_name = runobj.database_host
        database_name = runobj.database_name

        self.filenames = []
        self.my_conn = MyConnection(host = 'newbpcdb2.jbpc-np.mbl.edu', db="env454")
        self.sequence_table_name = "sequence_ill"
        self.sequence_field_name = "sequence_comp"
        self.my_csv = None

        # Start each upload with a fresh counts file.
        self.unique_file_counts = self.dirs.unique_file_counts
        self.dirs.delete_file(self.unique_file_counts)
        self.seq_id_dict = {}
        self.tax_id_dict = {}
        self.run_id = None
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix  # ".nonchimeric.fa"
        self.fa_unique_suffix = ".fa." + C.unique_suffix  # .fa.unique
        self.v6_unique_suffix = "MERGED_V6_PRIMERS_REMOVED." + C.unique_suffix
        self.suff_list = [self.nonchimeric_suffix, self.fa_unique_suffix, self.v6_unique_suffix]
        self.suffix_used = ""

    def get_fasta_file_names(self):
        """
        Store and return the files under fasta_dir ending with a suffix from suff_list.

        The list is also returned because of how the method is called from
        the pipeline processor.
        """
        files_names = self.dirs.get_all_files(self.fasta_dir)
        suffixes = tuple(self.suff_list)
        self.unique_fasta_files = [f for f in files_names.keys() if f.endswith(suffixes)]
        return self.unique_fasta_files

    def get_run_info_ill_id(self, filename_base):
        """Return run_info_ill_id for this file prefix and run date, or None if not found."""
        my_sql = """SELECT run_info_ill_id FROM run_info_ill JOIN run using(run_id) WHERE file_prefix = '%s' and run = '%s' """ % (filename_base, self.rundate)
        res = self.my_conn.execute_fetch_select(my_sql)
        if res:
            return int(res[0][0])

    def make_seq_upper(self, filename):
        """Return all sequences of the fasta file upper-cased (for VAMPS compatibility)."""
        read_fasta = fastalib.ReadFasta(filename)
        try:
            return [seq.upper() for seq in read_fasta.sequences]
        finally:
            # Close the reader even when reading the sequences fails.
            read_fasta.close()

    def insert_seq(self, sequences):
        """
        Insert all sequences, compressed, with one INSERT IGNORE statement.

        Returns whatever my_conn.execute_no_fetch returns. An empty
        `sequences` produces "COMPRESS()" with no argument.
        """
        query_tmpl = "INSERT IGNORE INTO %s (%s) VALUES (COMPRESS(%s))"
        val_tmpl = "'%s'"
        my_sql = query_tmpl % (self.sequence_table_name, self.sequence_field_name, ')), (COMPRESS('.join([val_tmpl % key for key in sequences]))
        seq_id = self.my_conn.execute_no_fetch(my_sql)
        self.utils.print_both("sequences in file: %s\n" % (len(sequences)))
        return seq_id

    def get_seq_id_dict(self, sequences):
        """
        Add {sequence: id} for the given sequences to self.seq_id_dict.

        Any failure is re-raised; when `sequences` is empty an explanatory
        message is printed first.
        """
        id_name = self.sequence_table_name + "_id"
        query_tmpl = """SELECT %s, uncompress(%s) FROM %s WHERE %s in (COMPRESS(%s))"""
        val_tmpl = "'%s'"
        try:
            my_sql = query_tmpl % (id_name, self.sequence_field_name, self.sequence_table_name, self.sequence_field_name, '), COMPRESS('.join([val_tmpl % key for key in sequences]))
            res = self.my_conn.execute_fetch_select(my_sql)
            one_seq_id_dict = dict((y, int(x)) for x, y in res)
            self.seq_id_dict.update(one_seq_id_dict)
        except Exception:
            if len(sequences) == 0:
                self.utils.print_both(("ERROR: There are no sequences, please check if there are correct fasta files in the directory %s") % self.fasta_dir)
            raise

    def get_id(self, table_name, value):
        """Return <table_name>_id of the row whose <table_name> column equals value, or None."""
        id_name = table_name + '_id'
        my_sql = """SELECT %s FROM %s WHERE %s = '%s'""" % (id_name, table_name, table_name, value)
        res = self.my_conn.execute_fetch_select(my_sql)
        if res:
            return int(res[0][0])

    def get_sequence_id(self, seq):
        """Return sequence_ill_id of the given sequence, or None if not found."""
        my_sql = """SELECT sequence_ill_id FROM sequence_ill WHERE COMPRESS('%s') = sequence_comp""" % (seq)
        res = self.my_conn.execute_fetch_select(my_sql)
        if res:
            return int(res[0][0])

    def insert_pdr_info(self, fasta, run_info_ill_id):
        """
        Insert the count of one sequence for one run_info_ill_id.

        fasta           -- object with `seq` and `id`; the count is the text
                           after the last ":" in the last "|" field of id
        run_info_ill_id -- id to insert; a false value is reported with an
                           error message and the insert is still attempted

        Returns the result of my_conn.execute_no_fetch. Raises KeyError when
        the upper-cased sequence is not in self.seq_id_dict; a failing query
        is printed and the error re-raised.
        """
        if (not run_info_ill_id):
            self.utils.print_both("ERROR: There is no run info yet, please check if it's uploaded to env454")

        # ------- insert sequence info per run/project/dataset --------
        seq_upper = fasta.seq.upper()
        sequence_ill_id = self.seq_id_dict[seq_upper]
        seq_count = int(fasta.id.split('|')[-1].split(':')[-1])
        my_sql = """INSERT IGNORE INTO sequence_pdr_info_ill (run_info_ill_id, sequence_ill_id, seq_count) VALUES (%s, %s, %s)""" % (run_info_ill_id, sequence_ill_id, seq_count)

        try:
            return self.my_conn.execute_no_fetch(my_sql)
        except Exception:
            self.utils.print_both("Offensive query: %s" % my_sql)
            raise

    def make_gast_files_dict(self):
        """Return what dirs.get_all_files gives for gast_dir and "gast"."""
        return self.dirs.get_all_files(self.gast_dir, "gast")

    def gast_filename(self, filename):
        """
        Return the first key of the gast files dict with a value element ending in filename.

        Returns None when nothing matches.
        """
        # todo: if filename in make_gast_files_dict, use it full path
        gast_file_names = self.make_gast_files_dict()
        for gast_file_name_path, tpls in gast_file_names.items():
            if any(t.endswith(filename) for t in tpls):
                return gast_file_name_path

    def get_gast_result(self, filename):
        """
        Parse the gast file for `filename` into {first column: [other columns]}.

        Lines are split on tabs; the last field keeps its line terminator.
        Returns None when no gast file is known for `filename` or the file
        does not exist. Any other I/O error is re-raised.
        """
        gast_file_name = self.gast_filename(filename)
        self.utils.print_both("current gast_file_name = %s." % gast_file_name)

        try:
            with open(gast_file_name) as fd:
                gast_dict = {}
                for line in fd:
                    fields = line.split("\t")
                    gast_dict[fields[0]] = fields[1:]
                return gast_dict
        except IOError as e:
            logger.debug("errno = %s" % e.errno)
            # Suppress only "No such file or directory" (errno 2).
            if e.errno != 2:
                raise
        except TypeError:
            # gast_filename returned None: there is no gast file to open.
            self.utils.print_both("Check if there is a gast file under %s for %s." % (self.gast_dir, filename))
def put_seq_statistics_in_file(self, filename, seq_in_file):
    """
    Record the sequence count of one file in the unique-counts file.

    Passes self.unique_file_counts, filename and seq_in_file to
    write_seq_frequencies_in_file of a new PipelneUtils instance.
    """
    PipelneUtils().write_seq_frequencies_in_file(
        self.unique_file_counts, filename, seq_in_file)
class Chimera:
    """
    Chimera checking of Illumina reads with usearch (UCHIME).

    The class rewrites the "frequency:" abundance tag of the input fasta
    headers into the ";size=" form (and back again in the result files),
    builds the usearch command lines and starts them, prefixed with
    C.clusterize_cmd.
    """

    def __init__(self, runobj = None):
        """
        Collect suffixes, directories and reference databases for a run.

        runobj must provide run_keys, run, vamps_user_upload, platform and
        (for vamps uploads) site, user and trim_status_file_name.
        """
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.run_keys = self.runobj.run_keys
        self.rundate = self.runobj.run

        self.chg_suffix = ".chg"
        self.chimeras_suffix = ".chimeras"
        self.ref_suffix = ".db"
        self.denovo_suffix = ".txt"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix  # ".nonchimeric.fa"
        self.chimeric_suffix = ".chimeric.fa"
        self.base_suffix = "unique" + self.chimeras_suffix

        # A missing or empty lane_name is treated as "no lane".
        lane_name = getattr(self.runobj, "lane_name", "") or ''

        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run

        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix,
                         self.runobj.platform, lane_name = lane_name, site = site)

        if self.runobj.vamps_user_upload:
            # Close the status file explicitly instead of leaking the handle.
            with open(self.runobj.trim_status_file_name, "r") as status_file:
                trim_status = json.load(status_file)
            self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
            self.indir = self.dirs.check_dir(self.dirs.trimming_dir)
        else:
            self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

        self.usearch_cmd = C.usearch6_cmd
        self.refdb = C.chimera_checking_refdb_6
        self.its_refdb = C.chimera_checking_its_refdb_6
        self.input_file_names = self.make_chimera_input_illumina_file_names()

    def make_chimera_input_illumina_file_names(self):
        """
        Map each run key to its "<key>_<filtered suffix>.unique" file name.

        Only files that exist in self.indir are included.
        """
        input_file_names = {}
        for idx_key in self.run_keys:
            file_name = idx_key + "_" + C.filtered_suffix + ".unique"
            if os.path.exists(os.path.join(self.indir, file_name)):
                input_file_names[idx_key] = file_name
        return input_file_names

    def get_current_dirname(self, in_or_out = ""):
        """Return self.indir for the default "" argument, self.outdir otherwise."""
        if in_or_out == "":
            return self.indir
        return self.outdir

    def is_chimera_check_file(self, filename):
        """Tell whether filename ends with one of the chimera result suffixes."""
        return filename.endswith((self.chimeras_suffix + self.denovo_suffix,
                                  self.chimeras_suffix + self.ref_suffix,
                                  self.chimeric_suffix,
                                  self.nonchimeric_suffix))

    def get_current_filenames(self, cur_dirname):
        """
        Return the file names this class works on in cur_dirname.

        For self.indir these are the known input files, for self.outdir the
        chimera result files; any other directory gives an empty list.
        """
        if cur_dirname == self.indir:
            return self.input_file_names.values()
        if cur_dirname == self.outdir:
            return self.get_chimera_file_names(self.outdir)
        return []

    def get_chimera_file_names(self, cur_dirname):
        """
        Return the chimera result file names found directly in cur_dirname.

        Only the top level of the directory is inspected: callers join the
        returned names with cur_dirname itself, so names from sub-directories
        would produce wrong paths. (The previous loop kept only the list of
        the last directory os.walk visited.) A missing directory gives [].
        """
        for _dirname, _dirnames, filenames in os.walk(cur_dirname):
            return [filename for filename in filenames
                    if self.is_chimera_check_file(filename)]
        return []

    def read_file(self, source_name):
        """Return all lines of source_name as a list."""
        with open(source_name, "r") as sources:
            return sources.readlines()

    def illumina_sed(self, lines, target_name, regex, replace, uppercase):
        """
        Write lines to target_name, rewriting fasta header lines.

        Lines starting with ">" get regex replaced by replace; all other
        lines are upper-cased when uppercase is true and copied otherwise.
        """
        with open(target_name, "w") as target:
            for line in lines:
                if line.startswith(">"):
                    target.write(regex.sub(replace, line))
                elif uppercase:
                    target.write(line.upper())
                else:
                    target.write(line)

    def call_illumina_sed(self, from_to):
        """
        Convert abundance tags in fasta headers.

        from_to = "from_frequency_to_size": input files in self.indir are
            copied to "<name>.chg" with "frequency:" replaced by ";size="
            and sequences upper-cased.
        from_to = "from_size_to_frequency": chimera result files in
            self.outdir are rewritten in place with ";size=" replaced by
            "frequency:".

        Raises ValueError for any other from_to value.
        """
        if from_to == "from_frequency_to_size":
            find, replace = "frequency:", ";size="
            cur_dirname = self.indir
            cur_file_names = self.get_current_filenames(self.indir)
            change_to_suffix = self.chg_suffix
            uppercase = True
        elif from_to == "from_size_to_frequency":
            find, replace = ";size=", "frequency:"
            cur_dirname = self.outdir
            cur_file_names = self.get_chimera_file_names(self.outdir)
            change_to_suffix = ""
            uppercase = False
        else:
            raise ValueError(
                "from_to should be \"from_frequency_to_size\" or "
                "\"from_size_to_frequency\", got %r" % (from_to,))

        regex = re.compile(find)
        for cur_file_name in cur_file_names:
            source_name = os.path.join(cur_dirname, cur_file_name)
            target_name = source_name + change_to_suffix
            # Read everything first: source and target may be the same file.
            lines = self.read_file(source_name)
            self.illumina_sed(lines, target_name, regex, replace, uppercase)

    def illumina_freq_to_size_in_chg(self):
        """
        Create the "<input>.chg" files with ";size=" headers.

        Same conversion as call_illumina_sed("from_frequency_to_size"); kept
        as a separate entry point for existing callers.
        """
        self.call_illumina_sed("from_frequency_to_size")

    def illumina_size_to_freq_in_chimer(self):
        """
        Replace ";size=" with "frequency:" in every line of the result files.

        Unlike call_illumina_sed this touches all lines, not only the ones
        starting with ">". The files in self.outdir are rewritten in place.
        """
        for file_chim in self.get_chimera_file_names(self.outdir):
            file_chim_path = os.path.join(self.outdir, file_chim)
            lines = self.read_file(file_chim_path)
            with open(file_chim_path, "w") as target:
                for line in lines:
                    target.write(line.replace(";size=", "frequency:"))

    def illumina_rm_size_files(self):
        """Remove the temporary "<input>.chg" files from self.indir, if present."""
        for idx_key in self.input_file_names:
            file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix)
            if os.path.exists(file_name):
                os.remove(file_name)

    def check_if_cluster_is_done(self, time_before):
        """
        Return True when qstat lists no usearch job matching time_before.

        Any failure (e.g. qstat is not available, so the count cannot be
        parsed) is reported on stdout and re-raised.
        """
        check_qstat_cmd_line = "qstat | grep \"%s\" | grep usearch | wc -l" % time_before
        print("check_qstat_cmd_line = %s" % check_qstat_cmd_line)
        try:
            p = subprocess.Popen(check_qstat_cmd_line, stdout=subprocess.PIPE, shell=True)
            (output, err) = p.communicate()
            num_proc = int(output)
            print("qstat is running %s 'usearch' processes" % num_proc)
        except Exception:
            print("Chimera checking can be done only on a cluster.")
            raise
        return num_proc == 0

    def create_chimera_cmd(self, input_file_name, output_file_name, ref_or_novo, ref_db = ""):
        """
        Build the usearch command line for one input file.

        http://www.drive5.com/usearch/manual/uchime_denovo.html
        Input is estimated amplicons with integer abundances specified
        using ";size=N".

        ref_or_novo = "denovo": "-uchime_denovo", results go to
            "<output_file_name>.chimeras.txt".
        ref_or_novo = "ref": "-uchime_ref" with "-db <ref_db>" and
            "-strand plus", results go to "<output_file_name>.chimeras.db".

        Returns the command as one string.
        Raises ValueError for any other ref_or_novo; previously a malformed
        command was built and returned in that case.
        """
        db_cmd_append = ""
        dir_cmd_append = ""
        if ref_or_novo == "denovo":
            uchime_cmd_append = " -uchime_denovo "
            output_file_name = output_file_name + self.chimeras_suffix + self.denovo_suffix
        elif ref_or_novo == "ref":
            uchime_cmd_append = " -uchime_ref "
            output_file_name = output_file_name + self.chimeras_suffix + self.ref_suffix
            db_cmd_append = " -db " + ref_db
            dir_cmd_append = " -strand plus"
        else:
            raise ValueError("Incorrect method, should be \"denovo\" or \"ref\"")

        print("output_file_name = %s" % output_file_name)

        # If separate nonchimeric files for denovo and db are ever needed,
        # add " -nonchimeras " + output_file_name + self.nonchimeric_suffix.
        uchime_cmd = "".join([
            C.clusterize_cmd,
            " ",
            self.usearch_cmd,
            uchime_cmd_append + input_file_name,
            db_cmd_append,
            " -uchimeout " + output_file_name,
            " -chimeras " + (output_file_name + self.chimeric_suffix),
            dir_cmd_append,
            " -notrunclabels",
        ])
        return uchime_cmd

    def get_ref_db(self, dna_region):
        """Return the ITS reference db for an 'ITS' region, the standard one otherwise."""
        if dna_region.upper() == 'ITS':
            logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
            return self.its_refdb
        logger.debug("using standard refdb: " + self.refdb)
        return self.refdb

    def chimera_checking(self, ref_or_novo):
        """
        Start one usearch command per input file whose region needs checking.

        Samples whose dna_region is not in C.regions_to_chimera_check are
        skipped. Returns the tuple ('NOREGION', message, '') when no sample
        qualified, otherwise a plain confirmation string.

        An OSError while starting a command is reported on stderr; it is
        re-raised unless self.utils.is_local() is true.
        """
        chimera_region_found = False
        # Started processes, kept by run key. Their pipes are never read here.
        output = {}

        for idx_key in self.input_file_names:
            input_file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix)
            output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key])
            dna_region = self.runobj.samples[idx_key].dna_region

            if dna_region not in C.regions_to_chimera_check:
                logger.debug('region not checked: ' + dna_region)
                continue
            chimera_region_found = True

            ref_db = self.get_ref_db(dna_region)
            uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db)
            print("\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd))

            try:
                logger.info("chimera checking command: " + str(uchime_cmd))
                output[idx_key] = subprocess.Popen(uchime_cmd, shell=True,
                                                   stdout=subprocess.PIPE,
                                                   stderr=subprocess.PIPE)
            except OSError as e:
                print("Problems with this command: %s" % (uchime_cmd))
                sys.stderr.write("Execution of %s failed: %s\n" % (uchime_cmd, e))
                if not self.utils.is_local():
                    raise

        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')
        return ("The usearch commands were created")
class MetadataUtils:
    """
    Class to read metadata files (csv and ini style),
    validate them and create a dictionary from them.
    Two parts:
    1) From pipeline-ui.py to validate the input args
    2) From runconfig.py to write the final ini file and create the dictionary
       that is used to create the run object
    """
    Name = "MetadataUtils"

    def __init__(self, command_line_args = None, configuration_dictionary = None):
        """Store the command line args / config dictionary and the constants from C."""
        self.args = command_line_args
        self.general_config_dict = configuration_dictionary
        self.known_header_list = C.csv_header_list
        self.pipeline_run_items = C.pipeline_run_items
        self.primer_suites = self.convert_primer_suites(C.primer_suites)
        self.dna_regions = C.dna_regions
        self.data_object = {'general': {}}
        self.warn_msg = ("\n\tThe config File seems to be okay. If the items above look correct "
                         "then press 'c' to continue the pipeline\n")
        self.res_headers = []
        self.env = {}
        self.utils = PipelneUtils()

    def convert_and_save_ini(self, analysis_dir):
        """
        Write "<run>.ini" into analysis_dir and point configPath to it.

        For the 'vamps' platform the general config is saved as is; for any
        other platform the csv file is converted to ini.
        """
        new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini')
        if self.general_config_dict['platform'] == 'vamps':
            self.save_ini_file(new_ini_file)
        else:
            self.convert_csv_to_ini(new_ini_file)
        # change path and type to new ini regardless of what they were before
        self.general_config_dict['configPath'] = new_ini_file

    def validate(self, analysis_dir):
        """
        Run the validation that belongs to the configured platform.

        Returns self.data_object. Exits for an unknown platform.
        """
        platform = self.general_config_dict['platform']
        if platform in C.illumina_list:
            self.warn_msg = self.validate_illumina_ini(analysis_dir)
        elif platform == '454':
            self.validate_454_ini(analysis_dir)
        elif platform == 'ion_torrent':
            pass
        elif platform == 'vamps':
            self.validate_vamps_ini(analysis_dir)
        else:
            sys.exit("Unknown platform and configFile type for validation")
        return self.data_object

    def get_general_data(self):
        """Return the 'general' section of the data object."""
        return self.data_object['general']

    def validate_vamps_ini(self, analysis_dir):
        """
        Load the ini file named by configPath and check the fasta file path.

        Exits when a 'fasta_file' entry (top level or in 'general') names a
        path that does not exist.
        """
        # configPath is the new configPath
        self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath'])
        if 'fasta_file' in self.data_object and not os.path.exists(self.data_object['fasta_file']):
            sys.exit("Fasta file path doesn't exist: "+self.data_object['fasta_file'] )
        elif 'fasta_file' in self.data_object['general'] and not os.path.exists(self.data_object['general']['fasta_file']):
            sys.exit("Fasta file path doesn't exist: "+self.data_object['general']['fasta_file'] )

    def validate_454_ini(self, analysis_dir):
        """Placeholder: 454 ini validation is not implemented."""
        print("TODO - write validation def for 454/ini")

    def validate_illumina_ini(self, analysis_dir):
        """
        Validate the ini file of an Illumina run (the csv headers are checked earlier).

        Loads the ini named by configPath into self.data_object, runs all
        check_* methods and inspects the input directory.
        Exits when any error was found; otherwise returns a warning message
        ('' when there were no warnings).
        """
        print("Validating ini type Config File (may have been converted from csv)")
        new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini')
        print("New ini file location: "+new_ini_file)
        msg = ''
        error = False
        warn = False

        # configPath here is the new configPath
        self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath'])

        checks = (self.check_for_missing_values,
                  self.check_for_datasets,
                  self.check_domain_suite_region,
                  self.check_project_name,
                  self.check_dataset_name,
                  self.check_projects_and_datasets)
        for check in checks:
            (error_code, warn_code) = check(self.data_object)
            if error_code:
                error = True
            if warn_code:
                warn = True

        general = self.data_object['general']
        # input_dir may be absent while input_files is present; treat that
        # as "not a directory" instead of failing with a KeyError.
        input_dir = general.get('input_dir', '')
        if 'input_dir' not in general and 'input_files' not in general:
            logger.warning("No input directory and no input files")
            warn = True
        elif not os.path.isdir(input_dir):
            logger.error("That is not a directory: "+input_dir)
            error = True
        elif general['input_file_format'] == 'fastq' and general['platform'] in C.illumina_list:
            file_exists = any(
                os.path.isfile(os.path.join(dirname, file_name))
                for dirname, _dirnames, filenames in os.walk(input_dir)
                for file_name in filenames)
            if not file_exists:
                logger.error("There are no files found in the input directory: "+input_dir)
                error = True
        elif input_dir and ('input_files' not in general or not general['input_files']):
            logger.error("There are no files found in the input directory: "+input_dir)
            error = True

        if error:
            sys.exit("\n\t\033[91mTHERE WERE SEVERE PROBLEMS WITH THE CSV and/or CONFIG FILE - EXITING "
                     "PLEASE CORRECT THEM AND START OVER.\033[0m\n "
                     "To view the errors add ' --loglevel info' to the command line.\n")
        elif warn:
            msg = ("\n\t\033[93mTHERE WERE NON-FATAL PROBLEMS WITH THE CSV and/or CONFIG FILE "
                   "THAT MAY OR MAY NOT CAUSE PROBLEMS.\033[0m\n "
                   "To view the warnings add ' --loglevel warning' to the command line.\n")
            print("\033[92mCSV File Passed Vaidation! (with warnings)\033[0m")
        else:
            print("\033[92mCSV File Passed Vaidation!\033[0m")
        return msg

    def validate_dictionary(self, config_info):
        """
        This is only used for data that comes in as a dictionary rather than a file
        such as with vamps user uploads. No validation is done yet; the
        dictionary is returned unchanged.
        """
        print("TODO - Validating input dictionary")
        return config_info

    def populate_data_object_454(self, args):
        """
        Build the 'general' dictionary of a 454 run from the parsed args.

        Reads input_dir, output_dir, platform, run, input_file_format and
        input_file_suffix from args.
        """
        # The former lookup of self.runobj / args.configPath only filled an
        # unused local, and self.runobj is not set by __init__, so it was dropped.
        general = {}
        general['input_dir'] = args.input_dir
        general['output_dir'] = args.output_dir
        general['platform'] = args.platform
        general['run'] = args.run
        general["input_file_format"] = args.input_file_format
        general["input_file_suffix"] = args.input_file_suffix
        return general

    def get_input_files(self):
        """
        List the files in input_dir and in its direct sub-directories.

        Top-level files are returned by base name, files one level down as
        "<sub_dir>/<file>". Deeper levels are ignored. Returns [] when
        input_dir is not a directory.
        """
        files_list = []
        input_dir = self.general_config_dict['input_dir']
        if not os.path.isdir(input_dir):
            return files_list
        for infile in glob.glob(os.path.join(input_dir, '*')):
            if not os.path.isdir(infile):
                files_list.append(os.path.basename(infile))
                continue
            sub_dir = os.path.basename(infile)
            for infile2 in glob.glob(os.path.join(infile, '*')):
                if not os.path.isdir(infile2):
                    files_list.append(os.path.join(sub_dir, os.path.basename(infile2)))
        return files_list

    def check_for_input_files(self, data_object):
        """
        Record the input files that carry the configured suffix.

        Stores 'files_list' and 'file_count' in data_object['general'] and
        returns data_object. A missing directory or an empty match is only
        logged.
        """
        general = data_object['general']
        input_dir = general['input_dir']
        files_list = []
        if os.path.isdir(input_dir):
            pattern = os.path.join(input_dir, '*'+general['input_file_suffix'])
            files_list = [os.path.basename(infile) for infile in glob.glob(pattern)]
        else:
            logger.info("No input directory or directory permissions problem: "+input_dir)
            print("No input directory or directory permissions problem: "+input_dir)

        file_count = len(files_list)
        if not file_count:
            logger.info("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+general['input_file_suffix']+"'")

        general['files_list'] = files_list
        general['file_count'] = file_count
        return data_object

    def check_for_missing_values(self, data):
        """
        Look for empty keys and values in all sections.

        In 'general' an empty key or value is a warning. In the other
        sections an empty value is an error, except for 'barcode' and
        'adaptor', which may be empty. A boolean False counts as a value.
        Returns (error, warn).
        """
        error = False
        warn = False
        for item in data:
            in_general = (item == 'general')
            for k, v in data[item].items():
                if not k:
                    logger.warning("(key: "+item+") key for: '"+str(v)+"' is missing or corrupt - Continuing")
                    warn = True
                # Only None and '' are "missing": 'False' from the ini file is
                # converted to the boolean False and is a legitimate value.
                if v is not None and v != '':
                    continue
                if in_general:
                    if v == '':
                        logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                        warn = True
                elif k == 'barcode' or k == 'adaptor':
                    # these could be empty
                    logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                else:
                    logger.error("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                    error = True
        return (error, warn)

    def check_for_datasets(self, data):
        """Flag every non-general section without a dataset name. Returns (error, warn)."""
        error = False
        warn = False
        for item in data:
            if item != 'general' and not data[item].get('dataset'):
                logger.error("Current dataset name is missing or corrupt - Exiting (key: "+item+")")
                error = True
        return (error, warn)

    def check_domain_suite_region(self, data):
        """
        Check that "Primer Suite" and "DNA Region" are known and match.

        Both values are normalized with convert_primer_suites first; the
        region must also be a substring of the suite. Returns (error, warn).
        """
        error = False
        warn = False
        for item in data:
            if item == 'general':
                continue
            primer_suite = self.convert_primer_suites(data[item]['primer_suite'])
            dna_region = self.convert_primer_suites(data[item]['dna_region'])
            if primer_suite not in self.primer_suites:
                logger.error("Primer Suite not found: "+primer_suite+" - Exiting (key: "+item+")")
                error = True
            if dna_region not in self.dna_regions:
                logger.error("DNA Region not found: "+dna_region+" - Exiting (key: "+item+")")
                error = True
            if dna_region not in primer_suite:
                logger.error("DNA Region ("+dna_region+") not found in Primer Suite ("+primer_suite+") - Exiting (key: "+item+")")
                error = True
        return (error, warn)

    def convert_primer_suites(self, suite):
        """
        Lower-case suite and remove '_', ' ' and '-' from it.

        A list or tuple gives a list of converted items; anything else is
        treated as a single string.
        """
        import re
        if isinstance(suite, (list, tuple)):
            return [re.sub(r'[_ -]', '', item.lower()) for item in suite]
        return re.sub(r'[_ -]', '', suite.lower())

    def check_project_name(self, data):
        """
        CHECK: project name format: 3 parts separated by '_'; the last part
        ends with a known DNA region (e.g. Bv6, Ev9, Av6).

        A malformed name is logged and counted as an error; the remaining
        sections are still checked. Returns (error, warn).
        """
        error = False
        warn = False
        for item in data:
            if item == 'general':
                continue
            project = data[item].get('project', '')
            try:
                (a, b, c) = project.split('_')
            except ValueError:
                logger.error("project not in correct format: ")
                logger.error(project)
                logger.error(" - Exiting (key: ")
                logger.error(data[item])
                error = True
                # Nothing to compare against the regions without three parts.
                continue
            if (c[1:].lower() not in self.dna_regions) and (c.lower() not in self.dna_regions):
                logger.error("Project suffix has incorrect DNA region: ")
                logger.error(c)
                logger.error(" - Exiting (key: ")
                logger.error(data[item])
                error = True
        return (error, warn)

    def check_dataset_name(self, data):
        """
        CHECK: dataset name can be ONLY alphanumeric and underscore.
        (A leading digit is currently accepted.) Returns (error, warn).
        """
        error = False
        warn = False
        for item in data:
            if item == 'general':
                continue
            dataset_name = data[item]['dataset']
            if not re.match("^[A-Za-z0-9_]*$", dataset_name):
                logger.error("Dataset name has illeagal character(s): "+dataset_name+" (must be alphanumeric and underscore only)")
                error = True
        return (error, warn)

    def get_my_conn(self):
        """
        Open self.my_conn to the configured database.

        On a local machine (self.utils.is_local()) localhost / test_env454
        is used instead. Raises KeyError when database_host or
        database_name is not configured.
        """
        host = self.general_config_dict['database_host']
        db = self.general_config_dict['database_name']
        if self.utils.is_local():
            host = 'localhost'
            db = "test_env454"
        self.my_conn = MyConnection(host = host, db = db)

    def check_projects_and_datasets(self, data):
        """
        Warn about projects and datasets that already exist in the database.

        For each project at most three existing datasets are reported.
        Returns (error, warn); error is never set here.
        """
        self.get_my_conn()
        projects = {}
        datasets = {}
        error = False
        warn = False
        for item in data:
            if item != 'general':
                datasets[data[item]['dataset']] = data[item]['project']
                projects[data[item]['project']] = 1

        # NOTE(review): the queries below are built by string formatting from
        # metadata values. Switch to parameterized queries if MyConnection
        # supports them - its API is not visible from here.
        for p in projects:
            my_sql = """SELECT project FROM project WHERE project = '%s'""" % (p)
            res = self.my_conn.execute_fetch_select(my_sql)
            if res:
                logger.warning("project '"+p+"' already exists in the database - is this okay?")
                warn = True
            else:
                logger.debug("project '"+p+"' is new")

            ds_found_count = 0
            for d in datasets:
                if datasets[d] != p:
                    continue
                my_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (d)
                res = self.my_conn.execute_fetch_select(my_sql)
                if res:
                    ds_found_count += 1
                    if ds_found_count >3:
                        logger.warning("\t\tPossibly more .... - Exiting after just three")
                        break
                    logger.warning("\tdataset '"+d+"' already exists in the database - is this okay?")
                    warn = True
                else:
                    logger.debug("\tdataset '"+d+"' is new")
            logger.debug("\tDataset Count: "+str(len(datasets)))
        return (error, warn)

    def get_confirmation(self, steps, general_data):
        """
        Print the general data and the steps, then confirm with 'c'.

        String values longer than 80 characters are shortened to their first
        and last comma separated item. Exits when 'validate' is one of the
        steps. The interactive question is disabled, so 'c' is always
        returned.
        """
        print("\n")
        for item, value in general_data.items():
            # Only strings can be split; numbers, booleans and None are printed as is.
            if hasattr(value, 'split') and len(value) > 80:
                tmp = value.split(',')
                print("%-20s = %s .. %s" % (item,tmp[0],tmp[-1]))
            else:
                print("%-20s = %-20s" % (item,value))
        print("\nStep(s) to be performed: \033[1;36m",steps,'\033[0m')
        print("\n"+self.warn_msg+"\n")
        if 'validate' in steps.split(','):
            sys.exit()
        return 'c'

    def convert_csv_to_ini(self, new_ini_file):
        """
        Convert the csv file named by csvPath into the ini file new_ini_file.

        Writes a [general] section from self.general_config_dict followed by
        one section per csv row. An 'env_sample_source' value is replaced by
        its id from self.env. Returns new_ini_file.
        """
        from pipeline.get_ini import readCSV
        print('CSV path', self.general_config_dict['csvPath'])
        my_csv = readCSV(file_path = self.general_config_dict['csvPath'])

        content = my_csv.read_csv()
        headers = content[1].keys()
        headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers]

        # get list of keys
        keys_list = []
        if self.check_headers(headers_clean):
            logger.info("CSV headers okay")
            for k, values in content.items():
                keys_list.append(values['barcode_index']+"_"+values['run_key']+"_"+values['lane'])

        config = self.general_config_dict
        is_illumina = config['platform'] in C.illumina_list

        # The with block closes the file even when a lookup below fails.
        with open(new_ini_file, 'w') as fh:
            # general section
            fh.write("#\n#\tCreated by MBL Pipeline for run: "+config['run']+" on "+config['date']+"\n#\n\n")
            fh.write("[general]\n")
            fh.write("run = "+config['run']+"\n")
            fh.write("configPath = "+new_ini_file+"\n")
            fh.write("configPath_orig = " + config['configPath']+"\n")
            fh.write("platform = " + config['platform']+"\n")
            fh.write("output_dir = " + os.path.dirname(new_ini_file)+"\n")
            if is_illumina:
                fh.write("input_file_format = " + config['input_file_format']+"\n")
                fh.write("anchor_file = " + config['anchor_file']+"\n")
                fh.write("primer_file = " + config['primer_file']+"\n")
                fh.write("compressed = " + str(config['compressed'])+"\n")
                fh.write("do_perfect = " + str(config['do_perfect'])+"\n")
                fh.write("lane_name = " + str(config['lane_name'])+"\n")
                fh.write("database_host = " + config['database_host']+"\n")
                fh.write("database_name = " + config['database_name']+"\n")
            fh.write("input_dir = " + config['input_dir']+"\n")
            fh.write("require_distal = " + str(config['require_distal'])+"\n")
            fh.write("use_cluster = " + str(config['use_cluster'])+"\n")
            fh.write("date = " + str(datetime.date.today())+"\n")
            fh.write("site = " + config['site']+"\n")
            fh.write("load_vamps_database = " + str(config['load_vamps_database'])+"\n")
            fh.write("idx_keys = " +','.join(keys_list)+"\n")
            if 'input_dir' in config and config['input_dir'] != '':
                file_list = self.get_input_files()
                fh.write("input_files = " + ','.join(file_list)+"\n")
            else:
                fh.write("input_files = \n")

            for k, values in content.items():
                fh.write("\n")
                if is_illumina:
                    fh.write("["+values['barcode_index']+"_"+values['run_key']+"_"+values['lane']+"]\n")
                elif config['platform'] == '454':
                    fh.write("["+values['lane']+"_"+values['run_key']+"]\n")

                for v in values:
                    if v != "env_sample_source":
                        fh.write(v+" = "+values[v]+"\n")
                        continue
                    try:
                        new_val = [str(j[0]) for j in self.env if j[1] == values[v]][0]
                    except Exception:
                        text = ("There was an error in env_sample_source. Please check your metadata. "
                                "Possible values: ----------- air extreme habitat host associated "
                                "human associated human-amniotic-fluid human-blood human-gut human-oral "
                                "human-skin human-urine human-vaginal indoor microbial mat/biofilm "
                                "miscellaneous_natural_or_artificial_environment plant associated "
                                "sediment soil/sand unknown wastewater/sludge water-freshwater "
                                "water-marine ----------- ")
                        print(text)
                        raise
                    fh.write("env_sample_source_id = "+new_val+"\n")

        return new_ini_file

    def save_ini_file(self, new_ini_file):
        """
        Save self.general_config_dict as the [general] section of new_ini_file.

        configPath is switched to new_ini_file (the old value is kept as
        configPath_original). Exits when input_dir and the directory of
        fasta_file differ.
        """
        config = self.general_config_dict
        # The with block closes the file on sys.exit as well.
        with open(new_ini_file, 'w') as out_fh:
            # give it a new name
            config['configPath_original'] = config['configPath']
            config['configPath'] = new_ini_file

            out_fh.write("#\n#\tCreated by MBL Pipeline for run: "+config['run']+" on "+config['date']+"\n#\n\n")
            out_fh.write("[general]\n")
            for item in config:
                out_fh.write(item+" = "+str(config[item]) + "\n")

            if 'fasta_file' in config and config['fasta_file'] != '':
                (path, fasta) = os.path.split(config['fasta_file'])
                if 'input_dir' in config and config['input_dir'] != path:
                    sys.exit("Your input_dir and fasta_file directory don't agree - Exiting\n\t"+config['input_dir']+" != "+config['fasta_file'])
                out_fh.write("input_dir = "+path+"\n")
                out_fh.write("input_files = "+fasta+"\n")
            elif 'input_dir' in config and config['input_dir'] != '':
                file_list = self.get_input_files()
                out_fh.write("input_files = " + ','.join(file_list)+"\n")
            else:
                out_fh.write("input_files = \n")

    def check_headers(self, headers):
        """
        Compare the csv headers with the headers required for the platform.

        Returns True when they match. Otherwise prints a report of missing
        and extra headers and exits. Also exits for an unknown platform.
        """
        platform = self.general_config_dict['platform']
        if platform in C.illumina_list:
            known_header_list = self.known_header_list[platform]
        elif platform == '454':
            known_header_list = self.known_header_list['454']
        else:
            logger.error("in utils: check_headers - unknown platform")
            # Without a header list there is nothing to compare against.
            sys.exit("in utils: check_headers - unknown platform")

        self.res_headers = headers
        if "env_sample_source" in headers:
            self.env_source_to_id(headers)

        if sorted(known_header_list) == sorted(self.res_headers):
            return True

        print("=" * 40)
        print("csv file header problem")
        print("%-20s %-20s" % ("REQUIRED", "YOUR CSV"))
        for i in sorted(known_header_list):
            if i in headers:
                print("%-20s%-20s" % (i,i))
            else:
                print("%-20s%-20s" % (i,"----------- <--- missing"))
        for i in headers:
            if i not in known_header_list:
                print("%-20s%-20s" % (" ",i+" <--- extra"))
        print("=" * 40)
        sys.exit("ERROR : unknown or missing headers\n")

    def env_source_to_id(self, headers):
        """
        Load the env_sample_source table into self.env.

        Also stores headers in self.res_headers with "env_sample_source"
        renamed to "env_sample_source_id".
        """
        is_local = self.utils.is_local()
        # Diagnostic output only, so it is logged at debug level.
        logger.debug("self.utils.is_local() LLL2 metadata")
        logger.debug(is_local)
        if is_local:
            self.my_conn = MyConnection(host = 'localhost', db="test_env454")
        else:
            self.my_conn = MyConnection(host='bpcdb1', db="env454")
        my_sql = """SELECT * FROM env_sample_source"""
        self.env = self.my_conn.execute_fetch_select(my_sql)
        self.res_headers = ["env_sample_source_id" if x=="env_sample_source" else x for x in headers]

    def configDictionaryFromFile_ini(self, config_file_path):
        """
        Read an ini file into a {section: {option: value}} dictionary.

        The values 'True'/'true' and 'False'/'false' become booleans, all
        other values stay strings.
        """
        import configparser
        configDict = {}
        user_config = configparser.ConfigParser()
        user_config.read(config_file_path)

        for section in user_config.sections():
            section_dict = configDict[section] = {}
            for option in user_config.options(section):
                value = user_config.get(section, option)
                if value in ('True', 'true'):
                    value = True
                elif value in ('False', 'false'):
                    value = False
                section_dict[option] = value
        return configDict

    def get_values(self, args, general_config_dict = None):
        """
        Collect the run items for args.platform.

        Each item starts with its default from self.pipeline_run_items, is
        replaced by the command line value when that is not None, else by a
        non-empty value from the platform section of general_config_dict.
        All items of the 'general' section are copied on top.
        """
        if general_config_dict is None:
            general_config_dict = {}
        collector = {}
        defaults = self.pipeline_run_items[args.platform]

        for item in defaults:
            # set collector[item] to the default first
            collector[item] = defaults[item]
            # now look for args (then ini) values to replace
            if item in args and getattr(args, item) is not None:
                collector[item] = getattr(args, item)
            elif (general_config_dict
                  and item in general_config_dict[args.platform]
                  and general_config_dict[args.platform][item] != ''):
                collector[item] = general_config_dict[args.platform][item]

        # get all the items from general_config_dict['general']
        if 'general' in general_config_dict:
            for item in general_config_dict['general']:
                collector[item] = general_config_dict['general'][item]
        return collector

    def validate_args(self):
        """
        Build the general configuration from the command line and ini file.

        # THOUGHTS
        # vamps users
        #   single project and dataset
        #   Supply an ini file OR commandline (for web interface), but no csv file
        #
        # MBL pipeline
        #   REQUIRE a csv file and a ini file

        Exits when the ini file lacks the [general] or platform section, the
        database host/name combination is unknown, an Illumina run has no
        csv file or the platform is unknown. Returns the collected
        dictionary.
        """
        if self.args.configPath:
            general_config_dict = self.configDictionaryFromFile_ini(self.args.configPath)
            if self.args.platform in general_config_dict and 'general' in general_config_dict:
                collector = self.get_values(self.args, general_config_dict)
            else:
                sys.exit("The ini file needs both a [general] and ["+ self.args.platform +"] section - Exiting.")
        else:
            # no configPath
            collector = self.get_values(self.args)

        collector['current_db_host_name'] = self.utils.find_in_nested_dict(
            C.db_cnf, {'host': collector['database_host'], 'db': collector['database_name']})
        if not collector['current_db_host_name']:
            sys.exit("""Please check -db_host and -db_name parameters, the current combination does not exist: 'db_host' = %s, 'db_name' = %s """ % (collector['database_host'], collector['database_name']))

        if self.args.platform in C.illumina_list:
            print("Starting Illumina Pipeline")
            if not self.args.csvPath:
                sys.exit("illumina requires a csv file - Exiting")
        elif self.args.platform == 'vamps':
            print("Starting VAMPS Pipeline:")
            # Capitalize an existing project name; the test used to be
            # inverted and failed with a KeyError when no project was given.
            if collector.get('project'):
                collector['project'] = collector['project'][:1].capitalize() + collector['project'][1:]
            else:
                logger.debug("No project found in vamps pipeline")
            if self.args.fasta_file:
                collector['project'] = self.args.fasta_file
                collector['from_fasta'] = True
        elif self.args.platform == '454':
            print("Starting 454 Pipeline")
        elif self.args.platform == 'ion_torrent':
            print("Starting Ion Torrent Pipeline")
        else:
            sys.exit("Validate args: Unknown Platform")

        if self.args.configPath:
            collector['configPath'] = self.args.configPath
        else:
            collector['configPath'] = ""

        # these are all the bool items in the collector
        # they need to be converted from str to bool here
        for i in collector:
            if collector[i] == 'True' or collector[i] == 'true':
                collector[i] = True
            elif collector[i] == 'False' or collector[i] == 'false':
                collector[i] = False

        collector['run'] = self.args.run
        collector['platform'] = self.args.platform
        if self.args.input_dir:
            collector['input_dir'] = self.args.input_dir
        collector['date'] = str(datetime.date.today())
        return collector
class Chimera:
    """Chimera checking of uniqued reads with vsearch/usearch (uchime).

    The Illumina path rewrites "frequency:" headers to ";size=" headers
    (``*.chg`` files), generates a shell script that runs ``-uchime_denovo``
    and ``-uchime_ref`` for every input file, and afterwards removes the
    reads reported as chimeric.  The 454/VAMPS methods at the end of the
    class are marked in the original source as "not tested".
    """

    def __init__(self, runobj=None):
        """Set up suffixes, directories and commands for one run.

        :param runobj: run object; ``run_keys``, ``run``, ``platform`` and
            ``vamps_user_upload`` are read here, ``use_cluster`` and
            ``lane_name`` are optional.
        """
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.run_keys = self.runobj.run_keys
        self.rundate = self.runobj.run
        # Optional attributes: default instead of a bare ``except:``.
        self.use_cluster = getattr(self.runobj, "use_cluster", True)

        self.chg_suffix = ".chg"
        self.chimeras_suffix = ".chimeras"
        self.ref_suffix = ".db"
        self.denovo_suffix = ".txt"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix  # ".nonchimeric.fa"
        self.chimeric_suffix = ".chimeric.fa"
        self.base_suffix = "unique" + self.chimeras_suffix
        self.cluster_slots = {
            "grendel": [12, 8],
            "cricket": [40],
            "cluster5": [32]
        }

        lane_name = getattr(self.runobj, "lane_name", "") or ""

        if self.runobj.vamps_user_upload:
            os.environ['SGE_ROOT'] = '/opt/sge'
            os.environ['SGE_CELL'] = 'grendel'
            path = os.environ['PATH']
            os.environ['PATH'] = '/opt/sge/bin/lx24-amd64:' + path
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix,
                             self.runobj.platform, lane_name=lane_name,
                             site=site)
            # Close the status file instead of leaking the handle.
            with open(self.runobj.trim_status_file_name, "r") as status_file:
                trim_status = json.loads(status_file.read())
            self.idx_keys = convert_unicode_dictionary_to_str(
                trim_status)["new_lane_keys"]
            self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
            self.indir = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
            self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)
        else:
            site = ''
            dir_prefix = self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix,
                             self.runobj.platform, lane_name=lane_name,
                             site=site)
            self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

        is_local = self.utils.is_local()
        self.usearch_cmd = C.usearch6_cmd
        if is_local:
            self.usearch_cmd = C.usearch6_cmd_local
        self.refdb = C.chimera_checking_refdb
        if is_local:
            self.refdb_local = C.chimera_checking_refdb_local
        self.its_refdb = C.chimera_checking_its_refdb
        self.input_file_names = self.make_chimera_input_illumina_file_names()

    def get_ref_db(self, dna_region):
        """Return the reference database path to use for ``dna_region``.

        ITS regions use the ITS database; everything else uses the standard
        database (its local variant when running locally).
        """
        if dna_region.upper() == 'ITS':
            ref_db = C.chimera_checking_its_refdb
            logger.debug("got an ITS dna region so using refdb: " + ref_db)
        else:
            ref_db = C.chimera_checking_refdb
            if self.utils.is_local():
                ref_db = C.chimera_checking_refdb_local
            logger.debug("using standard refdb: " + ref_db)
        return ref_db

    def make_chimera_input_illumina_file_names(self):
        """Map each run key to its uniqued file name, if that file exists
        in ``self.indir``."""
        input_file_names = {}
        for idx_key in self.run_keys:
            file_name = idx_key + "_" + C.filtered_suffix + ".unique"
            if os.path.exists(os.path.join(self.indir, file_name)):
                input_file_names[idx_key] = file_name
        return input_file_names

    def get_current_dirname(self, in_or_out=""):
        """Return ``self.indir`` for an empty argument, else ``self.outdir``."""
        if in_or_out == "":
            return self.indir
        return self.outdir

    def is_chimera_check_file(self, filename):
        """Tell whether ``filename`` ends with one of the chimera result
        suffixes (denovo/ref report, chimeric or nonchimeric fasta)."""
        return filename.endswith(
            (self.chimeras_suffix + self.denovo_suffix,
             self.chimeras_suffix + self.ref_suffix,
             self.chimeric_suffix,
             self.nonchimeric_suffix))

    def get_current_filenames(self, cur_dirname):
        """Return the file names belonging to ``cur_dirname``: the input
        file names for ``self.indir``, the chimera result files for
        ``self.outdir``, an empty list for anything else."""
        cur_file_names = []
        if cur_dirname == self.indir:
            cur_file_names = self.input_file_names.values()
        elif cur_dirname == self.outdir:
            cur_file_names = self.get_chimera_file_names(self.outdir)
        return cur_file_names

    def get_chimera_file_names(self, cur_dirname):
        """Return the chimera result file names directly in ``cur_dirname``.

        Only the top level is inspected: callers join the names with
        ``cur_dirname``.  The former loop over ``os.walk`` overwrote its
        result for every visited directory, so any subdirectory made it
        return the files of the last subdirectory instead.
        A missing directory yields an empty list.
        """
        _, _, filenames = next(os.walk(cur_dirname), (cur_dirname, [], []))
        return [
            filename for filename in filenames
            if self.is_chimera_check_file(filename)
        ]

    def read_file(self, source_name):
        """Return all lines of ``source_name`` (line endings kept)."""
        with open(source_name, "r") as sources:
            return sources.readlines()

    def illumina_sed(self, lines, target_name, regex, replace, uppercase):
        """Write ``lines`` to ``target_name``, rewriting fasta headers.

        :param lines: iterable of text lines.
        :param target_name: path of the file to (over)write.
        :param regex: compiled pattern substituted in header (">") lines.
        :param replace: replacement text for ``regex``.
        :param uppercase: when true, sequence lines are upper-cased.
        """
        with open(target_name, "w") as target:
            for line in lines:
                if line.startswith(">"):
                    line1 = regex.sub(replace, line)
                elif uppercase:
                    line1 = line.upper()
                else:
                    line1 = line
                target.write(line1)

    def call_illumina_sed(self, from_to):
        """Rewrite abundance annotations in fasta headers.

        :param from_to: "from_frequency_to_size" (input files, written to
            ``*.chg`` copies with upper-cased sequences) or
            "from_size_to_frequency" (chimera result files, rewritten in
            place).
        :raises ValueError: for any other ``from_to`` value.
        :raises SystemExit: when converting to size and no uniqued input
            files were found.
        """
        sed_from_to = namedtuple(
            'sed_from_to',
            'find, replace, cur_dirname, cur_file_names, change_from_suffix, change_to_suffix, uppercase'
        )

        # Build only the requested configuration.
        if from_to == "from_frequency_to_size":
            tuple_name = sed_from_to(
                find="frequency:",
                replace=";size=",
                cur_dirname=self.indir,
                cur_file_names=self.get_current_filenames(self.indir),
                change_from_suffix="",
                change_to_suffix=self.chg_suffix,
                uppercase=True)
        elif from_to == "from_size_to_frequency":
            tuple_name = sed_from_to(
                find=";size=",
                replace="frequency:",
                cur_dirname=self.outdir,
                cur_file_names=self.get_chimera_file_names(self.outdir),
                change_from_suffix="",
                change_to_suffix="",
                uppercase=False)
        else:
            raise ValueError(
                "from_to should be 'from_frequency_to_size' or "
                "'from_size_to_frequency', got %r" % (from_to,))

        regex = re.compile(r"%s" % tuple_name.find)

        if (not tuple_name.cur_file_names) and (from_to == "from_frequency_to_size"):
            self.utils.print_both(
                'ERROR: Did not find uniqued files ("%s") in %s, please check if the previous step has finished. Exiting.\n'
                % (C.filtered_suffix + ".unique", self.indir))
            sys.exit()

        for cur_file_name in tuple_name.cur_file_names:
            file_name = os.path.join(tuple_name.cur_dirname, cur_file_name)
            source_name = file_name + tuple_name.change_from_suffix
            target_name = file_name + tuple_name.change_to_suffix
            # Read everything first: source and target may be the same file.
            lines = self.read_file(source_name)
            self.illumina_sed(lines, target_name, regex, tuple_name.replace,
                              tuple_name.uppercase)

    def illumina_freq_to_size_in_chg(self):
        """Write a ``*.chg`` copy of every input file with "frequency:"
        replaced by ";size=" in headers and upper-cased sequence lines."""
        regex1 = re.compile(r"%s" % "frequency:")
        replace1 = ";size="

        cur_dirname = self.get_current_dirname()
        cur_file_names = self.get_current_filenames(cur_dirname)
        change_from_suffix = ""
        change_to_suffix = self.chg_suffix

        for cur_file_name in cur_file_names:
            file_name = os.path.join(cur_dirname, cur_file_name)
            lines = self.utils.read_file(file_name + change_from_suffix)
            with open(file_name + change_to_suffix, "w") as target:
                for line in lines:
                    if line.startswith(">"):
                        line1 = regex1.sub(replace1, line)
                    else:
                        line1 = line.upper()
                    target.write(line1)

    def illumina_size_to_freq_in_chimer(self):
        """Replace ";size=" with "frequency:" in place in every chimera
        result file of ``self.outdir``."""
        regex1 = re.compile(r"%s" % ";size=")
        replace1 = "frequency:"

        for file_chim in self.get_chimera_file_names(self.outdir):
            file_chim_path = os.path.join(self.outdir, file_chim)
            lines = self.utils.read_file(file_chim_path)
            with open(file_chim_path, "w") as target:
                for line in lines:
                    target.write(regex1.sub(replace1, line))

    def illumina_rm_size_files(self):
        """Placeholder for removing the ``*.chg`` files.

        The removal itself (``os.remove``) is disabled in the original
        source, so this currently only checks for the files.
        """
        for idx_key in self.input_file_names:
            file_name = os.path.join(
                self.indir, self.input_file_names[idx_key] + self.chg_suffix)
            if os.path.exists(file_name):
                pass  # os.remove(file_name)

    def check_if_chimera_dir_empty(self):
        """Exit with an error message when ``self.outdir`` has no entries."""
        if not os.listdir(self.outdir):
            self.utils.print_both(
                'ERROR: Did not find files in %s, something is wrong. First check if you ran the command on a cluster. Exiting.\n'
                % self.outdir)
            sys.exit()

    def check_if_cluster_is_done(self, time_before):
        """Return True when qstat lists no matching chimera jobs.

        :param time_before: text grepped for in the qstat output together
            with "chimera_ch".
        :raises Exception: whatever running or parsing qstat raised, after
            printing a hint that a cluster is required.
        """
        cluster_done = False
        check_qstat_cmd_line = "qstat | grep \"%s\" | grep chimera_ch | wc -l" % time_before
        self.utils.print_both("check_qstat_cmd_line = %s" % check_qstat_cmd_line)

        try:
            p = subprocess.Popen(check_qstat_cmd_line,
                                 stdout=subprocess.PIPE,
                                 shell=True)
            (output, err) = p.communicate()
            num_proc = int(output)
            self.utils.print_both("qstat is running %s 'vsearch' processes" % num_proc)
            if num_proc == 0:
                cluster_done = True
        except Exception:
            self.utils.print_both(
                "Chimera checking can be done only on a cluster.")
            raise

        return cluster_done

    def create_chimera_cmd(self, ref_db):
        """Build the de novo and the reference uchime command templates.

        The commands contain the shell variable ``$filename_base`` which the
        generated script sets per input file.  Examples of the expanded
        commands:

        /usr/local/bin/vsearch
        -uchime_denovo /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg
        -uchimeout /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt
        -chimeras /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt.chimeric.fa
        -notrunclabels
        ---
        /usr/local/bin/vsearch
        -uchime_ref /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg
        -uchimeout /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db
        -chimeras /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db.chimeric.fa
        -notrunclabels
        -strand plus
        -db /groups/g454/blastdbs/rRNA16S.gold.fasta

        :param ref_db: reference database path for the ``-uchime_ref`` run.
        :returns: list of two command strings, de novo first, reference
            second.
        """
        command_line = []

        ref_or_novo_options = {
            self.denovo_suffix: "-uchime_denovo",
            self.ref_suffix: "-uchime_ref"
        }
        for suff, opt in ref_or_novo_options.items():
            input_file_name = self.indir + "/$filename_base" + self.chg_suffix
            output_file_name = self.outdir + "/$filename_base" + self.chimeras_suffix + suff

            ref_add = ""
            if opt == "-uchime_ref":
                ref_add = "-strand plus -db %s" % ref_db

            uchime_cmd = """%s %s %s -uchimeout %s -chimeras %s%s -notrunclabels %s """ % (
                self.usearch_cmd, opt, input_file_name, output_file_name,
                output_file_name, self.chimeric_suffix, ref_add)
            logger.debug("UUU = uchime_cmd = %s" % uchime_cmd)
            logger.debug("+++")
            command_line.append(uchime_cmd)

        return command_line

    def create_chimera_cmd_old(self, input_file_name, output_file_name,
                               ref_or_novo, ref_db=""):
        """Build one complete (clusterized unless local) uchime command.

        http://www.drive5.com/usearch/manual/uchime_denovo.html
        from usearch -help
        Chimera detection (UCHIME ref. db. mode):
          usearch -uchime q.fasta [-db db.fasta] [-chimeras ch.fasta]
            [-nonchimeras good.fasta] [-uchimeout results.uch] [-uchimealns results.alns]

        Chimera detection (UCHIME de novo mode):
          usearch -uchime amplicons.fasta [-chimeras ch.fasta] [-nonchimeras good.fasta]
             [-uchimeout results.uch] [-uchimealns results.alns]
          Input is estimated amplicons with integer abundances specified using ";size=N".
        usearch -uchime_denovo amplicons.fasta -uchimeout results.uchime

        :param input_file_name: fasta file to check.
        :param output_file_name: base name of the result files.
        :param ref_or_novo: "denovo" or "ref"; any other value is reported
            and a command without a uchime mode is still returned.
        :param ref_db: reference database, used for "ref" only.
        :returns: the command string.
        """
        uchime_cmd_append = ""
        db_cmd_append = ""
        dir_cmd_append = ""

        if ref_or_novo == "denovo":
            uchime_cmd_append = " -uchime_denovo "
            output_file_name = output_file_name + self.chimeras_suffix + self.denovo_suffix
        elif ref_or_novo == "ref":
            uchime_cmd_append = " -uchime_ref "
            output_file_name = output_file_name + self.chimeras_suffix + self.ref_suffix
            db_cmd_append = " -db " + ref_db
            dir_cmd_append = " -strand plus"
        else:
            self.utils.print_both(
                "Error: Incorrect method, should be \"denovo\" or \"ref\"")
        self.utils.print_both("output_file_name = %s" % output_file_name)

        uchime_cmd = C.clusterize_cmd
        if self.utils.is_local():
            uchime_cmd = ""
        uchime_cmd += " "
        uchime_cmd += self.usearch_cmd
        logger.debug("self.usearch_cmd FROM create_chimera_cmd = %s" % (uchime_cmd))
        uchime_cmd += uchime_cmd_append + input_file_name
        logger.debug("uchime_cmd_append FROM create_chimera_cmd = %s" % (uchime_cmd_append))
        uchime_cmd += db_cmd_append
        logger.debug("db_cmd_append FROM create_chimera_cmd = %s" % (db_cmd_append))
        uchime_cmd += " -uchimeout " + output_file_name
        # if we need nonchimeric for denovo and db separate we might create
        # them here
        uchime_cmd += " -nonchimeras "
        uchime_cmd += (output_file_name + self.nonchimeric_suffix)
        uchime_cmd += " -chimeras " + (output_file_name + self.chimeric_suffix)
        uchime_cmd += dir_cmd_append
        uchime_cmd += " -notrunclabels"

        logger.debug("uchime_cmd FROM create_chimera_cmd = %s" % (uchime_cmd))
        return uchime_cmd

    def get_sge_cluster_name(self):
        """Return the cluster name parsed from ``qstat -F`` output.

        The first whitespace-separated token containing "hostname" is used,
        e.g. "qf:hostname=grendel-01.bpcservers.private" gives "grendel".
        Returns None when no such token exists.
        """
        result = subprocess.run(['qstat', '-F'], stdout=subprocess.PIPE)
        for token in result.stdout.decode('utf-8').split():
            if token.find("hostname") != -1:
                return token.split("=")[1].split("-")[0]
        return None

    def get_sge_slot_number(self):
        """Return the largest "qc:slots" value reported by
        ``qstat -F slots``.

        doesn't work on cricket because: qc:slots=12 and qc:slots=8
        """
        result = subprocess.run(['qstat', '-F', 'slots'],
                                stdout=subprocess.PIPE)
        slots = [
            int(token.split("=")[-1])
            for token in result.stdout.decode('utf-8').split()
            if token.startswith('qc:slots')
        ]
        return max(set(slots))

    # TODO: temp! take from util. change illumina-files to use util, too
    # create_job_array_script(self, command_line, dir_to_run, files_list, runobj)
    # feb 25 2019 removed, because didn't work on grendel:
    # Use the allslots pe and all available slots on that cluster
    # #$ -pe allslots %s
    def create_job_array_script(self, script_file_name_base, command_line,
                                dir_to_run, files_list):
        """Write an SGE job-array script running both uchime commands.

        :param script_file_name_base: prefix of the script file name.
        :param command_line: two command templates (see
            ``create_chimera_cmd``).
        :param dir_to_run: directory the script is written to.
        :param files_list: input files; one array task per file.
        :returns: the script file name (without directory).
        """
        sge_cluster_name = self.get_sge_cluster_name()
        sge_slot_number = self.cluster_slots[sge_cluster_name][0]
        logger.debug("sge_slot_number FROM create_job_array_script = %s" % (sge_slot_number))

        files_string = " ".join(files_list)
        files_list_size = len(files_list)
        script_file_name = script_file_name_base + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        log_file_name = script_file_name + ".sge_script.sh.log"
        email_mbl = C.email_mbl
        # "%%" renders a literal "%" for the bash suffix removal.  A stray
        # "-" in front of the "Use the allslots" comment line was dropped:
        # bash would have tried to execute "-#" as a command.
        text = (
            '''#!/bin/bash

#$ -cwd
#$ -S /bin/bash
#$ -N %s
# Giving the name of the output log file
#$ -o %s
# Combining output/error messages into one file
#$ -j y
# Send mail to these users
#$ -M %s
# Send mail at job end (e); -m as sends abort, suspend.
#$ -m as
# max_running_tasks
#$ -tc 15
# Use the allslots pe and all available slots on that cluster
#$ -pe allslots %s
#$ -t 1-%s
# Now the script will iterate %s times.

  file_list=(%s)

  i=$(expr $SGE_TASK_ID - 1)
  echo "i = $i"

  . /bioware/root/Modules/etc/profile.modules
  module load bioware
  module load vsearch

  INFILE=${file_list[$i]}
  filename=$(basename $INFILE)
  echo "INFILE = $INFILE"
  filename_base="${filename%%.*}"
  echo "filename_base = $filename_base"

  echo "%s"
  echo "%s"
  %s
  %s
''' % (script_file_name, log_file_name, email_mbl, sge_slot_number,
            files_list_size, files_list_size, files_string, command_line[0],
            command_line[1], command_line[0], command_line[1])
        )
        self.utils.open_write_close(script_file_name_full, text)
        return script_file_name

    def create_not_SGE_script(self, script_file_name_base, command_line,
                              dir_to_run, files_list):
        """Write a plain bash script looping over all files.

        Same parameters and return value as ``create_job_array_script``.
        """
        files_string = " ".join(files_list)
        script_file_name = script_file_name_base + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        # The bash expansion must be written "%%.*" here: an unescaped
        # "%.*}" is parsed by the %-formatting below as a conversion
        # specifier and made this method fail on every call.
        text = (
            '''#!/bin/bash

  file_list=(%s)

  . /bioware/root/Modules/etc/profile.modules
  module load bioware
  module load vsearch

  n=0
  for INFILE in "${file_list[@]}"
  do
    n=$[n + 1]
    echo $n
    echo "INFILE = $INFILE"
    filename=$(basename $INFILE)
    filename_base="${filename%%.*}"
    echo "filename_base = $filename_base"

    echo "%s"
    echo "%s"
    %s
    %s
  done
''' % (files_string, command_line[0], command_line[1], command_line[0],
            command_line[1])
        )
        self.utils.open_write_close(script_file_name_full, text)
        return script_file_name

    def chimera_checking(self):
        """Create and run the chimera checking script for all ``*.chg``
        files in ``self.indir``.

        :returns: ``('NOREGION', message, '')`` when the run's dna region is
            not in ``C.regions_to_chimera_check``, otherwise the plain
            string "The vsearch commands were created".
        """
        chimera_region_found = False

        file_list = self.dirs.get_all_files_by_ext(self.indir, self.chg_suffix)
        logger.debug("FFF = file_list = %s" % (file_list))

        # TODO: method
        dna_region = list(
            set([
                self.runobj.samples[idx_key].dna_region
                for idx_key in self.input_file_names
            ]))[0]
        if dna_region in C.regions_to_chimera_check:
            chimera_region_found = True
        else:
            logger.debug('region not checked: ' + dna_region)

        ref_db = self.get_ref_db(dna_region)
        command_line = self.create_chimera_cmd(ref_db)
        sh_script_file_name = self.create_job_array_script(
            "chimera_checking", command_line, self.indir, file_list)
        script_file_name_full = os.path.join(self.indir, sh_script_file_name)
        self.utils.call_sh_script(script_file_name_full, self.indir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (self.indir))
        self.dirs.chmod_all(self.indir)
        logger.debug('sh_script_file_name: ' + sh_script_file_name)
        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')
        return ("The vsearch commands were created")

    def get_chimeric_ids(self):
        """Return the set of read ids found in the chimeric fasta files
        selected by ``get_chimeras_suffix``."""
        ids = set()
        chimera_file_names = self.get_chimera_file_names(self.outdir)
        file_ratio = self.check_chimeric_stats()

        for file_name in chimera_file_names:
            if not file_name.endswith(self.chimeric_suffix):
                continue
            both_or_denovo = self.get_chimeras_suffix(file_ratio, file_name)
            # TODO: run ones for each file_base = ".".join(file_name.split(".")[0:3]) (for txt and db)
            if file_name.endswith(both_or_denovo):
                file_name_path = os.path.join(self.outdir, file_name)
                self.utils.print_both("Get ids from %s" % file_name_path)
                read_fasta = fa.ReadFasta(file_name_path)
                ids.update(set(read_fasta.ids))
        return ids

    def get_chimeras_suffix(self, file_ratio, file_name):
        """Choose which chimeric fasta files count for ``file_name``.

        Use only de-novo (.txt) chimeric if check_chimeric_stats shows a
        ratio of ref to de-novo > 3; otherwise every chimeric fasta counts.

        :param file_ratio: ``{basename: (percent_ref, ratio)}`` as returned
            by ``check_chimeric_stats``.
        :param file_name: file name whose first three dot-separated parts
            are the key into ``file_ratio``.
        :returns: the suffix a file must end with to be used.
        """
        (percent_ref, ratio) = file_ratio[".".join(file_name.split(".")[0:3])]

        if ratio > 3:
            return self.chimeras_suffix + self.denovo_suffix + self.chimeric_suffix
        return self.chimeric_suffix

    def move_out_chimeric(self):
        """Write a nonchimeric copy of every input fasta file, leaving out
        the reads whose ids are in ``get_chimeric_ids()``."""
        chimeric_ids = self.get_chimeric_ids()
        for idx_key in self.input_file_names:
            fasta_file_path = os.path.join(self.indir,
                                           self.input_file_names[idx_key])
            read_fasta = fa.ReadFasta(fasta_file_path)
            read_fasta.close()

            non_chimeric_file = fasta_file_path + self.nonchimeric_suffix
            non_chimeric_fasta = fa.FastaOutput(non_chimeric_file)

            fasta = fa.SequenceSource(fasta_file_path, lazy_init=False)
            while fasta.next():
                if fasta.id not in chimeric_ids:
                    non_chimeric_fasta.store(fasta, store_frequencies=False)
            non_chimeric_fasta.close()

    def check_chimeric_stats(self):
        """Compare reference and de novo chimera counts per result base name.

        :returns: ``{basename: (percent_ref, ratio)}``; ``(0, 0)`` when there
            are no reference chimeras or no report lines, and
            ``(percent_ref, percent_ref)`` when there are no de novo
            chimeras.
        """
        all_lines_suffix = self.denovo_suffix  # ".txt" or ".db, doesn't matter"
        chimera_ref_suffix = self.ref_suffix + self.chimeric_suffix  # ".db.chimeric.fa"
        chimera_denovo_suffix = self.denovo_suffix + self.chimeric_suffix  # ".txt.chimeric.fa"
        filenames = self.get_basenames(self.get_current_filenames(self.outdir))
        file_ratio = {}
        for file_basename in filenames:
            ratio = 0
            percent_denovo = 0

            all_lines_file_name = os.path.join(
                self.outdir, file_basename + all_lines_suffix)
            ref_lines_file_name = os.path.join(
                self.outdir, file_basename + chimera_ref_suffix)
            denovo_lines_file_name = os.path.join(
                self.outdir, file_basename + chimera_denovo_suffix)

            all_lines = int(self.wccount(all_lines_file_name) or 0)
            ref_lines = int(self.get_fa_lines_count(ref_lines_file_name) or 0)
            denovo_lines = int(
                self.get_fa_lines_count(denovo_lines_file_name) or 0)

            if (ref_lines == 0) or (all_lines == 0):
                file_ratio[file_basename] = (0, 0)
                continue
            percent_ref = self.percent_count(all_lines, ref_lines)

            if denovo_lines == 0:
                # use ref instead of ratio, because we are actually looking
                # for a huge difference between ref and denovo
                # (ref > 15 and denovo = 0)
                file_ratio[file_basename] = (percent_ref, percent_ref)
                continue

            if denovo_lines > 0:
                ratio = self.count_ratio(ref_lines, denovo_lines)
                percent_denovo = self.percent_count(all_lines, denovo_lines)
            file_ratio[file_basename] = (percent_ref, ratio)

            if percent_ref > 15:
                self.utils.print_both("=" * 50)
                self.utils.print_both(file_basename)
                self.utils.print_both(
                    "all_lines = %s, ref_lines = %s, denovo_lines = %s" %
                    (all_lines, ref_lines, denovo_lines))
                self.utils.print_both("ratio = %s" % ratio)
                self.utils.print_both("percent_ref = %s, percent_denovo = %s" %
                                      (percent_ref, percent_denovo))
        return file_ratio

    def get_basenames(self, filenames):
        """Return the set of three-part base names (text before the third
        dot) that end with ``self.base_suffix``."""
        file_basenames = set()
        for f in filenames:
            file_basename = ".".join(f.split(".")[0:3])
            if file_basename.endswith(self.base_suffix):
                file_basenames.add(file_basename)
        return file_basenames

    def wccount(self, filename):
        """Return the first field of ``wc -l filename`` (the line count, as
        returned by ``subprocess.check_output``)."""
        return subprocess.check_output(['wc', '-l', filename]).split()[0]

    def count_ratio(self, ref_num, denovo_num):
        """Return ``ref_num / denovo_num`` as a float, or None when the
        divisor is zero or empty."""
        try:
            return float(ref_num or 0) / float(denovo_num or 0)
        except ZeroDivisionError:
            return None

    def get_fa_lines_count(self, file_name):
        """Return the number of header (">") lines in ``file_name``.

        An unreadable file is reported through ``print_both`` and counted
        as 0.
        """
        # todo: use fastalib to get cnt?
        # return fa.SequenceSource(file_name, lazy_init = False).total_seq
        try:
            # ``with`` closes the handle the former code leaked.
            with open(file_name) as file_open:
                return len([l for l in file_open.readlines() if l.startswith('>')])
        except IOError:
            e = sys.exc_info()[1]
            self.utils.print_both(e)
            return 0

    def percent_count(self, all_lines, chimeric_count):
        """Return ``chimeric_count`` as a percentage of ``all_lines``, or
        None when ``all_lines`` is zero or empty."""
        try:
            return float(chimeric_count or 0) * 100 / float(all_lines or 0)
        except ZeroDivisionError:
            return None

    # -------------------------------------------------------------------------
    # For 454.
    # not tested
    # -------------------------------------------------------------------------

    def chimera_denovo(self):
        """Run ``-uchime_denovo`` for every ``<key>.abund.fa`` input file.

        :returns: a ``(status, message, data)`` tuple with status
            'NOREGION', 'SUCCESS' or 'ERROR'; for the latter two ``data`` is
            the list of third output fields collected from the commands.
        :raises OSError: when a command cannot be executed and the run is
            not local.
        """
        chimera_region_found = False
        output = {}
        cluster_id_list = []

        for idx_key in self.idx_keys:
            input_file_name = os.path.join(self.indir, idx_key + '.abund.fa')
            if not os.path.isfile(input_file_name):
                continue
            output_file_name = os.path.join(self.outdir,
                                            idx_key + '.chimera.denovo')
            log_file = os.path.join(self.outdir, idx_key + ".denovo.log")

            dna_region = self.runobj.samples[idx_key].dna_region
            logger.debug("dna_region = %s" % dna_region)
            if self.runobj.vamps_user_upload:
                # VAMPS users can chimera check regardless of region chosen
                chimera_region_found = True
            elif dna_region in C.regions_to_chimera_check:
                chimera_region_found = True
            else:
                logger.debug('region not checked: ' + dna_region)
                continue

            self.utils.print_both(
                "input_file_name = %s \noutput_file_name = %s" %
                (input_file_name, output_file_name))

            uchime_cmd = ''
            if self.use_cluster:
                uchime_cmd += C.clusterize_cmd
                uchime_cmd += " "
                uchime_cmd += " -log "
                uchime_cmd += log_file
                uchime_cmd += " "
            uchime_cmd += self.usearch_cmd
            uchime_cmd += " -uchime_denovo "
            uchime_cmd += input_file_name
            uchime_cmd += " -uchimeout "
            uchime_cmd += output_file_name

            logger.debug("uchime_denovo_cmd = %s" % (uchime_cmd))

            try:
                logger.info("chimera denovo command: " + str(uchime_cmd))
                self.utils.print_both("chimera denovo command: " + str(uchime_cmd))
                output[idx_key] = subprocess.check_output(uchime_cmd, shell=True)
                self.utils.print_both("chimera denovo result: " + str(output[idx_key]))
                items = output[idx_key].split()
                if len(items) > 2:
                    cluster_id_list.append(items[2])
            except OSError:
                e = sys.exc_info()[1]
                self.utils.print_both(
                    "Error: Problems with this command: %s" % (uchime_cmd))
                # print(..., file=...) replaces the Python 2 "print >>"
                # statement, which raises TypeError on Python 3.
                print("Error: Execution of %s failed: %s" % (uchime_cmd, e),
                      file=sys.stderr)
                if not self.utils.is_local():
                    self.utils.print_both(
                        "Error: Execution of %s failed: %s" % (uchime_cmd, e))
                    raise

        # ???
        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')

        self.utils.print_both('Finished Chimera Denovo')
        if cluster_id_list:
            return ('SUCCESS',
                    'uchime ref seems to have been submitted successfully',
                    cluster_id_list)
        return ('ERROR', 'uchime ref returned no cluster IDs',
                cluster_id_list)

    def chimera_reference(self):
        """Run ``-uchime_ref`` for every ``<key>.abund.fa`` input file.

        :returns: a ``(status, message, data)`` tuple with status
            'NOREGION', 'ERROR' (``data`` is the offending key) or 'SUCCESS'
            (``data`` is the list of third output fields).
        :raises OSError: when a command cannot be executed.
        """
        chimera_region_found = False
        output = {}
        cluster_id_list = []
        for idx_key in self.run_keys:
            dna_region = self.runobj.samples[idx_key].dna_region
            if self.runobj.vamps_user_upload:
                # VAMPS users can chimera check regardless of region chosen
                chimera_region_found = True
            elif dna_region in C.regions_to_chimera_check:
                chimera_region_found = True
            else:
                logger.debug('region not checked: ' + dna_region)
                continue

            input_file_name = os.path.join(self.indir, idx_key + '.abund.fa')
            output_file_name = os.path.join(self.outdir,
                                            idx_key + ".chimera.ref")
            log_file = os.path.join(self.outdir, idx_key + ".ref.log")
            logger.debug("OUT FILE NAME: " + output_file_name)

            if not os.path.isfile(input_file_name):
                continue
            logger.debug("OUT FILE NAME: " + output_file_name)

            # which ref db to use?
            if dna_region.upper() == 'ITS':
                logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
                ref_db = self.its_refdb
            else:
                logger.debug("using standard refdb: " + self.refdb)
                ref_db = self.refdb

            uchime_cmd = ''
            if self.use_cluster:
                uchime_cmd = C.clusterize_cmd
                uchime_cmd += " "
                uchime_cmd += " -log "
                uchime_cmd += log_file
                uchime_cmd += " "
            uchime_cmd += self.usearch_cmd
            uchime_cmd += " -uchime_ref "
            uchime_cmd += input_file_name
            uchime_cmd += " -uchimeout "
            uchime_cmd += output_file_name
            uchime_cmd += " -db "
            uchime_cmd += ref_db
            uchime_cmd += " -strand "
            uchime_cmd += "plus"

            logger.debug("uchime_ref_cmd = %s" % (uchime_cmd))

            try:
                # The former format string had no placeholder, so this log
                # call raised TypeError on every run.
                # NOTE(review): get_vsearch_version is defined elsewhere; it
                # is called when callable and logged as-is otherwise.
                vsearch_version = self.utils.get_vsearch_version
                if callable(vsearch_version):
                    vsearch_version = vsearch_version()
                logger.info("vsearch version: %s" % (vsearch_version,))
                logger.info("chimera reference command: " + str(uchime_cmd))
                output[idx_key] = subprocess.check_output(uchime_cmd, shell=True)
                # Guarded like chimera_denovo: short output has no third field.
                items = output[idx_key].split()
                if len(items) > 2:
                    cluster_id_list.append(items[2])
                if len(output[idx_key]) < 50 and len(output[idx_key]) > 40:
                    logger.debug(
                        idx_key +
                        " uchime ref seems to have been submitted successfully"
                    )
                elif self.use_cluster:
                    print("Error: uchime ref may be broke", file=sys.stderr)
                    self.utils.print_both(
                        "Error: uchime ref may be broke")
            except OSError:
                e = sys.exc_info()[1]
                # Two values need two placeholders (was one "%s").
                message = "Error: Execution of chimera_reference failed: %s: %s" % (
                    uchime_cmd, e)
                print(message, file=sys.stderr)
                self.utils.print_both(message)
                raise

        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')

        for idx_key in output:
            if (len(output[idx_key]) > 50
                    or len(output[idx_key]) < 40) and self.use_cluster:
                return ('ERROR', 'uchime ref may have broken or empty',
                        idx_key)

        self.utils.print_both('Finished Chimera Reference')
        return ('SUCCESS',
                'uchime ref seems to have been submitted successfully',
                cluster_id_list)

    def write_chimeras_to_deleted_file(self):
        """Append the chimeric read ids of every run key to its
        ``<key>.deleted.txt`` file in ``self.indir``.

        Ids are read from ``<key>.chimera.denovo`` and ``<key>.chimera.ref``
        in ``self.outdir``: the second column up to the first ";" of every
        line whose last column is "Y".  Each id is written once per key.
        """
        for idx_key in self.run_keys:
            # open deleted file and append chimera to it
            # open and read both chimeras files: chimeras.db and chimeras.txt

            # hash to remove dupes
            chimera_deleted = {}
            denovo_file = os.path.join(self.outdir,
                                       idx_key + '.chimera.denovo')
            ref_file = os.path.join(self.outdir, idx_key + ".chimera.ref")
            # deleted file is in trimming dir for vampsuser
            deleted_file = os.path.join(self.indir, idx_key + ".deleted.txt")
            for report_file in (denovo_file, ref_file):
                if not os.path.isfile(report_file):
                    continue
                with open(report_file, "r") as fh:
                    # make a list of chimera deleted read_ids
                    for line in fh:
                        fields = line.strip().split()
                        if len(fields) < 2:
                            # blank or truncated line: nothing to parse
                            continue
                        if fields[-1] == 'Y':
                            read_id = fields[1].split(';')[0]
                            chimera_deleted[read_id] = 'chimera'

            # open to append as trimming deletions are already there
            with open(deleted_file, "a") as fh_del:
                for read_id in chimera_deleted:
                    fh_del.write(read_id + "\tChimera\n")