def blast_against_each_genome(dir_path, processors, filter, peptides, blast, penalty, reward):
    """BLAST all peptides against each genome"""
    curr_dir = os.getcwd()
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            subprocess.check_call("formatdb -i %s -p F > /dev/null 2>&1" % f, shell=True)
        if ".fasta.new" in f:
            cmd = ["blastall", "-p", blast, "-i", peptides, "-d", f,
                   "-a", str(processors), "-e", "0.1", "-m", "8",
                   "-F", str(filter), "-q", str(penalty), "-r", str(reward),
                   "-o", "%s_blast.out" % f]
            subprocess.check_call(cmd)
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
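# Every workflow function in this file fans its per-file work out through
# p_func.pmap(worker, items, num_workers=n). The real p_func module is not shown
# here; as a hedged sketch only, a thread-based map with that call signature
# (threads, because the workers are closures that share locks and lists with the
# caller) could look roughly like the hypothetical helper below.
def pmap_sketch(worker, items, num_workers=1):
    from multiprocessing.pool import ThreadPool
    pool = ThreadPool(processes=num_workers)
    try:
        return pool.map(worker, items)
    finally:
        pool.close()
        pool.join()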
def blast_against_each_genome(dir_path, processors, filter, peptides, blast, penalty, reward):
    """BLAST all peptides against each genome"""
    curr_dir = os.getcwd()
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            try:
                subprocess.check_call("formatdb -i %s -p F > /dev/null 2>&1" % f, shell=True)
            except:
                print "problem found in formatting genome %s" % f
        if ".fasta.new" in f:
            try:
                devnull = open('/dev/null', 'w')
                cmd = ["blastall", "-p", blast, "-i", peptides, "-d", f,
                       "-a", str(processors), "-e", "0.1", "-m", "8",
                       "-F", str(filter), "-q", str(penalty), "-r", str(reward),
                       "-C", "F", "-o", "%s_blast.out" % f]
                subprocess.call(cmd, stdout=devnull, stderr=devnull)
            except:
                print "genomes %s cannot be used" % f
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
def blast_against_each_genome_tblastn(dir_path, processors, peptides):
    """BLAST all peptides against each genome"""
    curr_dir = os.getcwd()
    files = os.listdir(curr_dir)
    devnull = open("/dev/null", "w")
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            try:
                subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % f, shell=True)
            except:
                print "problem found in formatting genome %s" % f
        if ".fasta.new" in f:
            try:
                devnull = open('/dev/null', 'w')
                cmd = ["tblastn", "-query", peptides, "-db", f,
                       "-num_threads", str(processors), "-evalue", "0.1",
                       "-outfmt", "6", "-out", "%s_blast.out" % f]
                subprocess.call(cmd, stdout=devnull, stderr=devnull)
            except:
                print "genomes %s cannot be used" % f
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
def make_table(processors, test, clusters):
    """make the BSR matrix table"""
    curr_dir = os.getcwd()
    names = []
    outdata = []
    files = glob.glob(os.path.join(curr_dir, "*.filtered.unique"))
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(files)]
    lock = threading.Lock()
    def _perform_workflow(data):
        lock.acquire()
        tn, f = data
        """get the name of each of the files to be iterated"""
        name = []
        out = get_seq_name(f)
        name.append(out)
        reduced = []
        """remove the junk at the end of the file"""
        for x in name:
            reduced.append(x.replace('.fasta.new_blast.out.filtered.filtered.unique', ''))
        names.append(reduced)
        my_dict = {}
        file = open(f, "rU")
        #tmpfile=open("tmp.txt", "w")
        """make a dictionary of all clusters and values"""
        try:
            for line in file:
                fields = line.split()
                my_dict.update({fields[0]: fields[1]})
        except:
            raise TypeError("abnormal number of fields")
        """add in values, including any potentially missing ones"""
        for x in clusters:
            if x not in my_dict.keys():
                my_dict.update({x: 0})
        """need to write a blank space"""
        for x in reduced:
            open("%s.tmp.matrix" % x, 'a').write('%s\n' % x)
        """sort keys to get the same order between samples"""
        od = collections.OrderedDict(sorted(my_dict.items()))
        newout = open("%s.tmp.matrix" % "".join(reduced), "a")
        for k, v in od.iteritems():
            print >> newout, v
            if "T" in test:
                outdata.append(v)
        lock.release()
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
    names_out = open("names.txt", "w")
    for x in names:
        print >> names_out, "".join(x)
    """this makes sure that the ref.list file is in the same order as the tmp matrix"""
    nr_sorted = sorted(clusters)
    open("ref.list", "a").write("\n")
    for x in nr_sorted:
        open("ref.list", "a").write("%s\n" % x)
    if "T" in test:
        myout = [x for i, x in enumerate(outdata) if x not in outdata[i + 1:]]
        return sorted(outdata)
    else:
        pass
    names_out.close()
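# A hedged, stand-alone restatement of the per-sample column logic above
# (hypothetical helper, not part of the pipeline): each *.filtered.unique file is
# assumed to hold two whitespace-separated columns (cluster ID, value); clusters
# missing from a sample are padded with 0 and the keys are sorted so every
# sample's column lines up with the sorted ref.list order.
def build_sample_column(path, clusters):
    values = {}
    for line in open(path, "rU"):
        fields = line.split()
        values[fields[0]] = fields[1]
    for cluster in clusters:
        values.setdefault(cluster, 0)
    return [values[key] for key in sorted(values)]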
def predict_genes(fastadir, processors):
    """simple gene prediction using Prodigal in order
    to find coding regions from a genome sequence"""
    os.chdir("%s" % fastadir)
    files = os.listdir(fastadir)
    files_and_temp_names = [(str(idx), os.path.join(fastadir, f)) for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        subprocess.check_call("prodigal -i %s -d %s_genes.seqs -a %s_genes.pep > /dev/null 2>&1" % (f, f, f), shell=True)
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
def new_loop(to_iterate, processors, clusters, debug):
    names = []
    table_list = []
    def _perform_workflow(data):
        tn, f = data
        name, values = make_table_dev(f, "F", clusters)
        names.append(name)
        table_list.append(values)
        if debug == "T":
            logging.logPrint("sample %s processed" % f)
        else:
            pass
    set(p_func.pmap(_perform_workflow, to_iterate, num_workers=processors))
    return names, table_list
def blat_against_each_genome(dir_path, database, processors):
    """BLAT all genes against each genome"""
    curr_dir = os.getcwd()
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            try:
                subprocess.check_call("blat -out=blast8 -minIdentity=75 %s %s %s_blast.out > /dev/null 2>&1" % (f, database, f), shell=True)
            except:
                print "genomes %s cannot be used" % f
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
def blat_against_each_genome(dir_path, database, processors):
    """BLAT all genes against each genome"""
    curr_dir = os.getcwd()
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            try:
                subprocess.check_call("blat -out=blast8 -minIdentity=75 %s %s %s_blast.out > /dev/null 2>&1" % (f, database, f), shell=True)
            except:
                print("genomes %s cannot be used" % f)
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
def main(directory, reference, fastas, trunc, new, processors):
    curr_dir = os.getcwd()
    ordered = get_gene_order(reference)
    file_dir = glob.glob(os.path.join(fastas, "*fasta"))
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(file_dir)]
    def _perform_workflow(data):
        tn, f = data
        run_blast(directory, f)
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
    print "blast finished!"
    genome_names = []
    for infile in glob.glob(os.path.join(fastas, '*.fasta')):
        name = get_seq_name(infile)
        genome_names.append(name)
    process_results(genome_names, ordered, trunc, new, reference)
    subprocess.check_call("rm *.out", shell=True)
def blast_against_each_genome_blastn(dir_path, processors, filter, peptides, penalty, reward):
    """BLAST all peptides against each genome"""
    if "F" in filter:
        my_seg = "yes"
    else:
        my_seg = "no"
    curr_dir = os.getcwd()
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            try:
                subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % f, shell=True)
            except:
                print "problem found in formatting genome %s" % f
        if ".fasta.new" in f:
            devnull = open('/dev/null', 'w')
            try:
                cmd = ["blastn", "-query", peptides, "-db", f,
                       "-dust", str(my_seg), "-num_threads", str(processors),
                       "-evalue", "0.1", "-outfmt", "6",
                       "-penalty", str(penalty), "-reward", str(reward),
                       "-out", "%s_blast.out" % f]
                subprocess.call(cmd, stdout=devnull, stderr=devnull)
            except:
                print "The genome file %s was not processed" % f
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
def blast_against_each_genome_tblastn(dir_path, processors, peptides, filter):
    """BLAST all peptides against each genome"""
    curr_dir = os.getcwd()
    files = os.listdir(curr_dir)
    devnull = open("/dev/null", "w")
    if "T" in filter:
        my_seg = "yes"
    else:
        my_seg = "no"
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            try:
                subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % f, shell=True)
            except:
                print("problem found in formatting genome %s" % f)
        if ".fasta.new" in f:
            try:
                devnull = open('/dev/null', 'w')
                cmd = ["tblastn", "-query", peptides, "-db", f,
                       "-seg", my_seg, "-comp_based_stats", "F",
                       "-num_threads", "1", "-evalue", "0.1",
                       "-outfmt", "6", "-out", "%s_blast.out" % f]
                subprocess.call(cmd, stdout=devnull, stderr=devnull)
            except:
                print("genomes %s cannot be used" % f)
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
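# Reference for the tabular BLAST output written above (-m 8 / -outfmt 6 / blast8):
# the twelve columns are query id, subject id, percent identity, alignment length,
# mismatches, gap opens, query start, query end, subject start, subject end,
# e-value, and bit score. The downstream parsers rely on fields[2] (percent
# identity) and fields[11] (bit score). A hedged reader sketch follows
# (hypothetical helper, not part of the pipeline):
def read_blast_hits(path):
    hits = []
    for line in open(path, "U"):
        fields = line.strip().split()
        if len(fields) < 12:
            continue
        hits.append((fields[0], fields[1], float(fields[2]), float(fields[11])))
    return hits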
def make_table(processors):
    """make the BSR matrix table"""
    clusters = []
    curr_dir = os.getcwd()
    """I only use this loop to grab names...combine with next loop?
    I need the nr values before the next loop"""
    for infile in glob.glob(os.path.join(curr_dir, "*.filtered.unique")):
        file = open(infile, "rU")
        for line in file:
            fields = line.split()
            if fields[0] not in clusters:
                clusters.append(fields[0])
    """de-replicate the clusters"""
    nr = [x for i, x in enumerate(clusters) if x not in clusters[i + 1:]]
    names = []
    outdata = []
    files = glob.glob(os.path.join(curr_dir, "*.filtered.unique"))
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(files)]
    lock = threading.Lock()
    def _perform_workflow(data):
        lock.acquire()
        tn, f = data
        """get the name of each of the files to be iterated"""
        name = []
        out = get_seq_name(f)
        name.append(out)
        reduced = []
        """remove the junk at the end of the file"""
        for x in name:
            reduced.append(x.replace('.fasta.new_blast.out.filtered.filtered.unique', ''))
        names.append(reduced)
        dict = {}
        file = open(f, "rU")
        tmpfile = open("tmp.txt", "w")
        """make a dictionary of all clusters and values"""
        try:
            for line in file:
                fields = line.split()
                dict.update({fields[0]: fields[1]})
        except:
            raise TypeError("abnormal number of fields")
        cluster_names = {}
        """add in values, including any potentially missing ones"""
        for k, v in dict.iteritems():
            if k in nr:
                cluster_names.update({k: v})
        for x in nr:
            if x not in dict.keys():
                cluster_names.update({x: 0})
        """need to write a blank space"""
        for x in reduced:
            open("%s.tmp.matrix" % x, 'a').write('%s\n' % x)
        """sort keys to get the same order between samples"""
        for key in sorted(cluster_names.iterkeys()):
            for x in reduced:
                open("%s.tmp.matrix" % x, 'a').write("%s\n" % cluster_names[key])
            outdata.append(cluster_names[key])
        lock.release()
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
    names_out = open("names.txt", "w")
    for x in names:
        print >> names_out, "".join(x)
    nr_sorted = sorted(nr)
    open("ref.list", "a").write("\n")
    for x in nr_sorted:
        open("ref.list", "a").write("%s\n" % x)
    return outdata, nr_sorted
def find_dups_dev(ref_scores, length, max_plog, min_hlog, clusters, processors):
    curr_dir = os.getcwd()
    my_dict_o = {}
    dup_dict = {}
    paralogs = []
    duplicate_file = open("duplicate_ids.txt", "w")
    paralog_file = open("paralog_ids.txt", "w")
    ref_file = open("dup_refs.txt", "w")
    genome_specific_list_of_lists = []
    target_list = []
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if "_blast.out" in f:
            genome_specific_dict = {}
            name = get_seq_name(f)
            reduced_name = name.replace(".fasta.new_blast.out", "")
            genome_specific_dict.update({"ID": reduced_name})
            outfile = open("%s.counts.txt" % reduced_name, "w")
            try:
                for line in open(f, "U"):
                    fields = line.split()
                    if fields[0] not in ref_scores:
                        pass
                    elif float(fields[2]) >= int(min_hlog) and (float(fields[11]) / float(ref_scores.get(fields[0]))) >= float(length):
                        try:
                            my_dict_o[fields[0]].append(fields[11])
                            genome_specific_dict[fields[0]].append(fields[11])
                        except KeyError:
                            my_dict_o[fields[0]] = [fields[11]]
                            genome_specific_dict[fields[0]] = [fields[11]]
                    else:
                        continue
            except:
                raise TypeError("problem parsing %s" % f)
            new_dict = {}
            for k, v in genome_specific_dict.iteritems():
                for cluster in clusters:
                    if k == "ID":
                        pass
                    elif k == cluster:
                        try:
                            new_dict.update({k: len(v)})
                        except:
                            new_dict.update({k: "0"})
            for cluster in clusters:
                if cluster not in genome_specific_dict:
                    new_dict.update({cluster: "0"})
            od = collections.OrderedDict(sorted(new_dict.items()))
            ids = OrderedDict({"ID": reduced_name})
            both = OrderedDict(list(ids.items()) + list(new_dict.items()))
            for k, v in both.iteritems():
                outfile.write(str(v) + "\n")
                if k in target_list:
                    pass
                else:
                    target_list.append(k)
            outfile.close()
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
    ref_file.write("\n".join(target_list) + "\n")
    ref_file.close()
    """known issue - if gene id is Capital and before "I", there can be a shuffling of IDs
    I need to sort the dictionary and keep the first item constant as ID"""
    try:
        generate_dup_matrix()
        os.system("paste dup_refs.txt dup_values > dup_matrix.txt")
    except:
        print("problem generating duplicate matrix, but we'll continue")
    for k, v in my_dict_o.iteritems():
        if int(len(v)) >= 2:
            dup_dict.update({k: v})
    for k, v in dup_dict.iteritems():
        max_value = max(v)
        for x in v:
            if float(x) / float(max_value) <= max_plog:
                paralogs.append(k)
            else:
                continue
    for k, v in dup_dict.iteritems():
        duplicate_file.write(k + "\n")
    nr = [x for i, x in enumerate(paralogs) if x not in paralogs[i + 1:]]
    paralog_file.write("\n".join(nr) + "\n")
    duplicate_file.close()
    paralog_file.close()
    return nr, dup_dict
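# A hedged restatement of the duplicate/paralog test above, detached from file I/O
# (hypothetical helper name, not part of the pipeline): a hit qualifies when its
# percent identity clears min_hlog and its bit score divided by the query's
# self-score (ref_scores) clears length; a query with two or more qualifying hits
# is recorded as a duplicate, and it is also flagged as a paralog if any copy
# scores at or below max_plog of the best copy.
def flag_duplicates_and_paralogs(hits, ref_scores, min_hlog, length, max_plog):
    kept = {}
    for query, percent_id, bit_score in hits:
        if query in ref_scores and percent_id >= float(min_hlog):
            if bit_score / float(ref_scores[query]) >= float(length):
                kept.setdefault(query, []).append(bit_score)
    duplicates = [q for q, scores in kept.items() if len(scores) >= 2]
    paralogs = [q for q in duplicates
                if min(kept[q]) / max(kept[q]) <= float(max_plog)]
    return duplicates, paralogs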
def main(directory, genes, blast, processors, remove_gap, keep):
    dependencies = ['blastall', 'formatdb', 'muscle']
    for dependency in dependencies:
        ra = subprocess.call(['which', '%s' % dependency])
        if ra == 0:
            pass
        else:
            print "%s is not in your path, but needs to be!" % dependency
            sys.exit()
    start_dir = os.getcwd()
    ap = os.path.abspath("%s" % start_dir)
    dir_path = os.path.abspath("%s" % directory)
    try:
        os.makedirs('%s/to_extract_xxx' % ap)
        os.makedirs('%s/work_xxx' % ap)
    except:
        os.system("rm -rf %s/to_extract_xxx" % ap)
        os.system("rm -rf %s/work_xxx" % ap)
        os.makedirs('%s/to_extract_xxx' % ap)
        os.makedirs('%s/work_xxx' % ap)
    gene_path = os.path.abspath("%s" % genes)
    os.system("cp %s %s/to_extract_xxx/genes.fasta" % (gene_path, ap))
    os.chdir("%s/to_extract_xxx" % ap)
    split_multifasta("genes.fasta")
    os.system("rm genes.fasta")
    os.chdir("%s/work_xxx" % ap)
    """create combined file"""
    num_genomes, names = combined_seqs(dir_path)
    os.system("formatdb -i combined.seqs -p F")
    table_files = glob.glob(os.path.join("%s/to_extract_xxx" % ap, "*.fasta"))
    files_and_temp_names = [(str(idx), os.path.join("%s/to_extract_xxx" % ap, f)) for idx, f in enumerate(table_files)]
    def _perform_workflow(data):
        tn, f = data
        name = run_blast(f, blast)
        parse_blast_xml_report("%s.blast.out" % name)
        parsed_blast_to_seqs("%s.blast.unique" % name)
        check_and_align_seqs("%s.extracted.seqs" % name, num_genomes)
        os.system("rm %s.blast.out %s.blast.unique %s.extracted.seqs" % (name, name, name))
    set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
    os.system("rm *.blast.out *.blast.unique *.extracted.seqs")
    pull_seqs(names)
    concatenate()
    os.system("cat *.concat > all.concat")
    os.system('sed "s/ //g" all.concat > tmp.concat')
    os.system("awk 'FNR>1' tmp.concat > all.concat")
    if remove_gap == "T":
        remove_gaps("all.concat")
        os.system("cp final_alignment.fasta %s" % ap)
    elif remove_gap == "F":
        os.system("cp all.concat %s/final_alignment.fasta" % ap)
    else:
        print "You have chosen an incorrect option for gap removal, choose from T or F"
        sys.exit()
    """finish up"""
    os.chdir("%s" % ap)
    if keep == "T":
        pass
    elif keep == "F":
        os.system("rm -rf %s/to_extract_xxx %s/work_xxx" % (ap, ap))
    else:
        print "Illegal keep value selected, not doing anything"
        pass
except OSError, e:
    if e.errno != errno.EEXIST:
        raise
if "NULL" != reduce:
    reduce_path = os.path.abspath("%s" % reduce)
effective_jobs = int(int(memory) / 8000)
if effective_jobs <= 1:
    effective_jobs = 1
effective_processors = int(int(processors) / effective_jobs)
os.chdir("%s/work_directory" % start_dir)
def _perform_workflow(data):
    #tn, f = data
    f = data
    print data
    run_single_loop(f[1], f[2], f[0], f[3], f[7], f[5], start_path, f[6], f[8],
                    UGAP_PATH, TRIM_PATH, PICARD_PATH, PILON_PATH, f[10], f[11])
results = set(p_func.pmap(_perform_workflow, datasets, num_workers=effective_jobs))

if __name__ == "__main__":
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-c", "--config", dest="config_file",
                      help="config file that populates the UGAP single assembly",
                      action="callback", callback=test_file, type="string")
    parser.add_option("-m", "--memory", dest="memory",
                      help="amount of memory on the server, defaults to 48G, enter 48000",
                      action="store", type="string", default="48000")
    options, args = parser.parse_args()
    mandatories = ["config_file"]
    for m in mandatories:
        if not options.__dict__[m]:
def run_loop(fileSets, error_corrector, processors, keep, coverage, proportion, start_path, reduce):
    #Is this still relevant?
    files_and_temp_names = [(str(idx), list(f)) for idx, f in fileSets.iteritems()]
    lock = threading.Lock()
    def _perform_workflow(data):
        idx, f = data
        if "NULL" not in reduce:
            try:
                subprocess.check_call("bwa index %s > /dev/null 2>&1" % reduce, shell=True)
            except:
                print "problems with indexing input file"
                sys.exit()
            try:
                run_bwa("%s" % f[0], "%s" % f[1], processors, idx, "%s" % reduce)
                os.system("samtools view -bS %s.sam > %s.bam 2> /dev/null" % (idx, idx))
                os.system("bam2fastq -o %s#.fastq --no-aligned %s.bam > /dev/null 2>&1" % (idx, idx))
                os.system("gzip %s_1.fastq %s_2.fastq" % (idx, idx))
                os.system("cp %s_1.fastq.gz %s" % (idx, f[0]))
                os.system("cp %s_2.fastq.gz %s" % (idx, f[1]))
            except:
                print "problems depleting reads"
                sys.exit()
        else:
            pass
        if int(get_sequence_length(f[0], idx)) <= 200:
            args = ['java', '-jar', '%s' % TRIM_PATH, 'PE', '-threads', '%s' % processors,
                    '%s' % f[0], '%s' % f[1],
                    '%s.F.paired.fastq.gz' % idx, 'F.unpaired.fastq.gz',
                    '%s.R.paired.fastq.gz' % idx, 'R.unpaired.fastq.gz',
                    'ILLUMINACLIP:%s/bin/illumina_adapters_all.fasta:2:30:10' % UGAP_PATH,
                    'MINLEN:%s' % (int(get_sequence_length(f[0], idx) / 2))]
            try:
                vcf_fh = open('%s.trimmomatic.out' % idx, 'w')
            except:
                log_isg.logPrint('could not open trimmomatic file')
            try:
                log_fh = open('%s.trimmomatic.log' % idx, 'w')
            except:
                log_isg.logPrint('could not open log file')
            try:
                trim = Popen(args, stderr=vcf_fh, stdout=log_fh)
                trim.wait()
            except:
                log_isg.logPrint("problem encountered with trimmomatic")
            """assemble sequences with spades"""
            if error_corrector == "hammer":
                subprocess.check_call("spades.py -o %s.spades -t %s -k 21,33,55,77 --careful -1 %s.F.paired.fastq.gz -2 %s.R.paired.fastq.gz > /dev/null 2>&1" % (idx, processors, idx, idx), shell=True)
            elif error_corrector == "musket":
                ab = subprocess.call(['which', 'musket'])
                if ab == 0:
                    pass
                else:
                    print "musket isn't in your path, but needs to be!"
                    sys.exit()
                subprocess.check_call("musket -k 17 8000000 -p %s -omulti %s -inorder %s.F.paired.fastq.gz %s.R.paired.fastq.gz > /dev/null 2>&1" % (processors, idx, idx, idx), shell=True)
                subprocess.check_call("mv %s.0 %s.0.musket.fastq.gz" % (idx, idx), shell=True)
                subprocess.check_call("mv %s.1 %s.1.musket.fastq.gz" % (idx, idx), shell=True)
                try:
                    subprocess.check_call("spades.py -o %s.spades -t %s -k 21,33,55,77 --only-assembler --careful -1 %s.0.musket.fastq.gz -2 %s.1.musket.fastq.gz > /dev/null 2>&1" % (idx, processors, idx, idx), shell=True)
                except:
                    pass
            else:
                try:
                    subprocess.check_call("spades.py -o %s.spades -t %s -k 21,33,55,77 --only-assembler --careful -1 %s.F.paired.fastq.gz -2 %s.R.paired.fastq.gz > /dev/null 2>&1" % (idx, processors, idx, idx), shell=True)
                except:
                    pass
        elif int(get_sequence_length(f[0], idx)) > 200:
            args = ['java', '-jar', '%s' % TRIM_PATH, 'PE',
                    '%s' % f[0], '%s' % f[1],
                    '%s.F.paired.fastq.gz' % idx, 'F.unpaired.fastq.gz',
                    '%s.R.paired.fastq.gz' % idx, 'R.unpaired.fastq.gz',
                    'ILLUMINACLIP:%s/bin/illumina_adapters_all.fasta:2:30:10' % UGAP_PATH,
                    'MINLEN:150']
            try:
                vcf_fh = open('%s.trimmomatic.out' % idx, 'w')
            except:
                log_isg.logPrint('could not open trimmomatic file')
            try:
                log_fh = open('%s.trimmomatic.log' % idx, 'w')
            except:
                log_isg.logPrint('could not open log file')
            try:
                trim = Popen(args, stderr=vcf_fh, stdout=log_fh)
                trim.wait()
            except:
                log_isg.logPrint("problem encountered with trimmomatic")
            """assemble sequences with spades"""
            if error_corrector == "hammer":
                try:
                    subprocess.check_call("spades.py -o %s.spades -t %s -k 21,33,55,77,127 --careful -1 %s.F.paired.fastq.gz -2 %s.R.paired.fastq.gz > /dev/null 2>&1" % (idx, processors, idx, idx), shell=True)
                except:
                    pass
            elif error_corrector == "musket":
                ab = subprocess.call(['which', 'musket'])
                if ab == 0:
                    pass
                else:
                    print "musket isn't in your path, but needs to be!"
                    sys.exit()
                subprocess.check_call("musket -k 17 8000000 -p %s -omulti %s -inorder %s.F.paired.fastq.gz %s.R.paired.fastq.gz > /dev/null 2>&1" % (processors, idx, idx, idx), shell=True)
                subprocess.check_call("mv %s.0 %s.0.musket.fastq.gz" % (idx, idx), shell=True)
                subprocess.check_call("mv %s.1 %s.1.musket.fastq.gz" % (idx, idx), shell=True)
                try:
                    subprocess.check_call("spades.py -o %s.spades -t %s -k 21,33,55,77,127 --only-assembler --careful -1 %s.0.musket.fastq.gz -2 %s.1.musket.fastq.gz > /dev/null 2>&1" % (idx, processors, idx, idx), shell=True)
                except:
                    pass
            else:
                try:
                    subprocess.check_call("spades.py -o %s.spades -t %s -k 21,33,55,77,127 --only-assembler --careful -1 %s.F.paired.fastq.gz -2 %s.R.paired.fastq.gz > /dev/null 2>&1" % (idx, processors, idx, idx), shell=True)
                except:
                    pass
        else:
            pass
        try:
            os.system("gzip -dc %s.F.paired.fastq.gz > %s_1.fastq" % (idx, idx))
            os.system("gzip -dc %s.R.paired.fastq.gz > %s_2.fastq" % (idx, idx))
            os.system("cp %s.spades/contigs.fasta %s.spades.assembly.fasta" % (idx, idx))
            filter_seqs("%s.spades.assembly.fasta" % idx, keep, idx)
            """remove redundancies - will likely change in the near future"""
            os.system("%s/bin/psi-cd-hit.pl -i %s.%s.spades.assembly.fasta -o %s.%s.nr.spades.assembly.fasta -c 0.99999999 -G 1 -g 1 -prog blastn -exec local -l 500" % (UGAP_PATH, idx, keep, idx, keep))
            clean_fasta("%s.%s.nr.spades.assembly.fasta" % (idx, keep), "%s_pagit.fasta" % idx)
            rename_multifasta("%s_pagit.fasta" % idx, idx, "%s_renamed.fasta" % idx)
            subprocess.check_call("bwa index %s_renamed.fasta > /dev/null 2>&1" % idx, shell=True)
            os.system("samtools faidx %s_renamed.fasta" % idx)
            run_bwa("%s_1.fastq" % idx, "%s_2.fastq" % idx, processors, idx, "%s_renamed.fasta" % idx)
            make_bam("%s.sam" % idx, idx)
            os.system("java -jar %s/CreateSequenceDictionary.jar R=%s_renamed.fasta O=%s_renamed.dict > /dev/null 2>&1" % (PICARD_PATH, idx, idx))
            run_gatk("%s_renamed.fasta" % idx, processors, idx, "%s" % GATK_PATH)
            """run_bam_coverage stuff here"""
            os.system("java -jar %s/AddOrReplaceReadGroups.jar INPUT=%s_renamed.bam OUTPUT=%s_renamed_header.bam SORT_ORDER=coordinate RGID=%s RGLB=%s RGPL=illumina RGSM=%s RGPU=name CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT > /dev/null 2>&1" % (PICARD_PATH, idx, idx, idx, idx, idx))
            os.system("echo %s_renamed_header.bam > %s.bam.list" % (idx, idx))
            os.system("java -jar %s -R %s_renamed.fasta -T DepthOfCoverage -o %s_coverage -I %s.bam.list -rf BadCigar > /dev/null 2>&1" % (GATK_PATH, idx, idx, idx))
            process_coverage(idx)
        except:
            pass
        lock.acquire()
        try:
            to_fix = parse_vcf("%s.gatk.out" % idx, coverage, proportion)
            log_isg.logPrint("number of SNPs to fix in %s = %s" % (idx, len(to_fix)))
            if int(len(to_fix)) >= 1:
                try:
                    fasta_to_tab("%s_renamed.fasta" % idx, idx)
                    fix_assembly("%s.out.tab" % idx, to_fix, idx)
                    os.system("cp %s_corrected_assembly.fasta %s_renamed.fasta" % (idx, idx))
                except:
                    print "error correction failed for some reason"
            else:
                pass
        except:
            pass
        lock.release()
        try:
            os.system("java -jar %s --genome %s_renamed.fasta --frags %s_renamed.bam --output %s_pilon > /dev/null 2>&1" % (PILON_PATH, idx, idx, idx))
            rename_multifasta("%s_pilon.fasta" % idx, idx, "%s_final_assembly.fasta" % idx)
            os.system("prokka --prefix %s --locustag %s --compliant --mincontiglen %s --strain %s %s_final_assembly.fasta > /dev/null 2>&1" % (idx, idx, keep, idx, idx))
            filter_seqs("%s_final_assembly.fasta" % idx, keep, idx)
            os.system("sed -i 's/\\x0//g' %s.%s.spades.assembly.fasta" % (idx, keep))
            os.system("%s/cleanFasta.pl %s.%s.spades.assembly.fasta -o %s/UGAP_assembly_results/%s_final_assembly.fasta > /dev/null 2>&1" % (PICARD_PATH, idx, keep, start_path, idx))
            os.system("cp coverage_out.txt %s/UGAP_assembly_results" % start_path)
            try:
                os.system("cp %s/*.* %s/UGAP_assembly_results" % (idx, start_path))
            except:
                print "prokka not installed and annotation files were not copied over!"
                pass
        except:
            pass
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
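# A hedged restatement of the read-length branch above (hypothetical helper, not
# part of UGAP): reads of 200 bp or less get a Trimmomatic MINLEN of half the read
# length and a SPAdes k-mer ladder topping out at 77; longer reads get MINLEN:150
# and a ladder topping out at 127.
def choose_trim_and_kmers(read_length):
    if int(read_length) <= 200:
        return 'MINLEN:%s' % int(read_length / 2), '21,33,55,77'
    else:
        return 'MINLEN:150', '21,33,55,77,127'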
def tree_loop(fasta_dict, combined, tree, parallel_workers, run_r, num_refs):
    def _temp_name(t, f):
        return t + '_' + f
    def _perform_workflow(data):
        tn, f = data
        outfile = open("%s.fasta" % tn, "w")
        outfile.write(">%s\n%s" % (tn, f))
        outfile.close()
        logging.debugPrint(lambda: "Processing sequence: %s" % tn)
        blast_against_reference("%s.fasta" % tn, combined, _temp_name(tn, "blast_parsed.txt"))
        subprocess.check_call("sort -u -k 2,2 %s > %s" % (_temp_name(tn, "blast_parsed.txt"), _temp_name(tn, "blast_unique.parsed.txt")), shell=True)
        parsed_blast_to_seqs(_temp_name(tn, "blast_unique.parsed.txt"), _temp_name(tn, "seqs_in.fas"))
        check_and_align_seqs(_temp_name(tn, "seqs_in.fas"), num_refs, _temp_name(tn, "seqs_aligned.fas"))
        if os.path.isfile(_temp_name(tn, "seqs_aligned.fas")):
            """What if there are NO SNPs in a given region"""
            #try:
            subprocess.call(['mothur', '#filter.seqs(fasta=%s, soft=100, vertical=F)' % _temp_name(tn, "seqs_aligned.fas")], stdout=subprocess.PIPE)
            subprocess.check_call('sed "s/[^1]/0/g" %s | sed "s/0/2/g" | sed "s/1/0/g" | sed "s/2/1/g" > %s' % (_temp_name(tn, "seqs_aligned.filter"), _temp_name(tn, "mask.txt")), shell=True)
            split_read(_temp_name(tn, "mask.txt"), _temp_name(tn, "padded.txt"))
            sum_qual_reads(_temp_name(tn, "padded.txt"), _temp_name(tn, "polys.txt"))
            #except:
            #    """This function was never created"""
            #    write_poly_zeros(_temp_name(tn, "padded.txt"), _temp_name(tn, "polys.txt"))
            if "T" == run_r:
                name = get_seq_name("%s.fasta" % tn)
                subprocess.check_call("cat snps.r | R --slave --args %s %s.table %s.pdf 2> /dev/null" % (_temp_name(tn, "seqs_aligned.fas"), name, name), shell=True)
                os.system("mv %s.table ./R_output/%s.table.txt" % (name, name))
                os.system("mv %s.pdf ./R_output/%s.plots.pdf" % (name, name))
            else:
                pass
            subprocess.check_call("FastTree -nt -noboot %s > %s 2> /dev/null" % (_temp_name(tn, "seqs_aligned.fas"), _temp_name(tn, "tmp.tree")), shell=True)
            run_dendropy("%s" % (_temp_name(tn, "tmp.tree")), tree, "%s" % (_temp_name(tn, "tmp.RF")))
            run_dendropy_euclidian("%s" % (_temp_name(tn, "tmp.tree")), tree, "%s" % (_temp_name(tn, "tmp.EU")))
            get_contig_length("%s.fasta" % tn, _temp_name(tn, "length.txt"))
            thread_id = id(threading.current_thread())
            thread_distance_file = str(thread_id) + '_distance.txt'
            parse_rf_file(_temp_name(tn, "tmp.RF"), thread_distance_file)
            thread_euclidian_file = str(thread_id) + "_euc_dist.txt"
            parse_rf_file(_temp_name(tn, "tmp.EU"), thread_euclidian_file)
            thread_name_file = str(thread_id) + '_name.txt'
            write_strip_name("%s.fasta" % tn, thread_name_file)
            polys_name_file = str(thread_id) + '_polys.txt'
            parse_poly_file(_temp_name(tn, "polys.txt"), polys_name_file)
            length_name_file = str(thread_id) + '_length.txt'
            parse_poly_file(_temp_name(tn, "length.txt"), length_name_file)
            try:
                subprocess.check_call("rm mothur*", shell=True, stderr=open(os.devnull, 'w'))
            except:
                pass
            subprocess.check_call(["rm", _temp_name(tn, "blast_parsed.txt"),
                                   "%s.fasta" % tn,
                                   _temp_name(tn, "blast_unique.parsed.txt"),
                                   _temp_name(tn, "seqs_in.fas"),
                                   _temp_name(tn, "seqs_aligned.fas"),
                                   _temp_name(tn, "tmp.tree"),
                                   _temp_name(tn, "tmp.RF"),
                                   _temp_name(tn, "tmp.EU"),
                                   _temp_name(tn, "mask.txt"),
                                   _temp_name(tn, "padded.txt"),
                                   _temp_name(tn, "polys.txt"),
                                   _temp_name(tn, "seqs_aligned.filter"),
                                   _temp_name(tn, "length.txt"),
                                   _temp_name(tn, "seqs_aligned.filter.fasta")])
            return (thread_distance_file, thread_name_file, polys_name_file, length_name_file, thread_euclidian_file)
        else:
            subprocess.check_call(["rm", _temp_name(tn, "blast_parsed.txt"),
                                   "%s.fasta" % tn,
                                   _temp_name(tn, "blast_unique.parsed.txt"),
                                   _temp_name(tn, "seqs_in.fas")])
    files_and_temp_names = [(str(idx), f) for idx, f in fasta_dict.iteritems()]
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=parallel_workers))
    #I do this to make sure and remove any old files that are setting around
    subprocess.call("rm distance.txt name.txt polys.txt length.txt", shell=True, stderr=open(os.devnull, 'w'))
    for files in func.chunk(5, results):
        distances = []
        names = []
        polys = []
        lengths = []
        euc_dist = []
        for value in files:
            if value:
                distances.append(value[0])
                names.append(value[1])
                polys.append(value[2])
                lengths.append(value[3])
                euc_dist.append(value[4])
        if distances:
            subprocess.check_call("cat %s >> distance.txt" % " ".join(distances), shell=True)
            subprocess.check_call("cat %s >> name.txt" % " ".join(names), shell=True)
            subprocess.check_call("cat %s >> polys.txt" % " ".join(polys), shell=True)
            subprocess.check_call("cat %s >> length.txt" % " ".join(lengths), shell=True)
            subprocess.check_call("cat %s >> euc_dist.txt" % " ".join(euc_dist), shell=True)
            subprocess.check_call("rm %s" % " ".join(distances), shell=True)
            subprocess.check_call("rm %s" % " ".join(names), shell=True)
            subprocess.check_call("rm %s" % " ".join(polys), shell=True)
            subprocess.check_call("rm %s" % " ".join(lengths), shell=True)
    paste_files("name.txt", "distance.txt", "euc_dist.txt", "polys.txt", "length.txt", "all_distances.txt")
def run_loop(fileSets, dir_path, reference, processors, gatk, ref_coords, coverage, proportion, matrix, ap, doc, tmp_dir, picard, trim_path, wgfast_path, trim):
    files_and_temp_names = [(str(idx), list(f)) for idx, f in fileSets.iteritems()]
    lock = threading.Lock()
    def _perform_workflow(data):
        """idx is the sample name, f is the file dictionary"""
        idx, f = data
        if os.path.isfile("%s.tmp.xyx.matrix" % idx):
            pass
        else:
            if len(f) > 1:
                if "T" in trim:
                    """paired end sequences - Hardcoded the number of processors per job to 2"""
                    args = ['java', '-jar', '%s' % trim_path, 'PE', '-threads', '2',
                            '%s' % f[0], '%s' % f[1],
                            '%s.F.paired.fastq.gz' % idx, 'F.unpaired.fastq.gz',
                            '%s.R.paired.fastq.gz' % idx, 'R.unpaired.fastq.gz',
                            'ILLUMINACLIP:%s/bin/illumina_adapters_all.fasta:2:30:10' % wgfast_path,
                            'MINLEN:%s' % int(get_sequence_length(f[0]) / 2)]
                    try:
                        vcf_fh = open('%s.trimmomatic.out' % idx, 'w')
                    except:
                        log_isg.logPrint('could not open trimmomatic file')
                    try:
                        log_fh = open('%s.trimmomatic.log' % idx, 'w')
                    except:
                        log_isg.logPrint('could not open log file')
                    if os.path.isfile("%s.F.paired.fastq.gz" % idx):
                        pass
                    else:
                        try:
                            trim_cmd = Popen(args, stderr=vcf_fh, stdout=log_fh)
                            trim_cmd.wait()
                        except:
                            log_isg.logPrint('problem encountered trying to run trimmomatic')
                else:
                    os.link(f[0], "%s.F.paired.fastq.gz" % idx)
                    os.link(f[1], "%s.R.paired.fastq.gz" % idx)
                if os.path.isfile("%s_renamed_header.bam" % idx):
                    pass
                else:
                    run_bwa(reference, '%s.F.paired.fastq.gz' % idx, '%s.R.paired.fastq.gz' % idx, processors, idx)
            else:
                if "T" in trim:
                    """single end support"""
                    args = ['java', '-jar', '%s' % trim_path, 'SE', '-threads', '2',
                            '%s' % f[0], '%s.single.fastq.gz' % idx,
                            'ILLUMINACLIP:%s/bin/illumina_adapters_all.fasta:2:30:10' % wgfast_path,
                            'MINLEN:%s' % int(get_sequence_length(f[0]) / 2)]
                    try:
                        vcf_fh = open('%s.trimmomatic.out' % idx, 'w')
                    except:
                        log_isg.logPrint('could not open trimmomatic file')
                    try:
                        log_fh = open('%s.trimmomatic.log' % idx, 'w')
                    except:
                        log_isg.logPrint('could not open log file')
                    if os.path.isfile("%s.single.fastq.gz" % idx):
                        pass
                    else:
                        try:
                            trim_cmd = Popen(args, stderr=vcf_fh, stdout=log_fh)
                            trim_cmd.wait()
                        except:
                            log_isg.logPrint("problem encountered with trimmomatic")
                else:
                    os.link(f[0], "%s.single.fastq.gz" % idx)
                if os.path.isfile("%s_renamed_header.bam" % idx):
                    pass
                else:
                    run_bwa(reference, '%s.single.fastq.gz' % idx, "NULL", processors, idx)
            if os.path.isfile("%s_renamed_header.bam" % idx):
                pass
            else:
                process_sam("%s.sam" % idx, idx)
                """inserts read group information, required by new versions of GATK"""
                os.system("java -jar %s INPUT=%s.bam OUTPUT=%s_renamed_header.bam SORT_ORDER=coordinate RGID=%s RGLB=%s RGPL=illumina RGSM=%s RGPU=name CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT > /dev/null 2>&1" % (picard, idx, idx, idx, idx, idx))
                os.system("samtools index %s_renamed_header.bam > /dev/null 2>&1" % idx)
            run_gatk(reference, processors, idx, gatk, tmp_dir)
            if "T" == doc:
                lock.acquire()
                os.system("echo %s_renamed_header.bam > %s.bam.list" % (idx, idx))
                os.system("java -Djava.io.tmpdir=%s -jar %s -R %s/scratch/reference.fasta -T DepthOfCoverage -o %s_coverage -I %s.bam.list -rf BadCigar > /dev/null 2>&1" % (tmp_dir, gatk, ap, idx, idx))
                lock.release()
                process_coverage(idx)
            else:
                pass
            process_vcf("%s.vcf.out" % idx, ref_coords, coverage, proportion, idx)
            make_temp_matrix("%s.filtered.vcf" % idx, matrix, idx)
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
effective_jobs = int(int(memory) / 8000)
if effective_jobs <= 1:
    effective_jobs = 1
effective_processors = int(int(processors) / effective_jobs)
os.chdir("%s/ugap_work_directory" % start_dir)
keep_stuff = []
def _perform_workflow(data):
    f = data
    run_single_loop(f[1], f[2], f[0], f[3], f[7], f[4], start_path, f[6], f[8],
                    UGAP_PATH, TRIM_PATH, PICARD_PATH, PILON_PATH, f[10], f[11])
    keep_stuff.append(f[5])
results = set(p_func.pmap(_perform_workflow, datasets, num_workers=effective_jobs))
if "F" in keep_stuff:
    pass
else:
    os.system("rm -rf ugap_work_directory")

if __name__ == "__main__":
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-c", "--config", dest="config_file",
                      help="config file that populates the UGAP single assembly",
                      action="callback",
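# A hedged restatement of the job-sizing arithmetic above (hypothetical helper,
# not part of the controller): roughly 8000 MB of memory is budgeted per
# concurrent assembly, and the available processors are then split evenly across
# those jobs. For example, 48000 MB and 12 processors gives 6 jobs with 2
# processors each.
def size_jobs(memory_mb, processors):
    jobs = max(1, int(memory_mb) // 8000)
    processors_per_job = int(processors) // jobs
    return jobs, processors_per_job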
def find_dups_dev(ref_scores, length, max_plog, min_hlog, clusters, processors):
    curr_dir = os.getcwd()
    my_dict_o = {}
    dup_dict = {}
    paralogs = []
    duplicate_file = open("duplicate_ids.txt", "w")
    paralog_file = open("paralog_ids.txt", "w")
    ref_file = open("dup_refs.txt", "w")
    genome_specific_list_of_lists = []
    target_list = []
    ordered_target_list = []
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if "_blast.out" in f:
            genome_specific_dict = {}
            name = get_seq_name(f)
            reduced_name = name.replace(".fasta.new_blast.out", "")
            genome_specific_dict.update({"ID": reduced_name})
            outfile = open("%s.counts.txt" % reduced_name, "w")
            try:
                for line in open(f, "U"):
                    newline = line.strip()
                    fields = newline.split()
                    """Each blast query should be in the reference blast file"""
                    if fields[0] not in ref_scores:
                        print("potential problem found with BLAST File..")
                        sys.exit()
                    elif float(fields[2]) >= int(min_hlog) and (float(fields[11]) / float(ref_scores.get(fields[0]))) >= float(length):
                        try:
                            my_dict_o[fields[0]].append(fields[11])
                            genome_specific_dict[fields[0]].append(fields[11])
                        except KeyError:
                            my_dict_o[fields[0]] = [fields[11]]
                            genome_specific_dict[fields[0]] = [fields[11]]
                    else:
                        continue
            except:
                raise TypeError("problem parsing %s" % f)
            new_dict = {}
            for k, v in genome_specific_dict.iteritems():
                for cluster in clusters:
                    if k == "ID":
                        pass
                    elif k == cluster:
                        try:
                            new_dict.update({k: len(v)})
                        except:
                            new_dict.update({k: "0"})
            for cluster in clusters:
                if cluster not in genome_specific_dict:
                    new_dict.update({cluster: "0"})
            """this is our ordered dictionary"""
            od = collections.OrderedDict(sorted(new_dict.items()))
            ids = OrderedDict({"ID": reduced_name})
            both = OrderedDict(list(ids.items()) + list(new_dict.items()))
            for k, v in both.iteritems():
                if k == "ID":
                    outfile.write(str(v) + "\n")
            for cluster in clusters:
                for k, v in both.iteritems():
                    if k == cluster:
                        outfile.write(str(v) + "\n")
            outfile.close()
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
    """Here's where I write to the reference file, which is the first column of dup_matrix.txt"""
    ref_file.write("ID" + "\n")
    ref_file.write("\n".join(clusters) + "\n")
    ref_file.close()
    try:
        generate_dup_matrix()
        os.system("paste dup_refs.txt dup_values > dup_matrix.txt")
    except:
        print("problem generating duplicate matrix, but we'll continue")
    for k, v in my_dict_o.iteritems():
        if int(len(v)) >= 2:
            dup_dict.update({k: v})
    for k, v in dup_dict.iteritems():
        max_value = max(v)
        for x in v:
            if float(x) / float(max_value) <= max_plog:
                paralogs.append(k)
            else:
                continue
    for k, v in dup_dict.iteritems():
        duplicate_file.write(k + "\n")
    nr = [x for i, x in enumerate(paralogs) if x not in paralogs[i + 1:]]
    paralog_file.write("\n".join(nr) + "\n")
    duplicate_file.close()
    paralog_file.close()
    return nr, dup_dict
def main(directory, genes, blast, processors, remove_gap, keep):
    if blast == 'blastn':
        dependencies = ['blastn', 'makeblastdb', 'muscle']
    else:
        dependencies = ['tblastn', 'makeblastdb', 'muscle']
    for dependency in dependencies:
        ra = subprocess.call(['which', '%s' % dependency])
        if ra == 0:
            pass
        else:
            print "%s is not in your path, but needs to be!" % dependency
            sys.exit()
    start_dir = os.getcwd()
    ap = os.path.abspath("%s" % start_dir)
    dir_path = os.path.abspath("%s" % directory)
    try:
        os.makedirs('%s/to_extract_xxx' % ap)
        os.makedirs('%s/work_xxx' % ap)
    except:
        os.system("rm -rf %s/to_extract_xxx" % ap)
        os.system("rm -rf %s/work_xxx" % ap)
        os.makedirs('%s/to_extract_xxx' % ap)
        os.makedirs('%s/work_xxx' % ap)
    gene_path = os.path.abspath("%s" % genes)
    os.system("cp %s %s/to_extract_xxx/genes.fasta" % (gene_path, ap))
    os.chdir("%s/to_extract_xxx" % ap)
    split_multifasta("genes.fasta")
    os.system("rm genes.fasta")
    os.chdir("%s/work_xxx" % ap)
    """create combined file"""
    num_genomes, names = combined_seqs(dir_path)
    os.system("makeblastdb -in combined.seqs -dbtype nucl > /dev/null 2>&1")
    table_files = glob.glob(os.path.join("%s/to_extract_xxx" % ap, "*.fasta"))
    files_and_temp_names = [(str(idx), os.path.join("%s/to_extract_xxx" % ap, f)) for idx, f in enumerate(table_files)]
    def _perform_workflow(data):
        tn, f = data
        name = run_blast(f, blast)
        """This makes sure that there is only one sequence per genome"""
        os.system("sort -u -k 2,2 '%s.blast.out' > '%s.blast.unique'" % (name, name))
        parsed_blast_to_seqs("%s.blast.unique" % name)
        check_and_align_seqs("%s.extracted.seqs" % name, num_genomes)
        os.system("rm '%s.blast.out' '%s.extracted.seqs'" % (name, name))
    set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
    pull_seqs(names)
    concatenate()
    os.system("cat *.concat > all.concat")
    os.system('sed "s/ //g" all.concat > tmp.concat')
    os.system("awk 'FNR>1' tmp.concat > all.concat")
    if remove_gap == "T":
        remove_gaps("all.concat")
        os.system("cp final_alignment.fasta %s" % ap)
    elif remove_gap == "F":
        os.system("cp all.concat %s/final_alignment.fasta" % ap)
    else:
        print "You have chosen an incorrect option for gap removal, choose from T or F"
        sys.exit()
    """finish up"""
    os.chdir("%s" % ap)
    if keep == "T":
        pass
    elif keep == "F":
        os.system("rm -rf %s/to_extract_xxx %s/work_xxx" % (ap, ap))
    else:
        print "Illegal keep value selected, not doing anything"
        pass