def _locate_flank_bounds(upper_seq, left_key, right_key, flank):
    """Locate the hit inside an upper-cased flanked sequence.

    Returns ``(left_flank_len, right_flank_start)`` where ``left_flank_len``
    is the index of the first hit base and ``right_flank_start`` is the index
    of the first base after the hit.  Returns ``None`` when either key cannot
    be found in ``upper_seq``.
    """
    slack = 20  # how far around the expected position the retry searches
    seq_len = len(upper_seq)

    left_flank_len = upper_seq.find(left_key)
    right_index = upper_seq.rfind(right_key)
    if left_flank_len == -1 or right_index == -1:
        return None

    # The left key may also occur by chance inside the left flank.  If the
    # first occurrence is too early, look again near the expected position.
    # The start is clamped because a negative start would be interpreted
    # relative to the end of the string.
    if left_flank_len < flank:
        retry = upper_seq.find(left_key, max(flank - slack, 0))
        if retry == flank:
            left_flank_len = retry

    # Use the real key length: keys are at most 25 nt but are shorter when
    # the hit itself is shorter.
    key_len = len(right_key)
    right_flank_start = right_index + key_len
    expected_start = seq_len - flank

    # Mirror image of the left retry.  The key has to *end* at the expected
    # flank start, so compare the end of the match (not its start) against
    # ``expected_start``; comparing the start can never succeed because the
    # search window stops only ``slack`` bases past that position.
    if right_flank_start > expected_start:
        retry = upper_seq.rfind(right_key, 0, max(expected_start + slack, 0))
        if retry != -1 and retry + key_len == expected_start:
            right_flank_start = expected_start

    return left_flank_len, right_flank_start


def standardize_flanks(flank_file_path, index_dict, flank, genome_dict2):
    """Make every sequence in a flank FASTA file carry full-length flanks.

    For each record the start and end of the DNA match are located using the
    search keys in ``index_dict``.  A flank shorter than ``flank`` is padded
    with N's.  If a key is not found (or the record is empty) the locus is
    re-extracted from the genome with ``flank`` bases on each side.

    A companion ``<base>.genomic`` FASTA (the locus without flanks) is always
    written next to the input file.

    Parameters:
        flank_file_path: path of the FASTA file with flanked sequences.
            Titles must contain ``Sbjct:<contig> ``, ``Location:(<s> - <e>)``
            and ``Direction:<strand>``.
        index_dict: mapping of sequence name to a dict with the upper-case
            search keys ``'left'`` and ``'right'``.
        flank: requested flank length.
        genome_dict2: genome lookup handed to ``fastaIO.sequence_retriever``.

    Returns:
        ``flank_file_path + "_adj"`` if at least one record was changed (the
        adjusted records are written there), otherwise ``flank_file_path``.

    Reads the module-level ``args.Type`` to decide how record names are
    derived from titles.
    """
    seq_dict = {}
    seq_order = []
    modified = 0
    adj_flank_path = flank_file_path + "_adj"
    print("In standardize_flanks, flank = %s" % (flank,))
    genomic_path = os.path.splitext(flank_file_path)[0] + ".genomic"

    # Context managers guarantee both handles are closed even if a title is
    # malformed or a lookup fails part-way through the file.
    with open(flank_file_path, "r") as flank_in, \
            open(genomic_path, "w") as genomic_out:
        for title, seq in fastaIO.FastaGeneralIterator(flank_in):
            title = title.strip()
            seq_order.append(title)
            seq_dict[title] = seq

            strand = title.split("Direction:")[1].strip()
            contig = title.split("Sbjct:")[1].split(" ")[0]
            locus_str = title.split("Location:(")[1].split(" - ")
            start = int(locus_str[0])
            end = int(locus_str[1].split(")")[0])

            # Get the genomic copy without flanks, as PHI only reports the
            # hit sequence and not the genomic one.
            genomic_seq = fastaIO.sequence_retriever(contig, start, end, 0,
                                                     genome_dict2)
            if strand == 'minus':
                genomic_seq = fastaIO.reverse_complement(genomic_seq)
            genomic_out.write(">" + title + "\n" + genomic_seq + "\n")

            name = title.split(' ')[0] if args.Type == 'nucl' else title

            # Keys are only looked up for non-empty records, so an empty
            # record never triggers a KeyError for a missing name.
            bounds = None
            if seq:
                keys = index_dict[name]
                bounds = _locate_flank_bounds(seq.upper(), keys['left'],
                                              keys['right'], flank)

            if bounds is None:
                # Hit not locatable: go back to the genome for the locus.
                new_seq = fastaIO.sequence_retriever(contig, start, end,
                                                     flank, genome_dict2)
                if strand == 'minus':
                    new_seq = fastaIO.reverse_complement(new_seq)
                seq_dict[title] = new_seq
                modified += 1
                continue

            left_flank_len, right_flank_start = bounds
            right_flank_len = len(seq) - right_flank_start
            # A non-positive repeat count yields an empty string.
            add_left = "N" * (flank - left_flank_len)
            add_right = "N" * (flank - right_flank_len)
            if add_left or add_right:
                seq_dict[title] = add_left + seq + add_right
                modified += 1

    if not modified:
        return flank_file_path

    with open(adj_flank_path, "w") as flank_out:
        for title in seq_order:
            flank_out.write(">" + title + "\n" + seq_dict[title] + "\n")
    return adj_flank_path
args = sys.argv[1:]


def usage():
    """Print the command-line synopsis and exit with status -1."""
    print(""" usage: python summarize_protein_matches.py <nonredundant_pep_file> <output_file> """)
    sys.exit(-1)


# Exactly two arguments are required; any recognised help flag in the first
# position also shows the usage text.
if len(args) != 2 or sys.argv[1] in ('-h', '-help', '-H', '-Help', '--h',
                                     '--help'):
    usage()

# Tally of records per hit class (the text following the strand marker).
track_dict = defaultdict(int)

with open(sys.argv[1], "r") as f, open(sys.argv[2], "w", 1) as out:
    for title, seq in fastaIO.FastaGeneralIterator(f):
        # The class is whatever follows the last "plus_" marker, or failing
        # that the last "minus_" marker; titles with neither count under ''.
        hit_class = ''
        for marker in ("plus_", "minus_"):
            if marker in title:
                hit_class = title.rsplit(marker, 1)[1]
                break
        if hit_class == "?":
            hit_class = "Undetermined"
        track_dict[hit_class] += 1

    # One "<class>TAB<count>" line per class.
    for key in track_dict:
        out.write(key + "\t" + str(track_dict[key]) + "\n")
def runTarget(query, blast_out, blast_file_out, path):
    """Run the BLAST -> PHI -> MAFFT -> FastTreeMP pipeline for one query.

    Parameters:
        query: path of the query FASTA file.
        blast_out: output directory; it is created here with os.mkdir.
        blast_file_out: base path used for the BLAST/PHI output files.
        path: directory that contains parse_target_list.py.

    Returns None.  The run stops early after the BLAST, PHI or MSA stage
    when args.S is 'Blast', 'PHI' or 'MSA' respectively.

    Reads the module-level ``args`` (Type, S, E, W, genome, p_f, a, f, P).

    NOTE(review): block nesting was reconstructed from a source whose
    whitespace had been collapsed -- confirm against the original layout,
    especially the placement of ``processed += 1`` and of the
    "Length of split list in" print.
    """
    #make output directory
    os.mkdir(blast_out)

    #make command log file
    log_out = open(os.path.join(blast_out, "log.txt"), "w")
    print >> log_out, " ".join(sys.argv)
    log_out.close()

    #use blastn if DNA
    if args.Type == 'nucl':
        print "Using BLASTN\n"
        BLASTN(query, blast_file_out)
    #use tblastn if protein
    elif args.Type == 'prot':
        print "Using TBLASTN\n"
        TBLASTN(query, blast_file_out)

    #make svg drawing(s)
    print "Making svg image of blast results\n"
    Blast_draw(blast_file_out)

    #convert svg image to jpg
    print "Converting svg to jpg\n"
    for svg_file in glob.glob(str(blast_file_out) + "*.svg"):
        jpg_file = os.path.splitext(svg_file)[0]
        jpg_file = jpg_file + ".jpg"
        img_convert(svg_file, jpg_file)

    # Stop here when only the BLAST stage was requested.
    if args.S == 'Blast':
        return

    blast_in = str(blast_file_out) + ".blast"
    PHI_out = str(blast_file_out)
    print "Blast in:", blast_in + " PHI out:", PHI_out
    print "Running PHI"
    PHI(blast_in, PHI_out, query)
    print "PHI finished!\n"

    filter_list = str(blast_file_out) + ".list"
    #print "filter list path:", filter_list
    filter_path = os.path.join(path, "parse_target_list.py")
    #print "filter script path:", filter_path
    #print args.E
    if args.E == True:
        #print "E is true!"
        # Keep a copy of the unfiltered list, then run the external filter
        # script on the list file.
        subp.call(["cp", filter_list, filter_list + "_ori.list"])
        subp.call(["python", filter_path, filter_list, str(args.W)])
        time.sleep(1)
        # NOTE(review): `Type` is not bound in this function; presumably a
        # module-level name (args.Type is used everywhere else) -- confirm.
        PHI_draw(filter_list + "_ori", Type)
        img_convert(filter_list + "_ori" + ".tcf_drawer.svg",
                    filter_list + "_ori" + ".tcf_drawer.pdf")

    #make svg image of PHI homologs
    print "Making svg image of homologs\n"
    PHI_draw(PHI_out, Type)

    #convert svg to pdf
    print "Coverting svg image to pdf\n"
    img_convert(
        str(PHI_out) + ".tcf_drawer.svg",
        str(PHI_out) + ".tcf_drawer.pdf")

    #get query length
    # If the query file holds several records, the length of the last one
    # is the value that is kept.
    query_in = open(query, "r")
    query_len = 0
    for title, seq in fastaIO.FastaGeneralIterator(query_in):
        query_len = len(seq)
    query_in.close()

    #check that two or more copies were found and setup index checker to check for correct length flanks and for the correct locus sequence
    print "Building index dict"
    copies = 0
    index_dict = defaultdict(dict)
    dna_copies_in = open(PHI_out + ".dna", "r")
    for title, seq in fastaIO.FastaGeneralIterator(dna_copies_in):
        #print "Seq_name:", title, "\nSeq_len:", len(seq), "\n"
        # First and last (up to) 25 bases of each hit, upper-cased; these
        # are the search keys passed to standardize_flanks.
        index_dict[title]['left'] = seq[:25].upper()
        index_dict[title]['right'] = seq[-25:].upper()
    dna_copies_in.close()

    flank_file_path = PHI_out + ".flank"

    # Load the whole genome, keyed by the first space-delimited word of
    # each title.
    genome_dict = {}
    genome_in = open(str(args.genome), "r")
    for title, seq in fastaIO.FastaGeneralIterator(genome_in):
        title2 = title.split(" ")[0]
        genome_dict[title2] = seq
    genome_in.close()

    # standardize_flanks returns the path to use from here on.
    flank_file_path = standardize_flanks(flank_file_path, index_dict,
                                         args.p_f, genome_dict)

    # Count the records in the flank file.
    flank_copies_in = open(flank_file_path, "r")
    for title, seq in fastaIO.FastaGeneralIterator(flank_copies_in):
        copies += 1
    flank_copies_in.close()

    # Stop here when only the PHI stage was requested.
    if args.S == 'PHI':
        return

    if copies >= 2:
        # filter_list: [input path, output base path] pairs to length-filter.
        # in_list: FASTA paths that will be aligned.
        # Note that filter_list is rebound here; above it was a path string.
        filter_list = []
        in_list = []
        if args.a == 'hits' or args.a == 'both':
            print "hits will be aligned"
            if args.f > 0:
                print "Filtering flagged for hits\n"
                if args.Type == 'nucl':
                    filter_list.append([
                        PHI_out + ".dna",
                        PHI_out + ".dna_filter-" + str(args.f)
                    ])
                else:
                    in_list.append(str(PHI_out) + ".aa")
            else:
                if args.Type == 'nucl':
                    in_list.append(str(PHI_out) + ".dna")
                else:
                    in_list.append(str(PHI_out) + ".aa")

        if args.a == 'flanks' or args.a == 'both':
            print "Flanks will be aligned"
            if args.f > 0:
                print "Filtering flagged for flanks\n"
                if args.Type == 'nucl':
                    filter_list.append([
                        flank_file_path,
                        PHI_out + ".flank_filter-" + str(args.f)
                    ])
                else:
                    in_list.append(flank_file_path)
            else:
                in_list.append(flank_file_path)

        print "Entries in filter list: ", len(filter_list), "\n"
        if len(filter_list) != 0:
            #print "in_list =\n", in_list
            for in_path, out_path_base in filter_list:
                in_file = open(in_path, "r")
                out_path = out_path_base + "_under"
                out_path2 = out_path_base + "_over"
                out_file = open(out_path, "w")
                out_file2 = open(out_path2, "w")
                for title, seq in fastaIO.FastaGeneralIterator(in_file):
                    # Two flank lengths are subtracted from every record
                    # before comparing against query_len * args.f.
                    copy_len = len(seq) - (int(args.p_f) * 2)
                    if copy_len <= (query_len * args.f):
                        print >> out_file, ">" + title + "\n" + seq
                    else:
                        print >> out_file2, ">" + title + "\n" + seq
                # Only the "_under" file goes on to alignment.
                in_list.append(out_path)
                in_file.close()
                out_file.close()
                out_file2.close()

        #Run Mafft
        in_count = len(in_list)
        processed = 0
        for in_path in in_list:
            split_list = []
            in_file = open(in_path, "r")
            # `copies` is reused as the record count of the current file.
            copies = 0
            for title, seq in fastaIO.FastaGeneralIterator(in_file):
                copies += 1
            in_file.close()
            print str(copies) + " copies in " + in_path, "\n"

            if copies >= 601:
                print "Shuffling and splitting file for seperate alignments\n"
                # shuffle_split also returns a replacement value for copies.
                split_list, copies = fastaIO.shuffle_split(in_path, 350)
                print "Length of split list in:", len(split_list)
            print "Length of split list out:", len(split_list)

            if len(split_list) > 0:
                # This loop rebinds the `path` parameter; the parameter's
                # only use is above, when building filter_path.
                for path in split_list:
                    msa_out = path + ".msa"
                    print "Running Mafft"
                    if args.Type == 'nucl':
                        MAFFT_NT(path, msa_out)
                    else:
                        MAFFT_P(path, msa_out)
                    if not os.path.exists(msa_out):
                        print "MAFFT alignment failed and is most likely because of not enough RAM. Please rerun TARGeT on this query with increased RAM and/or fewer processors. TARGeT is now exiting."
                        # NOTE(review): bare exit() is presumably the helper
                        # installed by the site module -- confirm it is
                        # available wherever this runs.
                        exit(1)
                processed += 1
                # In MSA-only mode skip tree building; return once the last
                # input file has been aligned.
                if args.S == 'MSA':
                    if (in_count - processed) == 0:
                        return
                    else:
                        continue
            else:
                msa_out = in_path + ".msa"
                print "Running Mafft"
                if args.Type == 'nucl':
                    MAFFT_NT(in_path, msa_out)
                else:
                    MAFFT_P(in_path, msa_out)
                processed += 1
                if not os.path.exists(msa_out):
                    print "MAFFT alignment failed and is most likely because of not enough RAM. Please rerun TARGeT on this query with increased RAM and/or fewer processors. TARGeT is now exiting."
                    exit(1)
                #print "in_count - processed = ", in_count - processed, "\n"
                if args.S == 'MSA':
                    if (in_count - processed) == 0:
                        return
                    else:
                        continue

            #Run FastTreeMP
            # Picks up every file whose name starts with in_path and ends
            # in ".msa".
            msa_list = glob.glob(in_path + "*.msa")
            print "FastTreeMP will run on", len(msa_list), " MSA(s)\n"
            c = 0
            for msa_out in msa_list:
                print "Running FastTreeMP on MSA", c
                tree_out = msa_out + ".nw"
                print "Output tree path: ", tree_out, "\n\n"
                #Can only limit FastTree processor use through OMP_NUM_THREADS. Otherwise, it will use all processors available.
                current_env = os.environ.copy()
                #print "current_env before change: ", current_env
                current_env['OMP_NUM_THREADS'] = str(args.P)
                #print "OMP_NUM_THREADS after change: ", current_env['OMP_NUM_THREADS'], "\n\n"
                # "-nt" is passed for nucleotide runs and for protein runs
                # whose alignment path contains ".flank".
                if args.Type == 'nucl' or (args.Type == 'prot'
                                           and ".flank" in msa_out):
                    proc = subp.Popen([
                        "FastTreeMP", "-nt", "-gamma", "-out", tree_out,
                        msa_out
                    ], env=current_env)
                    proc.wait()
                    print "\nFastTreeMP finished.\n"
                else:
                    proc = subp.Popen(
                        ["FastTreeMP", "-gamma", "-out", tree_out, msa_out],
                        env=current_env)
                    proc.wait()
                    print "\nFastTreeMP finished.\n"

                print "Converting output tree file to eps image\n"
                out = open(tree_out + ".eps", "w")  #open output file for redirected stdout
                #print "Eps image out path: ", out
                # Above 45 copies an explicit image size derived from the
                # copy count is passed to treebest.
                if copies > 45:
                    height = copies * 13
                    width = round(height / 3)
                    print "Image height: ", height, "\twidth: ", width, "\n"
                    subp.call([
                        "treebest", "export", "-y", str(height), "-x",
                        str(width), "-b", "4.5", "-f", "13", "-m", "40",
                        tree_out
                    ], stdout=out)
                else:
                    subp.call(["treebest", "export", tree_out], stdout=out)
                out.close()  #close output file

                print "Coverting eps image to pdf\n"
                subp.call(["convert", tree_out + ".eps", tree_out + ".pdf"])
                c += 1
    else:
        print "Less than two copies found. Multiple alignment and tree building will not be performed.\n"
""" sys.exit(-1) bed_list = [] pat = re.compile( r"Sbjct:(.+)[_| ]Length.+Location:\(([0-9]*)[_|\s]*-[_|\s]*([0-9]*)\).*Direction:(.+)" ) for root, dirs, files in os.walk(sys.argv[1]): for filename in files: if fnmatch.fnmatch(filename, '*.flank'): fpath = os.path.join(root, filename) in_handle = open(fpath, "r") for title, seq in fastaIO.FastaGeneralIterator(in_handle): m = pat.search(title) if m: contig = m.group(1) if "_" in contig: contig = contig.split("_")[0] start = m.group(2) if int(start) > 0: start = str(int(start) - 1) end = m.group(3) strand = m.group(4) if strand == 'plus': strand = "+" elif strand == 'minus': strand = "-"
import os.path
import fnmatch
import fastaIO

# Header of every generated PBS job script.  Each directive and command must
# sit on its own line: on a single line everything after "#!/bin/bash" is
# part of the comment and nothing would run.  The trailing space separates
# the python script path from the directory argument appended below.
top = '''#!/bin/bash
#PBS -l nodes=1:ppn=2,walltime=03:00:00
module load stajichlab
module load stajichlab-python
module load cd-hit
python /rhome/bradc/cd-hit_protein_dna2.py '''

# Usage: <script> <search_root> <output_prefix>
# Walk <search_root>; for every non-empty "*_fix.dna" FASTA file holding at
# least two records, write "<output_prefix><n>.sh", a job script that runs
# cd-hit_protein_dna2.py on the directory containing that file.
c = 1
for root, dirs, files in os.walk(sys.argv[1]):
    for filename in files:
        if not fnmatch.fnmatch(filename, '*_fix.dna'):
            continue
        fpath = os.path.join(root, filename)
        if os.stat(fpath).st_size == 0:
            continue

        # Count the records; files with fewer than two get no job script.
        with open(fpath, "r") as in_handle:
            d = sum(1 for _ in fastaIO.FastaGeneralIterator(in_handle))
        if d < 2:
            continue

        full = top + root
        with open(sys.argv[2] + str(c) + ".sh", "w") as out_handle:
            out_handle.write(full + "\n")
        c += 1
out_contents = [] #go through rest of the group output folder contents, seleting files to be combined out_contents = os.listdir(i) #print "out content length:", len(out_contents) for files in out_contents: if fnmatch.fnmatch(files, '*.fasta'): fpath = os.path.join(i, files) in_file = open(fpath, "r") for line in in_file: line = line.strip() fasta.append(line) in_file.close() in_file = open(fpath, "r") for title, seq in fastaIO.FastaGeneralIterator(in_file): #print "copy title:", title element_info_dict['total_len'] += len(seq) sanity_copy_count += 1 copy_dict[title] = 1 print "copy count now:", sanity_copy_count in_file.close() break out_contents = [] #go through rest of the group output folder contents, seleting files to be combined out_contents = os.listdir(i) for files in out_contents: for keys in element_info_dict['tsd_len']: if fnmatch.fnmatch(files, '*.insertion-site' + str(keys) + '.fa'):
if c < 4: print ' '.join(["Unknown:", name, new_name]) if name not in wanted_dict: wanted_dict[name] = new_name c += 1 no_match_in.close() print "\n" #for key in wanted_dict: # print key, " ", wanted_dict[key] if sys.argv[5] != "na": c = 0 #import hit sequence file hit_in = open(sys.argv[5], "r") hit_track = OrderedDict() for title, seq in fastaIO.FastaGeneralIterator(hit_in): if arg_len == 9: if "plus_" in title: title = title.rsplit("_plus", 1)[0] elif "minus_" in title: title = title.rsplit("_minus", 1)[0] else: if "plus_" in title: title = title.rsplit("plus_", 1)[0] + "plus" elif "minus_" in title: title = title.rsplit("minus_", 1)[0] + "minus" if c < 4: print "hit title:", title if title in wanted_dict: