def tamo2tf(TAMO_file):
    '''Converts a TAMO motif file to the TRANSFAC format.

    The motifs are read with MotifTools.txt2motifs and written to
    "<TAMO_file>.tf".  Each motif becomes one record:

      * a "DE" line carrying the motif name twice (tab separated).  The name
        is m.source, or "<file name>_<1-based index>" when m.source is empty;
      * one line per position: the 0-based position followed by
        int(2 ** logP * 100) for A, C, G and T, each followed by a tab;
      * a closing "XX" line.

    TAMO_file -- path of the TAMO file to convert.
    '''
    ml = MotifTools.txt2motifs(TAMO_file)
    TAMO_file_name = TAMO_file.split("/")[-1]
    ACGT = ["A", "C", "G", "T"]

    # The context manager closes the output even if a motif fails to convert.
    with open("%s.tf" % (TAMO_file), "w") as oup:
        for n, m in enumerate(ml, 1):
            if m.source == "":
                name = "%s_%s" % (TAMO_file_name, n)
            else:
                name = m.source
            oup.write("DE\t%s\t%s\n" % (name, name))

            for i in range(m.width):
                oup.write("%s\t" % i)
                # A motif without log-probabilities only gets its position
                # index written, exactly as before.
                if m.logP:
                    for letter in ACGT:
                        Pij = pow(2.0, m.logP[i][letter])
                        oup.write("%s\t" % int(Pij * 100))
                oup.write("\n")
            oup.write("XX\n")
def parse_opts():
    '''Parses the command line into GLOBALS and configures the distance metric.

    Options:
      -m FILE       motifs, loaded with MotifTools.txt2motifs into
                    GLOBALS['motifs'] (defaults to an empty list)
      -d VALUE      stored as a float in the module-level DMAX
      --dfunc NAME  distance metric; "NCB" selects
                    MotifCompare.negcommonbits, any other name is looked up
                    as an attribute of MotifCompare

    Positional arguments are stored in GLOBALS['args'].  When a metric was
    selected it is installed with set_dfunc(metric, DMAX).
    '''
    global GLOBALS
    global DFUNC, DMAX
    # '-d' is handled below, so it has to be declared here; getopt long
    # options that take a value are declared with a trailing '='.
    short_opts = 'm:d:'
    long_opts = ['dfunc=']
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError as err:
        print(str(err))
        # NOTE(review): usage() is presumably expected to exit; if it
        # returns, 'opts' is unbound below, as in the original code.
        usage()
    if not opts:
        usage()

    GLOBALS['args'] = args
    GLOBALS['motifs'] = []
    DFUNCtxt = None
    for opt, value in opts:
        if opt == '-m':
            GLOBALS['motifs'] = MotifTools.txt2motifs(value)
        elif opt == '--dfunc':
            DFUNCtxt = value
        elif opt == '-d':
            DMAX = float(value)

    # Deal with DFUNC and DMAX
    _DFUNC = None  # stays None when no --dfunc was given
    if DFUNCtxt == 'NCB':
        _DFUNC = MotifCompare.negcommonbits
    elif DFUNCtxt:
        # Plain attribute lookup instead of exec'ing command-line text.
        _DFUNC = getattr(MotifCompare, DFUNCtxt, None)
        if _DFUNC is None:
            usage("No such distance metric: %s" % DFUNCtxt)
    if _DFUNC:
        # NOTE(review): DMAX is assumed to have a module-level default for
        # runs that do not pass -d -- confirm.
        set_dfunc(_DFUNC, DMAX)
def pick_chunk_score(wdir, TAMO_file, target, genome):
    '''Trims the motifs of one cluster and saves its top motif.

    This takes in the TAMO file holding the motifs of a single cluster.  It
    trims the low-information ends from each motif, scores the trimmed motifs
    with MotifMetrics.py against the target genes, and writes the motif that
    parse_out_pcs reports as the top one to "<TAMO_file>.TOP".  If
    parse_out_pcs reports the string "None", an empty "<TAMO_file>.TOP" is
    created instead.  Nothing is returned.

    wdir      -- working directory; the process changes into it
    TAMO_file -- TAMO file with the motifs of the cluster
    target    -- genes of interest, passed to MotifMetrics.py
    genome    -- FASTA of promoter sequences, passed with -genome
    '''
    # Directory holding this script and MotifMetrics.py.  It is resolved
    # before changing directory because abspath() is relative to the current
    # directory.
    # NOTE(review): if a caller has already changed directory and __file__ is
    # relative, this is still resolved against the caller's directory.
    script_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1])

    # os.chdir is what moves this process; a "cd" run through os.system only
    # affects a throw-away subshell, so it is not issued.
    os.chdir(wdir)

    trimmed = "%s_0.1.trim" % TAMO_file
    metrics_out = "%s_0.1.trim_Cout" % TAMO_file

    ##
    # step 1 trim tamo to eliminate low information flanking sequence
    trim_motif(TAMO_file, 0.1)

    ##
    # step 2 Group Specificity Score" from the Church lab
    # python MotifMetrics.py [Genes of interest] -genome [FASTA of promoter
    # sequence] -t [Trimmed TAMO of cluster motifs]
    # MotifMetrics.py checks if the motifs appear disproportionately in the
    # targets compared to the rest of the genes.
    os.system(
        "python %s/MotifMetrics.py %s -genome %s -t %s -spec > %s"
        % (script_dir, target, genome, trimmed, metrics_out))

    ##
    # Gets the motif that is most significantly represented in the target
    # genes.  parse_out_pcs signals "no such motif" with the string "None".
    topm = parse_out_pcs(metrics_out)
    print("topm %s" % (topm,))

    if topm != "None":
        ##
        # Writes the top motif to its own file.  Keying by the one-letter
        # form keeps a single motif when several share it.
        newdic = {}
        for m in MotifTools.txt2motifs(trimmed):
            if m.oneletter == topm:
                newdic[m.oneletter] = m
        save_motifs(list(newdic.values()), "%s.TOP" % TAMO_file)

        # Best-effort removal of the intermediate files, without a shell.
        for leftover in (trimmed, metrics_out):
            try:
                os.remove(leftover)
            except OSError as err:
                print("could not remove %s: %s" % (leftover, err))
    else:
        ##
        # Writes a blank document if there was no top motif.
        oup = open("%s.TOP" % TAMO_file, "w")
        oup.close()
def TAMO_split(TAMO_file, motifs_per_file=190):
    '''Splits a TAMO file into smaller files for create_cc.

    The motifs are written, in order, to "<TAMO_file>_n0",
    "<TAMO_file>_n1", ... with motifs_per_file motifs each.  The remainder
    goes to "<TAMO_file>_n<total>"; that last file is written even when it
    holds no motifs.

    TAMO_file       -- path of the TAMO file to split
    motifs_per_file -- number of motifs per full chunk

    Returns the number of full chunks, which is also the index of the last
    (remainder) file.
    '''
    ml = MotifTools.txt2motifs(TAMO_file)
    # Coerce once so the chunk count and the slice bounds always agree.
    by = int(motifs_per_file)
    total = len(ml) // by  # Number of full chunks

    for i in range(total):
        chunk_name = TAMO_file + '_n%s' % i
        print(i)
        print("%s %s" % (i * by + by, chunk_name))
        save_motifs(ml[i * by:i * by + by], chunk_name)

    last_name = TAMO_file + '_n%s' % (total)
    print("%s %s %s" % (total * by, len(ml), last_name))
    save_motifs(ml[total * by:len(ml)], last_name)
    return total
def combine_distance_matrix_for_2(wdir, TAMO_file_1, TAMO_file_2,
                                  motifs_per_file=100):
    '''Combines the matrices made from two TAMO files.

    This is used to create the final matrix after all jobs from
    create_cc_for_2 are complete.  For every chunk i of TAMO_file_1 the
    files "<TAMO_file_1>_n<i>-<TAMO_file_2>_n<j>.dm" are pasted side by side
    into "distance_<i>"; the "distance_<i>" files are then concatenated into
    "<TAMO_file_1>-<TAMO_file_2>.dm".

    wdir            -- working directory; the process changes into it
    TAMO_file_1     -- TAMO file providing the rows
    TAMO_file_2     -- TAMO file providing the columns
    motifs_per_file -- chunk size the chunk indices are derived from
                       (default 100, the value previously hard-coded)
    '''
    ml_1 = MotifTools.txt2motifs(TAMO_file_1)
    ml_2 = MotifTools.txt2motifs(TAMO_file_2)
    n_split_1 = len(ml_1) // motifs_per_file
    n_split_2 = len(ml_2) // motifs_per_file
    print("%s %s" % (n_split_1, len(ml_1)))
    print(n_split_2)

    # Change to the working directory.  (A "cd" run through os.system would
    # only affect a throw-away subshell.)
    os.chdir(wdir)

    # Paste the matrices of each row chunk side by side.
    for i in range(n_split_1 + 1):
        row_files = ["%s_n%s-%s_n%s.dm " % (TAMO_file_1, i, TAMO_file_2, j)
                     for j in range(n_split_2 + 1)]
        com = "paste " + "".join(row_files) + "> distance_%s" % i
        print(com)
        os.system(com)

    # Stack the row chunks on top of each other.
    strips = ["distance_%s " % i for i in range(n_split_1 + 1)]
    com = "cat " + "".join(strips) + "> %s-%s.dm" % (TAMO_file_1, TAMO_file_2)
    print(com)
    os.system(com)
def parse_opts():
    '''Parses the command line into GLOBALS.

    Options:
      -m FILE              motifs, loaded with MotifTools.txt2motifs into
                           GLOBALS['motifs']
      -g / --genome FILE   stored in GLOBALS['genomefile']
      --top N              stored as an int in GLOBALS['top']

    Positional arguments are stored in GLOBALS['args'].
    '''
    global GLOBALS
    short_opts = 'm:g:'
    long_opts = ['genome=', 'top=']
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError as err:
        # Report the actual problem rather than the exception class dict.
        print(str(err))
        # NOTE(review): usage() is presumably expected to exit; if it
        # returns, 'opts' is unbound below, as in the original code.
        usage()
    if not opts:
        usage()

    GLOBALS['args'] = args
    for opt, value in opts:
        if opt == '-m':
            GLOBALS['motifs'] = MotifTools.txt2motifs(value)
        elif opt in ('-g', '--genome'):
            GLOBALS['genomefile'] = value
        elif opt == '--top':
            GLOBALS['top'] = int(value)
def main():
    '''Scans FASTA probes with motifs and prints a text map of the hits.

    Options:
      -f FILE   FASTA file with the probes
      -m FILE   motifs, loaded with MotifTools.txt2motifs (repeatable)
      -i FILES  comma separated TAMO motif files, loaded with MotifTools.load
      -a TEXT   comma separated motif texts (MotifTools.Motif_from_text)
      -n LIST   comma separated indices of the motifs to use (default: all)
      -L TEXT   labels, one character per selected motif
      -t FRAC   score threshold as a fraction of a motif's maxscore (0.75)
      -S SCALE  map characters per sequence position (default 50/1000)

    Probes containing any of the characters B, D, H or U are skipped.
    '''
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:i:",
                                   ["help", "output="])  # AD added 'i'
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if not opts:
        usage()
        sys.exit(1)

    print("#" + ' '.join(sys.argv))
    # AD changed thresh val to 0.75 from 0.7
    fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.75)
    ambigs = []
    scale = 50.0 / 1000.0

    motifs = []
    for opt, value in opts:
        if opt == '-f':
            fastafile = value
        elif opt == '-m':
            motifs.extend(MotifTools.txt2motifs(value))
        elif opt == '-n':
            motifnums = [int(x) for x in value.split(',')]
        elif opt == '-L':
            labels = list(value)
        elif opt == '-t':
            thresh = float(value)
        elif opt == '-a':
            ambigs.extend(value.split(','))
        elif opt == '-S':
            scale = float(value)
        elif opt == '-i':
            # AD added this option to actually supply the tamo motif file at
            # the command line; the code handling motif files already existed.
            motiffile = value

    probes = Fasta.load(fastafile)

    if motiffile:
        # AD: several tamo files may be supplied, comma separated.
        for f in motiffile.split(','):
            motifs.extend(MotifTools.load(f))
    for ambig in ambigs:
        motifs.append(MotifTools.Motif_from_text(ambig, 0.1))
    if not motifnums:
        motifnums = list(range(len(motifs)))
    print('# %d: %s' % (len(motifs), motifnums))

    # Header: one line per selected motif with its absolute threshold.
    for i, num in enumerate(motifnums):
        motif = motifs[num]
        if labels and i < len(labels):
            txt = labels[i]
        else:
            txt = '%d' % i
        print('%-3s : %s %5.2f (%4.2f)' % (txt, motif, thresh * motif.maxscore, thresh))

    probehits = {}
    for key in probes.keys():
        if re.search('[BDHU]', probes[key]):
            continue
        hits_by_motif = []
        save_flag = 0
        for num in motifnums:
            scanner = motifs[num]
            # Threshold against this motif's own maxscore; the value left
            # over from the header loop belongs to the last listed motif.
            result = scanner.scan(probes[key], thresh * scanner.maxscore)
            if result[0]:
                hits_by_motif.append(result)
                save_flag = 1
            else:
                hits_by_motif.append(None)
        if save_flag:
            probehits[key] = hits_by_motif

    maxw = 40
    for key in probehits.keys():
        l = len(probes[key])
        # One '-' per scaled position, plus ten spaces so markers that are
        # shifted right still have a slot to index.
        a = list('-' * int(scale * l))
        a.extend(list(' ' * 10))
        desc = []
        matches = probehits[key]
        for i in range(len(matches)):
            if not matches[i]:
                continue
            if labels and (i < len(labels)):
                ID = labels[i]
            else:
                ID = '%d' % i
            subseqs, endpoints, scores = matches[i]
            for idx in range(len(subseqs)):
                start, stop = endpoints[idx]
                desc.append('%s %s %d-%d %4.2f ' % (ID, subseqs[idx], start, stop, scores[idx]))
                start = int(start * scale)
                # Drop the marker on the first free '-' at or after the hit.
                for offset in range(10):
                    if a[start + offset] == '-':
                        a[start + offset] = ID
                        break
        print('%s %s %s' % ('%-14s %s' % (key, ''.join(a)),
                            ' ' * max(0, maxw - len(a)),
                            '| '.join(['%-27s' % x for x in desc])))
    print("")
    print("Found matches in %d of %d input probes" % (len(probehits), len(probes)))
def merge_runs_cc(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''Writes per-cluster TAMO files and the job list that merges them.

    The script was originally written by Cheng Zou, and then converted to a
    function by Alex Seddon.

    The cluster file "<TAMO_file>.dm_UPGMA_Cl_<height>" holds one cluster
    index per line, in the order of the motifs in TAMO_file.  For every
    cluster with more than one motif the motifs are saved to
    "<TAMO_file>_sub_<cluster>.tm" and one command line is appended to the
    file "merge_runs_cc" (merge_runs_no_ancestor when ancestor == 0,
    merge_runs_ancestor when ancestor == 1).  Motifs that are alone in their
    cluster are saved to "<TAMO_file>_sub_old.tm".  With any other ancestor
    value no cluster is processed and both output files are left empty.

    TAMO_file -- TAMO file with all motifs
    wdir      -- working directory; the process changes into it
    height    -- tree cut height, part of the cluster file name
    distance  -- unused here
    ancestor  -- 0 or 1, selects the merge command that is queued
    target    -- passed on to the queued command as -target
    genome    -- passed on to the queued command as -genome
    '''
    print("Here are the parameters you specified in this run ")
    print("-tamo %s" % TAMO_file)
    print("-wdir %s" % wdir)
    print("-h height to cut the tree, %s" % height)
    print("-ancestor %s" % ancestor)
    print("-target %s" % target)
    print("-genome %s" % genome)

    if TAMO_file == '' or wdir == '':
        help()

    # Get the directory where the script is located.  This is resolved before
    # changing directory because abspath() is relative to the current
    # directory.
    script_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1])

    # os.chdir is what moves this process; a "cd" run through os.system only
    # affects a throw-away subshell, so it is not issued.
    os.chdir(wdir)

    # Build cl_dic = {cluster_index: {motif_index: '1'}}.  Two motifs with
    # the same cluster index are part of the same cluster.
    cl_dic = {}
    with open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r") as cluster_file:
        for n, line in enumerate(cluster_file):
            cl = line.strip()
            cl_dic.setdefault(cl, {})[n] = "1"

    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # List of motifs that are the sole members of a cluster.

    print("%s %s" % (ancestor, ancestor == 0))

    # The two modes only differ in the command that is queued.
    if ancestor == 0:
        job_template = (
            'module load TAMO; python %s/pcc_merge_CC.py merge_runs_no_ancestor'
            ' -t %s/%s -i %s -target %s -genome %s\n')
    elif ancestor == 1:
        job_template = (
            'module load TAMO; module load STAMPmotif; python %s/pcc_merge_CC.py'
            ' merge_runs_ancestor -t %s/%s -i %s -target %s -genome %s\n')
    else:
        job_template = None

    # The job file is created even when nothing is queued.
    with open('merge_runs_cc', 'w') as cc_output:
        if job_template is not None:
            for i in cl_dic:
                members = cl_dic[i]
                print("%s %s" % (i, members))
                if len(members) > 1:
                    # Save the motifs of the cluster to their own TAMO file
                    # and queue the merge job for it.
                    mlist = [ml[j] for j in members]
                    save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))
                    cc_output.write(
                        job_template
                        % (script_dir, wdir, TAMO_file, i, target, genome))
                else:
                    # A lone motif is left alone and added to old.
                    old.append(ml[list(members)[0]])

    # Save the motifs that are in their own cluster.
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
def merge_runs(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''Merges the motifs of each cluster and collects the top motifs.

    The script was originally written by Cheng Zou, and then converted to a
    function by Alex Seddon.

    The cluster file "<TAMO_file>.dm_UPGMA_Cl_<height>" holds one cluster
    index per line, in the order of the motifs in TAMO_file.  For every
    cluster with more than one motif the motifs are saved, converted to
    TRANSFAC format, (with ancestor == 1) extended with the STAMP output,
    converted back to TAMO format, and handed to pick_chunk_score, which
    writes the cluster's ".TOP" file.  Motifs that are alone in their
    cluster go to "<TAMO_file>_sub_old.tm", the ".TOP" files are gathered in
    "<TAMO_file>_sub_new.tm", and both are concatenated into
    "<TAMO_file>_P1.tm".  With an ancestor value other than 0 or 1 no
    cluster is processed.

    TAMO_file -- TAMO file with all motifs
    wdir      -- working directory; the process changes into it
    height    -- tree cut height, part of the cluster file name
    distance  -- only reported
    ancestor  -- 0: merge the cluster motifs only; 1: also run STAMP
    target    -- passed on to pick_chunk_score
    genome    -- passed on to pick_chunk_score
    '''
    print("Here are the parameters you specified in this run ")
    print("-tamo %s" % TAMO_file)
    print("-wdir %s" % wdir)
    print("-h height to cut the tree, %s" % height)
    print("-distance %s" % distance)
    print("-ancestor %s" % ancestor)
    print("-target %s" % target)
    print("-genome %s" % genome)

    if TAMO_file == '' or wdir == '':
        help()

    # pick_chunk_score changes into wdir again, so a relative path would no
    # longer resolve once this process has moved; make it absolute first.
    # (A "cd" run through os.system only affects a throw-away subshell, so
    # it is not issued.)
    wdir = os.path.abspath(wdir)
    os.chdir(wdir)

    # Build cl_dic = {cluster_index: {motif_index: '1'}}.  Two motifs with
    # the same cluster index are part of the same cluster.
    cl_dic = {}
    with open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r") as cluster_file:
        for n, line in enumerate(cluster_file):
            cl = line.strip()
            cl_dic.setdefault(cl, {})[n] = "1"

    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # List of motifs that are the sole members of a cluster.

    print("%s %s" % (ancestor, ancestor == 0))

    if ancestor == 0 or ancestor == 1:
        for i in cl_dic:
            members = cl_dic[i]
            print("%s %s" % (i, members))

            if len(members) <= 1:
                # A lone motif is left alone and added to old.
                old.append(ml[list(members)[0]])
                continue

            prefix = "%s_sub_%s" % (TAMO_file, i)

            # Save the motifs of the cluster to their own TAMO file and
            # convert it to TRANSFAC format.
            mlist = [ml[j] for j in members]
            save_motifs(mlist, "%s.tm" % prefix)
            tamo2tf("%s.tm" % prefix)

            if ancestor == 1:
                # Gets the JASPAR motifs that best match the motifs from
                # within the cluster.
                os.system(
                    ("STAMP -tf %s.tm.tf -sd /home/chengzou/bin/STAMP/ScoreDists/JaspRand_PCC_SWU.scores "
                     "-go 1000 -ge 1000 -cc PCC -align SWU -out %s.tm.tf_STout -chp > %s.tm.tf_STout.log")
                    % (prefix, prefix, prefix))
                parse_out_STAMP(TAMO_file, i)
                # Combines the STAMP motifs with the cluster motifs.
                os.system(
                    "cat %s.tm.tf %s.tm.tf_SToutFBP.txt.mod %s.tm.tf_STout_tree_clusters.txt > %s_sum.tm.tf"
                    % (prefix, prefix, prefix, prefix))
            else:
                # NOTE(review): the original author suspected the round trip
                # through TRANSFAC format only serves to keep the names
                # consistent -- unverified.
                os.system("cat %s.tm.tf > %s_sum.tm.tf" % (prefix, prefix))

            tf2tamo("%s_sum.tm.tf" % prefix)

            # Gets the top motif in the cluster.
            pick_chunk_score(wdir, '%s_sum.tm.tf.tm' % prefix, target, genome)

            # Removes the files that were created.
            os.system("rm %s_sum.tm.tf.tm" % prefix)
            os.system("rm %s_sum.tm.tf" % prefix)
            os.system("rm -R %s.tm.tf_ST*" % prefix)

    # Combine together the top motifs from every cluster with the motifs
    # that were alone in their cluster.
    os.system("cat %s_sub_*_sum.tm.tf.tm.TOP > %s_sub_new.tm" % (TAMO_file, TAMO_file))
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
    os.system("cat %s_sub_old.tm %s_sub_new.tm > %s_P1.tm" % (TAMO_file, TAMO_file, TAMO_file))
def combine_distance_matrix(wdir, TAMO_file, motifs_per_file=100):
    '''Combines the PCC score matrices and outputs them as a single matrix.

    Originally written by Cheng Zou, and converted to a function by Alex
    Seddon.

    Chunk i compared with itself is expected in "<TAMO_file>_n<i>.dm" and
    chunk i compared with a later chunk j in
    "<TAMO_file>_n<i>-<TAMO_file>_n<j>.dm".  The missing blocks (j < i) are
    filled with "-" placeholders so the result is square, each row of blocks
    is pasted into "distance_<i>", and the rows are concatenated into
    "<TAMO_file>.dm", which is finally passed to remove_double_tabs.

    wdir            -- working directory; the process changes into it
    TAMO_file       -- TAMO file the chunks were made from
    motifs_per_file -- chunk size the chunk indices are derived from
                       (default 100, the value previously hard-coded)
    '''
    ml = MotifTools.txt2motifs(TAMO_file)
    n_split = len(ml) // motifs_per_file

    ##
    # Change to the working directory.  (A "cd" run through os.system would
    # only affect a throw-away subshell.)
    os.chdir(wdir)

    ##
    # Count the lines of each self-comparison matrix.
    lendic = {}  # Dictionary with the length of PCC matrices.
    for i in range(n_split + 1):
        lendic[i] = line_count("%s_n%s.dm" % (TAMO_file, i))
    print(lendic)

    ##
    # Create the placeholder blocks: lendic[i] rows of lendic[j] "-" fields.
    for i in range(n_split + 1):
        for j in range(0, i):
            print("%s %s" % (lendic[j], lendic[i]))
            blank_row = "\t".join(["-"] * lendic[j])
            with open("%s_n%s-%s_n%s.dm" % (TAMO_file, i, TAMO_file, j), "w") as oup:
                for x in range(lendic[i]):
                    oup.write("%s\n" % blank_row)

    ##
    # Copy each self-comparison file to the "<i>-<i>" name so the paste loop
    # below can address every block uniformly.
    for i in range(n_split + 1):
        os.system("cp %s_n%s.dm %s_n%s-%s_n%s.dm"
                  % (TAMO_file, i, TAMO_file, i, TAMO_file, i))

    ##
    # Paste the blocks of each row side by side.
    for i in range(n_split + 1):
        row_files = ["%s_n%s-%s_n%s.dm " % (TAMO_file, i, TAMO_file, j)
                     for j in range(n_split + 1)]
        com = "paste " + "".join(row_files) + "> distance_%s" % i
        print(com)
        os.system(com)

    # Concatenate all the matrices.
    strips = ["distance_%s " % i for i in range(n_split + 1)]
    com = "cat " + "".join(strips) + "> %s.dm" % TAMO_file
    print(com)
    os.system(com)

    # The original author's ad hoc way of removing double tabs.
    remove_double_tabs("%s.dm" % TAMO_file)
# NOTE(review): this is a fragment of a longer script; 'file' is not assigned
# in the visible part and the body of the final loop continues beyond it.
# The original indentation was lost, so the nesting below is reconstructed --
# in particular whether 'print tar_dir' belongs inside the 'if' should be
# confirmed against the original file.

# Command-line parameters, read positionally.
threshold = math.pow(10, -float(sys.argv[2]))  # 10 ** -(argv[2])
maxthreshold = float(sys.argv[3])  # for strong score, using 0.9*max score
ATbias = float(sys.argv[4])  # 0.33
GCbias = float(sys.argv[5])  # 0.17
seq_file = sys.argv[6]  # FASTA file of the sequence
tar_dir = ""  # Target directory for the output file
#
##

##
# Look for an optional "-d <dir>" anywhere on the command line and use it,
# without a trailing "/", as the target directory.
for i in range(1, len(sys.argv)):
    if sys.argv[i] == "-d":
        tar_dir = sys.argv[i + 1].rstrip("/")
        print tar_dir

ml = MotifTools.txt2motifs(file)
n = 0
new_list = []

##
# Looks at each motif from the TAMO file. Uses the find function from
# motility to find the sequences with that motif.
for Ikey in range(len(ml)):
    #print m.ll
    time1 = time.time()  # Time stamp taken at the start of this motif.
    m = ml[Ikey]  # Pull out the motif from the motif list.
    save_motifs([m], file + '_' + str(Ikey))  # Save the motif as a file.
    ##
def main():
    '''Scans FASTA probes with motifs and prints a text map of the hits.

    Options:
      -f FILE   FASTA file with the probes
      -m FILE   motifs, loaded with MotifTools.txt2motifs (repeatable)
      -a TEXT   comma separated motif texts (MotifTools.Motif_from_text)
      -n LIST   comma separated indices of the motifs to use (default: all)
      -L TEXT   labels, one character per selected motif
      -t FRAC   score threshold as a fraction of a motif's maxscore (0.7)
      -S SCALE  map characters per sequence position (default 50/1000)

    Probes containing any of the characters B, D, H or U are skipped.
    '''
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:",
                                   ["help", "output="])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if not opts:
        usage()
        sys.exit(1)

    print("#" + ' '.join(sys.argv))
    fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.7)
    ambigs = []
    scale = 50.0 / 1000.0

    motifs = []
    for opt, value in opts:
        if opt == '-f':
            fastafile = value
        elif opt == '-m':
            motifs.extend(MotifTools.txt2motifs(value))
        elif opt == '-n':
            motifnums = [int(x) for x in value.split(',')]
        elif opt == '-L':
            labels = list(value)
        elif opt == '-t':
            thresh = float(value)
        elif opt == '-a':
            ambigs.extend(value.split(','))
        elif opt == '-S':
            scale = float(value)

    probes = Fasta.load(fastafile)

    # No option in this function sets motiffile, so this branch only runs if
    # one is added later.
    if motiffile:
        motifs.extend(TAMO.tamofile2motifs(motiffile))
    for ambig in ambigs:
        motifs.append(MotifTools.Motif_from_text(ambig, 0.1))
    if not motifnums:
        motifnums = list(range(len(motifs)))
    print('# %d: %s' % (len(motifs), motifnums))

    # Header: one line per selected motif with its absolute threshold.
    for i, num in enumerate(motifnums):
        motif = motifs[num]
        if labels and i < len(labels):
            txt = labels[i]
        else:
            txt = '%d' % i
        print('%-3s : %s %5.2f (%4.2f)' % (txt, motif, thresh * motif.maxscore, thresh))

    probehits = {}
    for key in probes.keys():
        if re.search('[BDHU]', probes[key]):
            continue
        hits_by_motif = []
        save_flag = 0
        for num in motifnums:
            scanner = motifs[num]
            # Threshold against this motif's own maxscore; the value left
            # over from the header loop belongs to the last listed motif.
            result = scanner.scan(probes[key], thresh * scanner.maxscore)
            if result[0]:
                hits_by_motif.append(result)
                save_flag = 1
            else:
                hits_by_motif.append(None)
        if save_flag:
            probehits[key] = hits_by_motif

    maxw = 40
    for key in probehits.keys():
        l = len(probes[key])
        # One '-' per scaled position, plus ten spaces so markers that are
        # shifted right still have a slot to index.
        a = list('-' * int(scale * l))
        a.extend(list(' ' * 10))
        desc = []
        matches = probehits[key]
        for i in range(len(matches)):
            if not matches[i]:
                continue
            if labels and (i < len(labels)):
                ID = labels[i]
            else:
                ID = '%d' % i
            subseqs, endpoints, scores = matches[i]
            for idx in range(len(subseqs)):
                start, stop = endpoints[idx]
                desc.append('%s %s %d-%d %4.2f ' % (ID, subseqs[idx], start, stop, scores[idx]))
                start = int(start * scale)
                # Drop the marker on the first free '-' at or after the hit.
                for offset in range(10):
                    if a[start + offset] == '-':
                        a[start + offset] = ID
                        break
        print('%s %s %s' % ('%-14s %s' % (key, ''.join(a)),
                            ' ' * max(0, maxw - len(a)),
                            '| '.join(['%-27s' % x for x in desc])))
    print("")
    print("Found matches in %d of %d input probes" % (len(probehits), len(probes)))