def probabilities_blosum_62_2():
    seqs = sequence.readFastaFile("./files/simple_seqs/simple_2.fasta")
    profile1 = aln_profile.AlignmentProfile([seqs[0]])
    profile2 = aln_profile.AlignmentProfile([seqs[1]])
    phmm = align.load_params(params.basic_params, [profile1, profile2],
                             sub_matrix.blosum62LatestProbs, log_transform=True)
    return phmm
def durbin_blosum_50_2():
    seqs = sequence.readFastaFile("./files/simple_seqs/durbin_2.fasta")
    profile1 = aln_profile.AlignmentProfile([seqs[0]])
    profile2 = aln_profile.AlignmentProfile([seqs[1]])
    phmm = align.load_params(params.basic_params, [profile1, profile2],
                             sub_matrix.blosum50, log_transform=True)
    return phmm
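# These fixtures return configured pair HMMs. A hedged sketch of how a test
# might consume one follows, using the performViterbiAlignment / get_alignment
# calls that appear in align_seqs later in this section; the assertion is a
# placeholder, not a verified expected alignment.
def test_durbin_blosum_50_2_viterbi():
    phmm = durbin_blosum_50_2()
    phmm.performViterbiAlignment(po=False)
    aligned = phmm.get_alignment(type_to_get='viterbi')
    assert aligned is not None  # placeholder check; a real test would compare columns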
def annotateThis(args):
    outputfile = output(args)
    if '.fa' in args.annotateFile or '.fasta' in args.annotateFile:
        outputfile = outputfile + '.csv'
        print("this is a FASTA file")
        annot = sequence.readFastaFile(args.annotateFile,
                                       sequence.Protein_Alphabet,
                                       ignore=True,
                                       parse_defline=False)
        with open(outputfile, 'w', newline='') as f:
            fieldnames = ['Name', 'Sequence', 'Annots']
            thewriter = csv.DictWriter(f, fieldnames=fieldnames)
            thewriter.writeheader()
            for seq in annot:
                s = ''.join(seq.sequence)
                # The annotation keyword is the value of the 'Annots' field;
                # it must be written under that key to match fieldnames
                thewriter.writerow({
                    'Name': seq.name,
                    'Sequence': s,
                    'Annots': args.annotateKeyword
                })
    elif '.csv' in args.annotateFile:
        print("this is a CSV file")
        outputfile = outputfile + '.csv'
        with open(args.annotateFile, newline='') as f:
            reader = csv.reader(f)
            header = next(reader)
            nameCol = 0
            seqCol = 0
            dictionary = {}
            for h in range(len(header)):
                if header[h] == 'Name':
                    nameCol = h
                elif header[h] == 'Sequence':
                    seqCol = h
            for row in reader:
                dictionary[row[nameCol]] = row[seqCol]
        with open(outputfile, 'w', newline='') as f:
            fieldnames = ['Name', 'Sequence', 'Annots']
            thewriter = csv.DictWriter(f, fieldnames=fieldnames)
            thewriter.writeheader()
            for name, seq in dictionary.items():
                thewriter.writerow({
                    'Name': name,
                    'Sequence': seq,
                    'Annots': args.annotateKeyword
                })
def two_col_62_2():
    seqs = sequence.readFastaFile("./files/custom_seqs/2_col.fasta")
    profile1 = aln_profile.AlignmentProfile([seqs[0]])
    profile2 = aln_profile.AlignmentProfile([seqs[1]])
    phmm = align.load_params(params.basic_params, [profile1, profile2],
                             sub_matrix.blosum62, log_transform=True)
    return phmm
def borodovsky_blosum_50_2():
    seqs = sequence.readFastaFile("./files/simple_seqs/borodovsky.fasta")
    profile1 = aln_profile.AlignmentProfile([seqs[0]])
    profile2 = aln_profile.AlignmentProfile([seqs[1]])
    # NB: despite the "blosum_50" name, this fixture loads BLOSUM62
    # probabilities with raw (non-log) parameters
    phmm = align.load_params(params.borodovsky_4_7, [profile1, profile2],
                             sub_matrix.blosum62LatestProbs, log_transform=False)
    return phmm
def ox_104t17_1():
    seqs = sequence.readFastaFile("./files/qscore_corrections/ox_104t17_1.fasta")
    profile1 = aln_profile.AlignmentProfile([seqs[0]])
    profile2 = aln_profile.AlignmentProfile([seqs[1]])
    phmm = align.load_params(params.qscore_params, [profile1, profile2],
                             sub_matrix.blosum62EstimatedWithX, log_transform=True)
    return phmm
def read(args):
    outputfile = output(args)
    orig_dict = {}
    if '.csv' in args.input:
        print("this is a CSV file")
        outputfile = outputfile + '.fa'
        with open(args.input, newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                orig_dict[row[0]] = row[1]
        seq_list = [
            sequence.Sequence(sequence=seq, name=seqname)
            for seqname, seq in orig_dict.items()
        ]
        sequence.writeFastaFile(outputfile, seq_list)
    elif '.tab' in args.input or '.tsv' in args.input:
        print("this is a TAB/TSV file")
        outputfile = outputfile + '.fa'
        with open(args.input) as tsv:
            for line in csv.reader(tsv, dialect="excel-tab"):
                orig_dict[line[0]] = line[1]
        seq_list = [
            sequence.Sequence(sequence=seq, name=seqname)
            for seqname, seq in orig_dict.items()
        ]
        sequence.writeFastaFile(outputfile, seq_list)
    elif '.fa' in args.input or '.fasta' in args.input:
        print("this is a FASTA file")
        outputfile = outputfile + '.csv'
        db100 = sequence.readFastaFile(args.input,
                                       sequence.Protein_Alphabet,
                                       ignore=True,
                                       parse_defline=False)
        with open(outputfile, 'w', newline='') as f:
            fieldnames = ['Name', 'Sequence']
            thewriter = csv.DictWriter(f, fieldnames=fieldnames)
            thewriter.writeheader()
            for seq in db100:
                s = ''.join(seq.sequence)
                thewriter.writerow({'Name': seq.name, 'Sequence': s})
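# Hedged usage sketch: drives read() with a plain namespace instead of argparse.
# The field names and the output() helper's behaviour (deriving an
# extension-less output path from args) are assumptions, not shown in this listing.
from types import SimpleNamespace

args = SimpleNamespace(input="seqs.csv", output="seqs_out")  # hypothetical fields
read(args)  # would write seqs_out.fa from a two-column Name,Sequence CSV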
'''
Created on 29/03/2014

@author: jacekrad
'''
import sequence as seq

ex5_filename = "sigpep_at.fa"
ex6_filename = "lipmet_at.fa"
ex7_filename = "ex7.fa"

""" read sequences from questions 5 & 6 into corresponding lists """
sequences_q5 = seq.readFastaFile(ex5_filename)
sequences_q6 = seq.readFastaFile(ex6_filename)

print("Q5 sequence has", len(sequences_q5), "entries")
print("Q6 sequence has", len(sequences_q6), "entries")

ids_q5 = []
ids_q6 = []
for sequence in sequences_q5:
    ids_q5.append(sequence.name)
for sequence in sequences_q6:
    ids_q6.append(sequence.name)

common_ids = set(ids_q5).intersection(set(ids_q6))
print(len(common_ids), "common matches found")

""" save the common entries into a FASTA file as well as a dictionary
    for i in range(len(calls)):  # go through each position
        supported.append(calls[i] and diff[i] > 0)
    return supported


def getScores(seq, index=0):
    """ Create a score list for a sequence by referencing the Chou-Fasman table. """
    return [cf_dict[s.upper()][index] for s in seq]


""" -------------------------------------------
Below is test code
------------------------------------------- """

# Read some protein sequence data
prot = sequence.readFastaFile('prot2.fa', symbol.Protein_Alphabet)
# Read the secondary structure data for the proteins above (indices should agree)
sstr = sequence.readFastaFile('sstr3.fa', symbol.DSSP3_Alphabet)

#prot = [sequence.Sequence('PNKRKGFSEGLWEIENNPTVKASGY', symbol.Protein_Alphabet, '2NLU_r76')]
#sstr = [sequence.Sequence('CCCCHHHHHHHHHHHCCCCCCCCCC', symbol.DSSP3_Alphabet, '2NLU_s76')]
#prot = [sequence.Sequence("SEQSICQARAAVMVYDDANKKWVPAGGSTGFSRVHIYHHTGNNTFRVVGRKIQDHQVVIN" +
#                          "CAIPKGLKYNQATQTFHQWRDARQVYGLNFGSKEDANVFASAMMHALEVLN", symbol.Protein_Alphabet, "1EVH")]
#sstr = [sequence.Sequence("CEEEEEEEEEEEEEEECCCCEEEEHHHCCCCEEEEEEEECCCCEEEEEEEECCCCCEEEEEEE" +
#                          "CCCCCCECCCCCEEEEECCCCEEEEEECCHHHHHHHHHHHHHHHHHHC", symbol.DSSP3_Alphabet, "1EVH")]

tp = 0  # number of true positives (correctly identified calls)
tn = 0  # number of true negatives (correctly missed no-calls)
fp = 0  # number of false positives (incorrectly identified no-calls)
fn = 0  # number of false negatives (incorrectly missed calls)
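# Hedged sketch (not part of the original fragment): one way the counters
# above could be tallied, assuming per-residue boolean helix calls for each
# protein and 'H' marking helix in the matching DSSP3 string. How `calls` is
# produced (e.g. from getScores plus a windowed threshold) is left to the
# surrounding file; predict_helix_calls is a hypothetical stand-in.
for k in range(len(prot)):
    calls = predict_helix_calls(prot[k])  # hypothetical helper, not in the source
    for pos, called in enumerate(calls):
        actual = sstr[k][pos] == 'H'      # DSSP3: 'H' = helix
        if called and actual:
            tp += 1
        elif called and not actual:
            fp += 1
        elif not called and actual:
            fn += 1
        else:
            tn += 1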
'''
Created on 30/03/2014

Assessment question 5
Exercises 8 & 9

@author: jacekrad
'''
import sequence as seq
from collections import Counter
from webservice import *

sequences = seq.readFastaFile("mystery1.fa")

all_ids = []
""" for all the IDs in the sequences found in mystery1.fa,
    get an ID mapping from P_REFSEQ_AC to ACC, and for each ID
    in the map (dictionary) add it to the list of all IDs """
for sequence in sequences:
    ids = idmap(sequence.name, 'P_REFSEQ_AC', 'ACC')
    for value in ids.values():
        all_ids.append(value)

""" get a list of unique IDs from all the IDs """
unique_ids = list(set(all_ids))

combined_GOterms = []
        input[_onehotIndex(self.inp_alpha, subseqs[i])] = 1
        outvec = self.nn1.feedforward(input)
        d = prob.Distrib(self.outp_alpha)
        for k in range(len(outvec)):
            d.observe(self.outp_alpha[k], outvec[k])
        predsyms[i + W // 2] = d.getmax()  # use the symbol with the highest probability
    return sequence.Sequence(predsyms, self.outp_alpha)

##################################################################################################################
# Example applications of ML methods including NNs and Naive Bayes for secondary structure prediction.           #
##################################################################################################################

if __name__ == '__main__':  # examples to run unless this module is merely "imported"
    import os, time
    os.chdir('/Users/mikael/workspace/binf/data')  # Note you will need to change this to find your directory of choice
    prot = sequence.readFastaFile('prot2.fa', symbol.Protein_Alphabet)  # proteins
    sstr = sequence.readFastaFile('sstr3.fa', symbol.DSSP3_Alphabet)   # secondary structure of prot
    # separate training and test data
    prot_trn = prot[0::2]  # even-numbered indices
    prot_tst = prot[1::2]  # odd-numbered indices
    sstr_trn = sstr[0::2]  # even-numbered indices
    sstr_tst = sstr[1::2]  # odd-numbered indices
    W = 15

if __name__ == '__main__':  # NN (should read "__main__" for it to be executed on "Run")
    nHid = 30
    nn = SeqNN(W, symbol.Protein_Alphabet, symbol.DSSP3_Alphabet, nHid, cascade=W)
    #nn.nn = ml.readNNFile('sstr3.nn')
    #print("Successfully loaded network")
    start = time.time()
    print(nn.observeAll(prot_trn, sstr_trn, eta=0.01, niter=20))
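# _onehotIndex is called above but not defined in this fragment. Below is a
# minimal sketch of what it plausibly computes, assuming the input vector is a
# flat concatenation of one one-hot block per window position and that the
# alphabet supports .index(sym); this is an assumed reconstruction, not the
# module's actual implementation.
def _onehotIndex(alpha, sym_window):
    """Hypothetical: map a window of symbols to the flat indices of their
    one-hot positions in a len(sym_window) * len(alpha) input vector."""
    return [i * len(alpha) + alpha.index(s) for i, s in enumerate(sym_window)]

# Usage with a numpy input vector: input = np.zeros(W * len(alpha));
# input[_onehotIndex(alpha, window)] = 1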
def align_seqs(inpath,
               outpath,
               aln_type,
               params=parameters.basic_params,
               subsmat=sub_matrix.blosum62EstimatedWithX_dict,
               log_transform=True):
    print("params are")
    print(params)

    # Read sequences in
    seqs = sequence.readFastaFile(inpath, alphabet=Protein_Alphabet_wB_X_Z)
    print(len(seqs))

    if len(seqs) == 2:
        aln_order = [("N0", [seqs[0].name, seqs[1].name])]
    else:
        # Calculate guide tree
        guide_tree = gt.get_guide_tree(seqs, random=False)
        print(guide_tree.ascii_art())
        # Get the alignment order
        aln_order = gt.get_aln_order(guide_tree)
    print(aln_order)

    seq_dict = {x.name: x for x in seqs}

    # Predecessors start off blank
    predecessors = [{}, {}]

    # Create alignment in order from guide tree
    for node in aln_order:
        # Get the current node name and list of sequences under that node
        curr_node = node[0]
        curr_seqs = node[1]

        # List to store the aligned sequences in
        aligned = []

        # While the node has sequences underneath yet to be aligned
        while curr_seqs:
            # Get a sequence
            seq = curr_seqs.pop()

            # Make it into a profile if it isn't one already
            if type(seq_dict[seq]) != aln_profile.AlignmentProfile:
                profile = aln_profile.AlignmentProfile([seq_dict[seq]])
            else:
                profile = seq_dict[seq]

            # Add sequence to the aligned list
            aligned.append(profile)

            # If we have two profiles it is time to align
            if len(aligned) > 1:
                pair_hmm = load_params(params, aligned, subsmat, log_transform, predecessors)

                if aln_type == 'viterbi':
                    pair_hmm.performViterbiAlignment(po=False)
                    aligned_profile = pair_hmm.get_alignment(type_to_get='viterbi')
                elif aln_type == 'poviterbi':
                    pair_hmm.performViterbiAlignment(po=True)
                    aligned_profile = pair_hmm.get_alignment(type_to_get='viterbi')
                elif aln_type == 'mea':
                    pair_hmm.performMEAAlignment(po=False)
                    aligned_profile = pair_hmm.get_alignment(type_to_get='mea')
                elif aln_type == 'pomea':
                    pair_hmm.performMEAAlignment(po=True)
                    aligned_profile = pair_hmm.get_alignment(type_to_get='mea')

                # Replace the now-aligned pair with the resulting profile
                aligned = [aligned_profile]

        seq_dict[curr_node] = aligned[0]

    with open(outpath, 'w') as outfile:
        outfile.write(str(aligned_profile))

    return aligned_profile
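# Hedged usage sketch for align_seqs. The output path is a hypothetical
# placeholder; the input path and the defaults mirror names that appear
# elsewhere in this section.
if __name__ == '__main__':
    profile = align_seqs("./files/simple_seqs/simple_2.fasta",  # input FASTA
                         "./out/simple_2.aln",                  # hypothetical output path
                         aln_type='viterbi',                    # or 'mea', 'poviterbi', 'pomea'
                         params=parameters.basic_params,
                         subsmat=sub_matrix.blosum62EstimatedWithX_dict,
                         log_transform=True)
    print(profile)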
    sys.exit(2)

FILENAME = None
DISCOVER_MODE = False
SCAN_MODE = False
WORD_WIDTH = 8
PEAK_WIDTH = 100
PEAK_MARGIN = 100
MOTIF_ID = 'MA0112.2'
JASPAR_FILE = 'JASPAR_matrices.txt'

for o, a in optlst:
    if o == '-h':
        usage(sys.argv[0])
    elif o == '-f':
        FILENAME = a
    elif o == '-d':
        DISCOVER_MODE = True
    elif o == '-w':
        WORD_WIDTH = int(a)
    elif o == '-p':
        PEAK_WIDTH = int(a)
    elif o == '-m':
        PEAK_MARGIN = int(a)
    elif o == '-s':
        SCAN_MODE = True
        MOTIF_ID = a
    elif o == '-j':
        JASPAR_FILE = a

if FILENAME is None:
    usage(sys.argv[0], "Filename not specified")
    sys.exit(3)

seqs = sequence.readFastaFile(FILENAME, sym.DNA_Alphabet_wN)

if DISCOVER_MODE:
    print("Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN))
    countWordsReport(seqs, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
elif SCAN_MODE:
    scanMotifReport(seqs, MOTIF_ID)
else:
    usage(sys.argv[0], "No run mode selected")
def motifSearch(args, motifs):
    names = dict()
    # Create a dictionary for each motif, which associates columns with a
    # match to the number of sequences with that match
    for s in args.input:
        if args.type == 'a':
            aln = sequence.readFastaFile(s,
                                         sequence.Protein_Alphabet,
                                         gappy=True,
                                         ignore=True,
                                         parse_defline=False)
            ali = sequence.Alignment(aln)
        else:
            ali = sequence.readFastaFile(s,
                                         sequence.Protein_Alphabet,
                                         ignore=True,
                                         parse_defline=False)
        dictionary = dict()
        for a in ali:
            T1 = False
            if 'T1' in a.info:
                seqName = a.name + "+T1=yes"
                T1 = True
            if 'T2' in a.info:
                seqName = a.name + "+T2=yes"
            elif T1 == False:
                seqName = a.name
            seqSequence = str(a).split(":")[1].strip()
            thisset = set()
            for m in motifs:
                number = 0
                for i in range(len(args.motif)):
                    if str(m) == args.motif[i]:
                        number = i
                if args.type == 'a':
                    result = m.search(a, gappy=True)
                else:
                    result = m.search(a)
                # Each match is a (position, matched string, score) tuple;
                # iterate over however many there are, rather than parsing
                # the string representation of a one-element list
                for r in result:
                    motifStart, foundMotif, n = r
                    addThis = ('motif' + str(number + 1) + ',' +
                               str(motifStart) + ',' + str(foundMotif) + ',' +
                               str(len(foundMotif) + motifStart))
                    thisset.add(addThis)
            thisset.add('Sequence,' + seqSequence + ', ' + ', ')
            dictionary[seqName] = thisset
        names[s] = dictionary
    return (names, dictionary)
        self.alignment = a
        return q

    def getForeground(self):
        """ Return the probability distributions for columns in the discovered alignment. """
        return self.q

    def getBackground(self):
        """ Return the probability distributions for the background used in the discovery. """
        return self.p


# Example 1: Find the peroxisome targeting signal
if __name__ == '__main__0':  # deliberately disabled; rename to '__main__' to run
    import os
    os.chdir('/Users/mikael/workspace/binf/data/')  # set to the directory where you keep your files
    seqs = sequence.readFastaFile('pex2.fa', symbol.Protein_Alphabet)
    W = 3
    pseudo = prob.readDistrib('blosum62.distrib')
    gibbs = GibbsMotif(seqs, W)
    q = gibbs.discover(pseudo)
    p = gibbs.getBackground()

    # Let's display the results, i.e. the best matches to the found motif
    a = getAlignment(seqs, q, p)
    k = 0
    for seq in seqs:
        print("%s \t%d \t%s" % (str(seq), a[k], seq[a[k]:a[k] + W]))
        k += 1

    # save the motif in two files: one for the foreground distributions and one with the background
    prob.writeDistribs(q, 'pex2q.distrib')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input FASTA file", required=True)
    parser.add_argument("-db", "--database", help="Database output file name", required=True)
    parser.add_argument("-r", "--redundancy", nargs='*',
                        help="List of redundancy levels", default=[90, 80, 70])
    parser.add_argument("-t1", "--tier1", help="User's Tier1 sequences")
    parser.add_argument("-t2", "--tier2", help="User's Tier2 sequences")
    parser.add_argument("-ml", "--maxlength",
                        help="Max length that the sequence can be", default=800)
    parser.add_argument("-e", "--eval", nargs='*', help="List of evalues",
                        default=[1e-100, 1e-75, 1e-50, 1e-20, 1e-10, 1e-5])
    args = parser.parse_args()

    tier2 = {}
    tier2_short = {}
    tier2_annots = {}  # annotations that we want to include in the final dataset
    tier1_list = {}    # populated below if Tier-1 sequences are provided

    if args.tier2:
        print("tier2 sequences have been provided")
        if '.fa' in args.tier2 or '.fasta' in args.tier2:
            print("tier2 sequences are FASTA file")
            tier2db = sequence.readFastaFile(args.tier2,
                                             sequence.Protein_Alphabet,
                                             ignore=True,
                                             parse_defline=False)
            tier2_list = {}        # map from "long" name to actual entry
            tier2_map_short = {}   # map from "short" name to entry
            for s in tier2db:
                tier2_list[s.name] = s
                tier2_map_short[sequence.parseDefline(s.name)[0]] = s
            print(str(len(tier2_list)) + " sequences in tier2")
        else:
            print("Please provide FASTA file for tier-2")

    if args.tier1:
        tier1 = {}
        tier1_annots = {}  # annotations that we want to include in the final dataset
        print("Tier-1 sequences have been provided")
        if '.fa' in args.tier1 or '.fasta' in args.tier1:
            print("Tier-1 sequences are provided as a FASTA file")
            tier1db = sequence.readFastaFile(args.tier1,
                                             sequence.Protein_Alphabet,
                                             ignore=True,
                                             parse_defline=False)
            for s in tier1db:
                tier1_list[s.name] = "".join(s.sequence)
            print("Tier-1 has " + str(len(tier1_list)) + " sequences")
        else:
            print("Please provide FASTA file for tier-1")

    db100 = sequence.readFastaFile(args.input,
                                   sequence.Protein_Alphabet,
                                   ignore=True,
                                   parse_defline=False)
    db100_map = {}        # map from "long" name to actual entry
    db100_map_short = {}  # map from "short" name to entry
    for s in db100:
        db100_map[s.name] = s
        db100_map_short[sequence.parseDefline(s.name)[0]] = s
    print("Database has " + str(len(db100_map)) + " sequences")

    # Cluster the database at each requested redundancy level
    for rr in args.redundancy:
        rs = str(rr)
        os.system('cd-hit -i ' + args.input + ' -c 0.' + rs + ' -T 5 -o db' + rs + ' -d 0')

    selected = {}
    for rr in args.redundancy:
        selected[rr] = []
        filename = 'db' + str(rr) + '.clstr'
        clusters = readCDHIT(filename)
        for c in clusters:
            picked_one = False
            shortest = None
            reviewed = None
            for name in clusters[c]:
                if name in db100_map:
                    seq = db100_map[name]
                    if shortest:
                        if len(seq) < len(shortest) and not disqualified(seq, args):
                            shortest = seq
                    elif not disqualified(seq, args):
                        shortest = seq
                    if seq.name.startswith('sp|') and not disqualified(seq, args):
                        reviewed = seq
                    if name in tier1_list:
                        selected[rr].append(seq)
                        picked_one = True
                else:
                    pass  # name not found in the full database
            # If no Tier-1, prefer "reviewed", then shortest length
            if not picked_one and reviewed:
                selected[rr].append(reviewed)
            elif not picked_one and shortest:
                selected[rr].append(shortest)

    for rr in args.redundancy:
        filename = 'db' + str(rr) + '.fa'
        sequence.writeFastaFile(filename, selected[rr])

    for rr in args.redundancy:
        os.system('makeblastdb -dbtype prot -in db' + str(rr) + '.fa -out db-' + str(rr))

    # for rr in args.redundancy:
    #     for evalue in args.eval:
    #         result_file = "dataset-" + str(rr) + '-' + str(evalue)
    #         cmd1 = ("blastp -db db-" + str(rr) + " -outfmt 3 -num_descriptions 20000"
    #                 " -num_alignments 0 -num_threads 5 -query " + args.tier1 +
    #                 " -out " + result_file + ".txt -evalue " + str(evalue))
    #         print(cmd1)
    #         os.system(cmd1)

    grab = False
    for rr in args.redundancy:
        for evalue in args.eval:
            tpsIdentifier = set([])
            seqs = []
            result_file = "dataset-" + str(rr) + '-' + str(evalue)
            f = open(result_file + '.txt', 'rt')
            for row in f:
                if row.startswith('Sequences'):
                    grab = True
                    continue
                if grab == True:
                    if row.startswith('Lambda'):
                        grab = False
                    if not row.strip() == "":
                        identifier = row.split(' ')[0]
                        if identifier != "Lambda":
                            tpsIdentifier.add(identifier)
            for name in tpsIdentifier:
                try:
                    seq = db100_map[name]
                    info = ''
                    seqs.append(sequence.Sequence(seq.sequence, seq.alphabet, seq.name, info))
                except:
                    pass
            sequence.writeFastaFile(result_file + ".fa", seqs)
            print(result_file + " has " + str(len(seqs)) + " sequences")
    print('Done')

    totalSeqCount = []
    for evalue in args.eval:
        for rr in args.redundancy:
            output = []
            ev = str(evalue)[1:]
            red = str(rr)
            result_file = "dataset-" + str(rr) + '-' + str(evalue)
            a = sequence.readFastaFile(result_file + '.fa',
                                       sequence.Protein_Alphabet,
                                       ignore=True,
                                       parse_defline=False)
            names = set([])
            for s in a:
                names.add(s.name)
            tier1_cnt = 0
            tier2_cnt = 0
            seqs = []
            for name in names:
                try:
                    seq = db100_map[name]
                    info = ''
                    if name in tier1_list:
                        tier1_cnt += 1
                        #info = seq.info + ' ' + tier1_annots[name]
                    elif name in tier2:
                        tier2_cnt += 1
                        #info = seq.info + ' ' + tier2_annots[name]
                    seqs.append(sequence.Sequence(seq.sequence, seq.alphabet, seq.name, info))
                except:
                    pass  # name not found in the full database
            print('Processed', len(seqs), 'for', result_file,
                  ' Tier-1:', tier1_cnt, ' Tier-2:', tier2_cnt)
            output = [ev, red, len(seqs)]
            totalSeqCount.append(output)

    plotSeqs(totalSeqCount)
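# readCDHIT and disqualified are referenced above but not defined in this
# listing. Below is a hedged reconstruction of a .clstr parser, assuming the
# standard cd-hit cluster format; it is a plausible sketch, not necessarily
# the project's actual implementation.
def readCDHIT(filename):
    """Parse a cd-hit .clstr file into a dict of cluster id -> member names.
    Assumes the standard format:
        >Cluster 0
        0   340aa, >sp|P12345|NAME... *
        1   338aa, >tr|Q67890|NAME... at 98.53%
    """
    clusters = {}
    current = None
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line.startswith('>Cluster'):
                current = line[1:]  # e.g. "Cluster 0"
                clusters[current] = []
            elif line:
                # the member name sits between '>' and the trailing '...'
                name = line.split('>', 1)[1].split('...')[0]
                clusters[current].append(name)
    return clusters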
'''
Created on 30/03/2014

@author: jacekrad
'''
import sequence as seq
import util

q6b_filename = "q6b.fasta"

util.searchAndSave("surface+protein+AND+organism:1280", q6b_filename)

sequences = seq.readFastaFile(q6b_filename)
print(len(sequences), "total sequences")

matched_sequences = []
for sequence in sequences:
    if "RAFKPS" in str(sequence.sequence):
        matched_sequences.append(sequence)

""" print the final results """
print(len(matched_sequences), "matched sequences:")
for sequence in matched_sequences:
    print(sequence)
    f.write('\n')
    f.close()

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print('Usage: evodiv <tree> <alignment> <nodes> where',
              "\n\t<tree> is a completely labelled Newick file of a phylogenetic tree",
              "\n\t<alignment> is a FASTA or Clustal file with a sequence for each label in tree",
              "\n\t<nodes> is a FASTA file with a sequence entry for each label for which a variability is determined",
              "\n\tVariability is saved to a file named after the sequence entry, with extension .txt")
        sys.exit(1)
    tree = phylo.readNewick(sys.argv[1])
    try:
        seqs = sequence.readFastaFile(sys.argv[2], alphabet=sequence.Protein_wX, gappy=True)
        aln = sequence.Alignment(seqs)
    except:
        aln = sequence.readClustalFile(sys.argv[2], sequence.Protein_Alphabet_wX)
    tree.putAlignment(aln)
    select = sequence.readFastaFile(sys.argv[3], alphabet=sequence.Protein_wX, gappy=True)
    nodes = []
    for selected in select:
        nodename = selected.name
        nodes.append(nodename)
    for nodename in nodes:
        node = tree.findLabel(nodename)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="FASTA file to query from", required=True)
    parser.add_argument("-q", "--query", help="Query FASTA file", required=True)
    parser.add_argument("-db", "--database", help="Database output file name", required=True)
    parser.add_argument("-r", "--reference", help="Reference database", default="uniprotkb")
    parser.add_argument("-o", "--output", help="Output path", default="matchmyseqs")
    args = parser.parse_args()

    seqDict = {}
    representative = ''
    seqsforCSV = {}

    os.system('makeblastdb -dbtype prot -in ' + args.input + ' -out ' + args.database)

    db = sequence.readFastaFile(args.input,
                                sequence.Protein_Alphabet,
                                ignore=True,
                                parse_defline=False)
    db_map = {}        # map from "long" name to actual entry
    db_map_short = {}  # map from "short" name to entry
    for s in db:
        db_map[s.name] = s
        db_map_short[sequence.parseDefline(s.name)[0]] = s
    print("Database size is " + str(len(db_map)))

    print("Blast started, this might take a bit depending on your dataset size")
    os.system("blastp -db " + args.database +
              " -outfmt 3 -num_descriptions 1 -num_alignments 0 -query " +
              args.query + " -out query.txt")

    if args.reference == 'uniprotkb':
        os.system("grep -e \"^[st][pr]|\" query.txt | cut -d\' \' -f1 > UniProt_query.tab")

        # Extract the resulting sequence identifiers
        repSeqNames = set([])
        f = open('UniProt_query.tab', 'rt')
        for row in f:
            repSeqNames.add(sequence.parseDefline(row.strip())[0])
        f.close()
        print(str(len(repSeqNames)), " representative sequences have been found")

        # Annotate the representative sequences
        notfound = []
        for name in repSeqNames:
            if name in db_map_short:
                s = db_map_short[name]
                seqsforCSV[s.name] = "".join(s.sequence)
            else:
                notfound.append(name)
        print('Matched', len(repSeqNames) - len(notfound), 'of', len(repSeqNames))

        with open("query.txt", newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                if len(row) > 0 and row[0].startswith('Query'):
                    querySeq = str(row).split("=")[1][:-2].strip()
                elif len(row) > 0 and (row[0].startswith('tr|') or row[0].startswith('sp|')):
                    representative = str(row).split(" ")[0][2:].strip()
                    seqDict[querySeq] = representative

    elif args.reference == 'refseq':
        grab = False
        repSeqNames = set([])
        with open("query.txt", newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                if len(row) > 0 and row[0].startswith('Query'):
                    querySeq = str(row[0]).split("=")[1][:-2].strip().split(" ")[0]
                elif len(row) > 0 and row[0].startswith('Sequences'):
                    grab = True
                    continue
                elif grab == True:
                    if len(row) > 0 and not row[0].strip() == "":
                        representative = (row[0].split('.')[0] + "." +
                                          row[0].split('.')[1].split(" ")[0])
                        repSeqNames.add(representative)
                        seqDict[querySeq] = representative
                        grab = False

        notfound = []
        for name in repSeqNames:
            if name in db_map_short:
                s = db_map_short[name]
                seqsforCSV[s.name] = "".join(s.sequence)
            else:
                notfound.append(name)
        print('Matched', len(repSeqNames) - len(notfound), 'of', len(repSeqNames))
        print(len(repSeqNames), " representative sequences found for " + args.query)

    faOut = args.output + '.fa'
    seq_list = [
        sequence.Sequence(sequence=seq, name=seqname)
        for seqname, seq in seqsforCSV.items()
    ]
    sequence.writeFastaFile(faOut, seq_list)

    csvOut = args.output + '.csv'
    with open(csvOut, 'w', newline='') as f:
        fieldnames = ['Name', 'Representative', 'Sequence']
        thewriter = csv.DictWriter(f, fieldnames=fieldnames)
        thewriter.writeheader()
        for given, rep in seqDict.items():
            thewriter.writerow({
                'Name': given,
                'Representative': rep,
                'Sequence': seqsforCSV[rep]
            })
JASPAR_FILE = "JASPAR_matrices.txt"

for o, a in optlst:
    if o == "-h":
        usage(sys.argv[0])
    elif o == "-f":
        FILENAME = a
    elif o == "-d":
        DISCOVER_MODE = True
    elif o == "-w":
        WORD_WIDTH = int(a)
    elif o == "-p":
        PEAK_WIDTH = int(a)
    elif o == "-m":
        PEAK_MARGIN = int(a)
    elif o == "-s":
        SCAN_MODE = True
        MOTIF_ID = a
    elif o == "-j":
        JASPAR_FILE = a

if FILENAME is None:
    usage(sys.argv[0], "Filename not specified")
    sys.exit(3)

seqs = sequence.readFastaFile(FILENAME, sym.DNA_Alphabet_wN)

if DISCOVER_MODE:
    print("Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN))
    countWordsReport(seqs, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
elif SCAN_MODE:
    scanMotifReport(seqs, MOTIF_ID)
else:
    usage(sys.argv[0], "No run mode selected")
def run_qscore(name,
               aln_type,
               parameters,
               specific_files=None,
               save=False,
               outpath="",
               log_transform=True):
    base_dir = "./bench1.0/" + name
    in_dir = base_dir + "/in/"
    ref_dir = base_dir + "/ref/"
    out_dir = "./qscore_alignments/" + aln_type + "_" + name

    qscore_dict = defaultdict(dict)
    files = os.listdir(in_dir)
    file_count = 0
    start_time = timeit.default_timer()
    now = datetime.now()
    dt_string = now.strftime("%Y/%m/%d_%H:%M")

    # Add a trailing slash to the output directory if it isn't there
    # (guarding against the empty default)
    if outpath and not outpath.endswith("/"):
        outpath = outpath + "/"

    param_name = (f"t={parameters['tau']}e={parameters['epsilon']}"
                  f"d={parameters['delta']}x={parameters['emissionX']}"
                  f"y={parameters['emissionY']}")

    output_file = "./qscore_alignments/" + aln_type + "_" + name + param_name + ".csv"

    if os.path.exists(outpath + name + ".p"):
        curr_dict = pickle.load(open(outpath + name + ".p", "rb"))
    else:
        curr_dict = {param_name: {}}

    if os.path.exists(outpath + name + "_best.p"):
        best_dict = pickle.load(open(outpath + name + "_best.p", "rb"))
    else:
        best_dict = {}

    if os.path.exists(outpath + "time.p"):
        time_dict = pickle.load(open(outpath + "time.p", "rb"))
    else:
        time_dict = {}

    failures = []

    with open(output_file, 'w+') as output:
        writer = csv.writer(output, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['Tool', 'Dataset', 'Name', 'Q', 'TC', 'M', 'C'])

        # If we don't already have a directory created to save the alignments, let's make one
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        for file in files:
            failed = False
            if file != ".DS_Store":
                seqs = sequence.readFastaFile(in_dir + file, alphabet=Protein_Alphabet_wB_X_Z)
                for seq in seqs:
                    if any(skip in seq.sequence for skip in aa_skip):
                        print("failed on " + seq.name)
                        failures.append(file)
                        failed = True
                if not failed:
                    qscore_dict[file] = defaultdict(dict)
                    if not specific_files or file in specific_files:
                        if param_name not in curr_dict:
                            curr_dict[param_name] = {}
                        file_count += 1
                        single_time = timeit.default_timer()
                        print(file)

                        # Optionally update parameters using Baum-Welch per sequence pair
                        for seq_order in list(itertools.combinations(seqs, 2)):
                            profiles = [aln_profile.AlignmentProfile([x]) for x in seq_order]
                            # change_params = bw.runBaumWelch(parameters, profiles, aln_type)
                        print(parameters)

                        aligned_profile = align.align_seqs(
                            in_dir + file,
                            out_dir + "/" + file + ".aln",
                            aln_type=aln_type,
                            params=parameters,
                            subsmat=sub_matrix.blosum62EstimatedWithX_dict,
                            log_transform=log_transform)

                        process = subprocess.Popen(
                            "qscore -test %s -ref %s -cline -modeler" %
                            (out_dir + "/" + file + ".aln", ref_dir + file),
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            shell=True)
                        out = process.communicate()[0]
                        errcode = process.returncode
                        print('running')
                        print(errcode)

                        scores = [x.strip() for x in out.decode('utf-8').split(";")[2:]]

                        print(file)
                        print('\nScores be')
                        print(scores)

                        for score in scores:
                            score_type = score.split("=")[0].strip()
                            score_value = score.split("=")[1].strip()
                            qscore_dict[file][score_type] = score_value

                        curr_dict[param_name][file] = (scores, aligned_profile)
                        update_best_dict(best_dict, file, scores, param_name)

                        if scores and "=" in scores[0]:
                            writer.writerow([
                                aln_type + "_" + param_name + "_log=" + str(log_transform),
                                name, file,
                                scores[0].split("=")[1],
                                scores[1].split("=")[1],
                                scores[2].split("=")[1],
                                scores[3].split("=")[1]
                            ])
                        else:
                            failures.append(file)

                        # Elapsed time since the start of the run, and for this file
                        total_seconds = timeit.default_timer() - start_time
                        single_seconds = timeit.default_timer() - single_time

                        if save:
                            pickle.dump(curr_dict,
                                        open(outpath + aln_type + "_" + name + ".p", "wb"))
                            pickle.dump(best_dict,
                                        open(outpath + aln_type + "_" + name + "_best.p", "wb"))

    if save:
        if name in time_dict:
            if total_seconds < time_dict[name][0]:
                time_dict[name] = (total_seconds, dt_string)
                print("New best time - " + utilities.format_time(total_seconds))
        else:
            time_dict[name] = (total_seconds, dt_string)
            print("New best time - " + utilities.format_time(total_seconds))
        pickle.dump(time_dict, open(outpath + aln_type + "_" + "time.p", "wb"))

    print('These files failed ')
    print(failures)

    return qscore_dict
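# update_best_dict is called above but not defined in this listing. Below is a
# hedged minimal reconstruction, assuming scores arrive as strings like
# 'Q=0.973' and that "best" means the highest Q score seen for a file; it is a
# plausible sketch, not the project's actual code.
def update_best_dict(best_dict, file, scores, param_name):
    """Keep, per benchmark file, the parameter set with the highest Q score."""
    if not scores or "=" not in scores[0]:
        return
    q = float(scores[0].split("=")[1])
    if file not in best_dict or q > best_dict[file][0]:
        best_dict[file] = (q, param_name, scores)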
import mea_poa.parameters as parameters
import mea_poa.sub_matrix as sub_matrix
import itertools
import mea_poa.alignment_profile as aln_profile
import mea_poa.baum_welch as bw
import sequence
from sym import Alphabet

Protein_Alphabet_wB_X_Z = Alphabet('ABCDEFGHIKLMNPQRSTVWYXZ')

alignment = 'Not calculated'
po_alignment = 'Not calculated'

# Pick an input file (exactly one must be uncommented so that `seq` is defined)
seq = "../../tests/files/simple_seqs/borodovsky.fasta"
# seq = "../../tests/files/custom_seqs/tree_check.fasta"

seqs = sequence.readFastaFile(seq, alphabet=Protein_Alphabet_wB_X_Z)

change_params = {
    'tau': 0.02,
    'epsilon': 0.05,
    'delta': 0.02,
    'emissionX': 0.92,
    'emissionY': 0.2
}

# This second assignment overrides the parameter set above
change_params = {
    'tau': 0.002,
    'epsilon': 0.05,
    'delta': 0.02,
    'emissionX': 0.5,
    'emissionY': 0.5
}