from os import path

def searchAndSave(searchString, filename):
    """Convenience function to search the database and save the results to a
    FASTA file. If a file of that name already exists, the whole process is
    skipped.

    searchString - what is being searched for
    filename - name of the file where the results are saved
    """
    # `seq` is the sequence module imported elsewhere in this script
    if not path.isfile(filename):
        sequences = []
        # ids returned by the search
        ids = seq.searchSequences(searchString)
        print("Processing", len(ids), "sequences ...")
        # iterate over the ids; for each, fetch the record from the database
        # and append it to the list of sequences
        for seq_id in ids:
            print("Fetching sequence:", seq_id)
            sequences.append(seq.getSequence(seq_id))
        # save the completed list of sequences to a FASTA file
        seq.writeFastaFile(filename, sequences)
    else:
        print(filename, "exists. Skipping.")
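# Usage sketch for searchAndSave (the search string and filename below are
# made-up examples, not from the original script):
#
#   searchAndSave('lysozyme AND organism:9606', 'ex5_sequences.fa')
#
# On a second run the function finds 'ex5_sequences.fa' on disk and skips the
# (slow) database queries entirely.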
def allMotifs_fa(args):
    # Check that every motif column has a value; write the rows that pass to a
    # FASTA file, one reduced file per input CSV.
    for i in range(len(args.input)):
        fasta = {}
        name = args.input[i]
        name = name.split('.')[0] + '_reduced.fa'
        with open(args.input[i], newline='') as f:
            reader = csv.reader(f)
            header = next(reader)
            for row in reader:
                # columns 1 .. len(header)-2 hold the motif values; the loop
                # variable must not shadow the outer file index `i`
                isEmpty = False
                for col in range(1, len(header) - 1):
                    if row[col] == "":
                        isEmpty = True
                        break
                if not isEmpty:
                    fasta[row[0]] = row[len(header) - 1]
        seq_list = [sequence.Sequence(sequence=seq, name=seqname)
                    for seqname, seq in fasta.items()]
        sequence.writeFastaFile(name, seq_list)
        print(str(len(seq_list)) + " sequences kept after applying the requirements for " + name)
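# Usage sketch for allMotifs_fa: args.input is a list of CSV files whose first
# column is a sequence name, whose last column is the sequence itself, and
# whose middle columns hold per-motif values. The filename is hypothetical:
#
#   from types import SimpleNamespace
#   allMotifs_fa(SimpleNamespace(input=['motifs.csv']))
#
# Rows with any empty motif column are dropped; the rest are written to
# 'motifs_reduced.fa'.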
def read(args):
    outputfile = output(args)
    orig_dict = {}
    if '.csv' in args.input:
        print("this is a CSV file")
        outputfile = outputfile + '.fa'
        with open(args.input, newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                orig_dict[row[0]] = row[1]
        seq_list = [sequence.Sequence(sequence=seq, name=seqname)
                    for seqname, seq in orig_dict.items()]
        sequence.writeFastaFile(outputfile, seq_list)
    elif '.tab' in args.input or '.tsv' in args.input:
        print("this is a TAB/TSV file")
        outputfile = outputfile + '.fa'
        with open(args.input) as tsv:
            for line in csv.reader(tsv, dialect="excel-tab"):
                orig_dict[line[0]] = line[1]
        seq_list = [sequence.Sequence(sequence=seq, name=seqname)
                    for seqname, seq in orig_dict.items()]
        sequence.writeFastaFile(outputfile, seq_list)
    elif '.fa' in args.input or '.fasta' in args.input:
        print("this is a FASTA file")
        outputfile = outputfile + '.csv'
        db100 = sequence.readFastaFile(args.input, sequence.Protein_Alphabet,
                                       ignore=True, parse_defline=False)
        with open(outputfile, 'w', newline='') as f:
            fieldnames = ['Name', 'Sequence']
            thewriter = csv.DictWriter(f, fieldnames=fieldnames)
            thewriter.writeheader()
            for seq in db100:
                s = ''.join(seq.sequence)
                thewriter.writerow({'Name': seq.name, 'Sequence': s})
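# The output() helper called at the top of read() is not shown in this file.
# A minimal sketch, assuming it simply derives the output file stem from the
# input path (an optional `output` attribute on args, if present, wins);
# read() then appends '.fa' or '.csv' to it:

def output(args):
    # Hypothetical helper: 'data/seqs.csv' -> 'data/seqs'
    import os.path
    stem, _ = os.path.splitext(args.input)
    return getattr(args, 'output', None) or stem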
def posEqual_fa(args):
    fasta = {}
    name = args.input[0]
    name = name.split('.')[0]
    csvFile = name + '_reduced.csv'
    with open(csvFile, newline='') as f:
        reader = csv.reader(f)
        header = next(reader)
        seqCol = len(header) - 1  # the sequence sits in the last column
        for row in reader:
            fasta[row[0]] = row[seqCol]
    seq_list = [sequence.Sequence(sequence=seq, name=seqname)
                for seqname, seq in fasta.items()]
    sequence.writeFastaFile(name + '.fa', seq_list)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="FASTA file to query from", required=True)
    parser.add_argument("-q", "--query", help="Query FASTA file", required=True)
    parser.add_argument("-db", "--database", help="Database output file name", required=True)
    parser.add_argument("-r", "--reference", help="Reference database", default="uniprotkb")
    parser.add_argument("-o", "--output", help="Output path", default="matchmyseqs")
    args = parser.parse_args()

    seqDict = {}     # map from query name to its representative
    seqsforCSV = {}  # map from representative name to its sequence string

    os.system('makeblastdb -dbtype prot -in ' + args.input + ' -out ' + args.database)
    db = sequence.readFastaFile(args.input, sequence.Protein_Alphabet,
                                ignore=True, parse_defline=False)
    db_map = {}        # map from "long" name to actual entry
    db_map_short = {}  # map from "short" name to entry
    for s in db:
        db_map[s.name] = s
        db_map_short[sequence.parseDefline(s.name)[0]] = s
    print("Database size is " + str(len(db_map)))

    print("BLAST started; this might take a while depending on your dataset size")
    os.system("blastp -db " + args.database +
              " -outfmt 3 -num_descriptions 1 -num_alignments 0 -query " +
              args.query + " -out query.txt")

    if args.reference == 'uniprotkb':
        # Extract the resulting sequence identifiers (sp|... and tr|... lines)
        os.system("grep -e \"^[st][pr]|\" query.txt | cut -d' ' -f1 > UniProt_query.tab")
        repSeqNames = set([])
        f = open('UniProt_query.tab', 'rt')
        for row in f:
            repSeqNames.add(sequence.parseDefline(row.strip())[0])
        f.close()
        print(str(len(repSeqNames)), "representative sequences have been found")
        # Annotate the representative sequences
        notfound = []
        for name in repSeqNames:
            if name in db_map_short:
                s = db_map_short[name]
                seqsforCSV[s.name] = "".join(s)
            else:
                notfound.append(name)
        print('Matched', len(repSeqNames) - len(notfound), 'of', len(repSeqNames))
        # Pair each query with the representative reported for it
        with open("query.txt", newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                if len(row) > 0 and row[0].startswith('Query'):
                    querySeq = str(row).split("=")[1][:-2].strip()
                elif len(row) > 0 and (row[0].startswith('tr|') or row[0].startswith('sp|')):
                    representative = str(row).split(" ")[0][2:].strip()
                    seqDict[querySeq] = representative

    elif args.reference == 'refseq':
        grab = False
        repSeqNames = set([])
        with open("query.txt", newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                if len(row) > 0 and row[0].startswith('Query'):
                    querySeq = str(row[0]).split("=")[1][:-2].strip().split(" ")[0]
                elif len(row) > 0 and row[0].startswith('Sequences'):
                    grab = True
                    continue
                elif grab == True:
                    if len(row) > 0 and not row[0].strip() == "":
                        representative = (row[0].split('.')[0] + "."
                                          + row[0].split('.')[1].split(" ")[0])
                        repSeqNames.add(representative)
                        seqDict[querySeq] = representative
                        grab = False
        notfound = []
        for name in repSeqNames:
            if name in db_map_short:
                s = db_map_short[name]
                seqsforCSV[s.name] = "".join(s)
            else:
                notfound.append(name)
        print('Matched', len(repSeqNames) - len(notfound), 'of', len(repSeqNames))
        print(len(repSeqNames), "representative sequences found for " + args.query)

    # (An earlier version fetched each representative from UniProt one by one,
    # reporting progress at 25/50/75%; the local database lookup above replaces it.)

    faOut = args.output + '.fa'
    seq_list = [sequence.Sequence(sequence=seq, name=seqname)
                for seqname, seq in seqsforCSV.items()]
    sequence.writeFastaFile(faOut, seq_list)

    csvOut = args.output + '.csv'
    with open(csvOut, 'w', newline='') as f:
        fieldnames = ['Name', 'Representative', 'Sequence']
        thewriter = csv.DictWriter(f, fieldnames=fieldnames)
        thewriter.writeheader()
        for given, rep in seqDict.items():
            thewriter.writerow({'Name': given,
                                'Representative': rep,
                                'Sequence': seqsforCSV[rep]})
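# Example invocation (a sketch; the script and file names are hypothetical).
# The script shells out to NCBI BLAST+, so makeblastdb and blastp must be on
# the PATH:
#
#   python matchmyseqs.py -i full_db.fa -q queries.fa -db mydb -r uniprotkb -o matched
#
# This writes matched.fa (the representative sequences) and matched.csv
# (query name, representative, sequence).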
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input FASTA file", required=True)
    parser.add_argument("-db", "--database", help="Database output file name", required=True)
    parser.add_argument("-r", "--redundancy", nargs='*', help="List of redundancy levels",
                        default=[90, 80, 70])
    parser.add_argument("-t1", "--tier1", help="User's Tier-1 sequences")
    parser.add_argument("-t2", "--tier2", help="User's Tier-2 sequences")
    parser.add_argument("-ml", "--maxlength", help="Maximum length a sequence can be",
                        default=800)
    parser.add_argument("-e", "--eval", nargs='*', help="List of e-values",
                        default=[1e-100, 1e-75, 1e-50, 1e-20, 1e-10, 1e-5])
    args = parser.parse_args()

    tier2 = {}
    tier2_annots = {}  # annotations that we want to include in the final dataset
    if args.tier2:
        print("Tier-2 sequences have been provided")
        if '.fa' in args.tier2 or '.fasta' in args.tier2:
            print("Tier-2 sequences are provided as a FASTA file")
            tier2db = sequence.readFastaFile(args.tier2, sequence.Protein_Alphabet,
                                             ignore=True, parse_defline=False)
            tier2_list = {}       # map from "long" name to actual entry
            tier2_map_short = {}  # map from "short" name to entry
            for s in tier2db:
                tier2_list[s.name] = s
                tier2_map_short[sequence.parseDefline(s.name)[0]] = s
            # note: the original printed this count before tier2_list was built
            print(str(len(tier2_list)) + " sequences in Tier-2")
        else:
            print("Please provide a FASTA file for Tier-2")

    if args.tier1:
        tier1 = {}
        tier1_annots = {}  # annotations that we want to include in the final dataset
        print("Tier-1 sequences have been provided")
        if '.fa' in args.tier1 or '.fasta' in args.tier1:
            print("Tier-1 sequences are provided as a FASTA file")
            tier1db = sequence.readFastaFile(args.tier1, sequence.Protein_Alphabet,
                                             ignore=True, parse_defline=False)
            tier1_list = {}
            for s in tier1db:
                tier1_list[s.name] = "".join(s.sequence)
            print("Tier-1 has " + str(len(tier1_list)) + " sequences")
        else:
            print("Please provide a FASTA file for Tier-1")

    db100 = sequence.readFastaFile(args.input, sequence.Protein_Alphabet,
                                   ignore=True, parse_defline=False)
    db100_map = {}        # map from "long" name to actual entry
    db100_map_short = {}  # map from "short" name to entry
    for s in db100:
        db100_map[s.name] = s
        db100_map_short[sequence.parseDefline(s.name)[0]] = s
    print("Database has " + str(len(db100_map)) + " sequences")

    # Cluster the database at each redundancy level with CD-HIT
    for rr in args.redundancy:
        rs = str(rr)
        os.system('cd-hit -i ' + args.input + ' -c 0.' + rs + ' -T 5 -o db' + rs + ' -d 0')

    # From each cluster, pick every Tier-1 member; failing that, prefer a
    # "reviewed" (sp|) entry, then the shortest qualifying sequence
    selected = {}
    for rr in args.redundancy:
        selected[rr] = []
        filename = 'db' + str(rr) + '.clstr'
        clusters = readCDHIT(filename)
        for c in clusters:
            picked_one = False
            shortest = None
            reviewed = None
            for name in clusters[c]:
                if name in db100_map:
                    seq = db100_map[name]
                    if shortest:
                        if len(seq) < len(shortest) and not disqualified(seq, args):
                            shortest = seq
                    elif not disqualified(seq, args):
                        shortest = seq
                    if seq.name.startswith('sp|') and not disqualified(seq, args):
                        reviewed = seq
                    if name in tier1_list:
                        selected[rr].append(seq)
                        picked_one = True
                else:
                    pass  # did not find the name in the database
            if not picked_one and reviewed:
                selected[rr].append(reviewed)
            elif not picked_one and shortest:
                selected[rr].append(shortest)

    for rr in args.redundancy:
        filename = 'db' + str(rr) + '.fa'
        sequence.writeFastaFile(filename, selected[rr])

    for rr in args.redundancy:
        os.system('makeblastdb -dbtype prot -in db' + str(rr) + '.fa -out db-' + str(rr))

    # The dataset-<rr>-<evalue>.txt files read below were produced by BLAST
    # searches along these lines:
    # for rr in args.redundancy:
    #     for evalue in args.eval:
    #         result_file = "dataset-" + str(rr) + '-' + str(evalue)
    #         cmd1 = ("blastp -db db-" + str(rr) +
    #                 " -outfmt 3 -num_descriptions 20000 -num_alignments 0 -num_threads 5"
    #                 " -query " + args.tier1 + " -out " + result_file + ".txt -evalue " + str(evalue))
    #         print(cmd1)
    #         os.system(cmd1)

    grab = False
    for rr in args.redundancy:
        for evalue in args.eval:
            tpsIdentifier = set([])
            seqs = []
            result_file = "dataset-" + str(rr) + '-' + str(evalue)
            f = open(result_file + '.txt', 'rt')
            for row in f:
                if row.startswith('Sequences'):
                    grab = True
                    continue
                if grab:
                    if row.startswith('Lambda'):
                        grab = False
                    if not row.strip() == "":
                        identifier = row.split(' ')[0]
                        if identifier != "Lambda":
                            tpsIdentifier.add(identifier)
            f.close()
            for name in tpsIdentifier:
                try:
                    seq = db100_map[name]
                    info = ''
                    seqs.append(sequence.Sequence(seq.sequence, seq.alphabet, seq.name, info))
                except KeyError:
                    pass
            sequence.writeFastaFile(result_file + ".fa", seqs)
            print(result_file + " has " + str(len(seqs)) + " sequences")
    print('Done')

    totalSeqCount = []
    for evalue in args.eval:
        for rr in args.redundancy:
            ev = str(evalue)[1:]  # drop the leading '1', e.g. '1e-100' -> 'e-100'
            red = str(rr)
            result_file = "dataset-" + str(rr) + '-' + str(evalue)
            a = sequence.readFastaFile(result_file + '.fa', sequence.Protein_Alphabet,
                                       ignore=True, parse_defline=False)
            names = set([])
            for s in a:
                names.add(s.name)
            tier1_cnt = 0
            tier2_cnt = 0
            seqs = []
            for name in names:
                try:
                    seq = db100_map[name]
                    info = ''
                    if name in tier1_list:
                        tier1_cnt += 1
                    elif name in tier2:
                        tier2_cnt += 1
                    seqs.append(sequence.Sequence(seq.sequence, seq.alphabet, seq.name, info))
                except KeyError:
                    pass  # did not find the name in the database
            print('Processed', len(seqs), 'for', result_file,
                  ' Tier-1:', tier1_cnt, ' Tier-2:', tier2_cnt)
            output = [ev, red, len(seqs)]
            totalSeqCount.append(output)
    plotSeqs(totalSeqCount)
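# readCDHIT() and disqualified() are used above but not defined in this file.
# Minimal sketches, assuming CD-HIT's standard .clstr format (">Cluster N"
# headers followed by member lines such as "0  450aa, >sp|P12345|NAME... *")
# and that disqualified() only enforces the --maxlength cut-off:

def readCDHIT(filename):
    # Map each cluster id to the list of member sequence names.
    clusters = {}
    current = None
    with open(filename) as f:
        for line in f:
            if line.startswith('>Cluster'):
                current = line.strip().split()[-1]
                clusters[current] = []
            elif current is not None and '>' in line:
                # the member name sits between '>' and the terminating '...'
                name = line.split('>', 1)[1].split('...')[0]
                clusters[current].append(name)
    return clusters

def disqualified(seq, args):
    # A sequence is disqualified if it exceeds the maximum allowed length.
    return len(seq) > int(args.maxlength)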
print "Q5 sequence has ", len(sequences_q5), " entries" print "Q6 sequence has ", len(sequences_q6), " entries" ids_q5 = [] ids_q6 = [] for sequence in sequences_q5: ids_q5.append(sequence.name) for sequence in sequences_q6: ids_q6.append(sequence.name) common_ids = set(ids_q5).intersection(set(ids_q6)) print len(common_ids), " common matches found" """ save the common entries into a FASTA file as well as a dictionary of id:sequence object map """ result_sequences = [] result_dictionary = {} for sequence in sequences_q5: if sequence.name in common_ids: result_sequences.append(sequence) result_dictionary[sequence.name] = sequence seq.writeFastaFile(ex7_filename, result_sequences) print "saved results to ", ex7_filename
    print('Warning:', nodename, 'has not got a sequence')
all_children = tree.getDescendantsOf(node, transitive=True)
if all_children is None:
    print('Warning:', nodename, 'has no children')
    all_children = []  # avoid iterating over None below
direct_ancestors = tree.getAncestorsOf(node, transitive=True)
if direct_ancestors is None:
    print('Warning:', nodename, 'has no ancestors')
    direct_ancestors = []
relevant = [node.sequence]
for child in all_children:
    if not child.sequence:
        pass  # child has a sequence which is None
    else:
        # keep the child only if its aligned sequence is not all gaps
        allgaps = True
        for pos in child.sequence:
            if pos != '-':
                allgaps = False
                break
        if not allgaps:
            relevant.append(child.sequence)
for parent in direct_ancestors:
    if not parent.sequence:
        print('Warning:', nodename, 'has an ancestor', parent.label,
              'with a sequence which is None')
    else:
        relevant.append(parent.sequence)
relevant_aln = sequence.Alignment(relevant)
saveConsensus(relevant_aln, countgaps=True, filename=nodename + ".txt")
sequence.writeFastaFile(nodename + ".fa", relevant_aln.seqs)
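# saveConsensus() is called above but not shown. A minimal sketch, assuming a
# plain majority-rule consensus over the alignment columns; when countgaps is
# False, '-' symbols are excluded from the vote:

def saveConsensus(aln, countgaps=False, filename='consensus.txt'):
    # Walk the alignment column by column; list(s) works because the
    # Sequence objects are iterable over their symbols.
    columns = zip(*[list(s) for s in aln.seqs])
    consensus = []
    for col in columns:
        counts = {}
        for sym in col:
            if sym == '-' and not countgaps:
                continue
            counts[sym] = counts.get(sym, 0) + 1
        # pick the most frequent symbol; '-' if the column held only gaps
        consensus.append(max(counts, key=counts.get) if counts else '-')
    with open(filename, 'w') as f:
        f.write(''.join(consensus) + '\n')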