#Converts FASTQ files from NCBI to FASTA files.
#Can take a file name or a list of file names as an input.
#Last updated 2015.8.10

import sys #for helping with command line interface
import dante #main dante functions
import traceback #for helping with errors

try:
    for file_name in dante.makeFileList(sys.argv):
        file_name_new = dante.makeNewFileName("Desktop/Output", file_name, ".fasta")
        dante.printOutput(file_name_new)
        with open(file_name, 'r') as f, open(file_name_new, 'w') as g:
            #Tracks which line of a FASTQ record is expected next.
            #Assumes four-line records (header, sequence, "+", quality) with the
            #sequence on a single line; blank lines are ignored.
            expecting = "header"
            for line in f:
                if line.isspace():
                    continue
                if expecting == "sequence":
                    #the line right after a header is the sequence: copy it as-is
                    g.write(line)
                    expecting = "separator"
                elif expecting == "separator" and line[0] == "+":
                    expecting = "quality"
                elif expecting == "quality":
                    #Quality strings may legitimately start with "@", so this line
                    #is skipped without ever being tested as a header.
                    expecting = "header"
                elif line[0] == "@":
                    #change header information to fasta format (swap "@" for ">")
                    g.write(">" + line[1:])
                    expecting = "sequence"
except Exception:
    #Report the failure and exit with a non-zero status so callers can detect it.
    traceback.print_exc(file=sys.stdout)
    sys.exit(1)
#NOTE(review): this excerpt starts at the try statement; the imports it relies on
#(sys, dante) and the matching except clause are outside the visible text.
try:
    #Ask for the maximum length of a name.
    #NOTE(review): the answer is kept as text (raw_input already returns a string,
    #so the str() call is redundant); nothing visible converts it to a number.
    max_name_length = str(raw_input('What is the maximum name length allowed?:' ))

    #Decide if file names are present as arguments or if the script needs to ask.
    #sys.argv[0] is always the script name, so both files were given only when
    #len(sys.argv) is greater than 2.
    if len(sys.argv) > 2:
        #NOTE(review): file_names is not read anywhere in the visible code - looks
        #like a leftover from another script; confirm before removing.
        file_names = sys.argv[1:]
        tre_file = sys.argv[1]
        names_file = sys.argv[2]
    else:
        tre_file = str(raw_input('Enter the tree file to be changed: '))
        names_file = str(raw_input('Enter the names file: '))

    #The output file name is derived from the tree file name.
    file_name_new = dante.makeNewFileName ("Desktop/Output", tre_file, ".namesfixed.tre")
    dante.printOutput(file_name_new)

    names_dict ={} #Key = short name. Value = long name.
    current_key = ""

    #Goes through every line of the names file.
    #The lines are consumed in pairs: key_switch is True on the first line of a pair,
    #which is stored (stripped of its newline) as the dictionary key.
    #NOTE(review): presumably the second line of each pair is the long name - the
    #else-branch that would show this lies beyond the end of this excerpt.
    with open (names_file, 'r') as f:
        key_switch = True
        for line in f:
            if key_switch:
                key_switch = False
                current_key = line.strip() #Creates a key to use in a dictionary. Removes newline from key.
            else:
#FASTA to PHYLIP + NAMES #This program simplifies names to 10 characters and creates a cross reference file. #It then returns a PHYLIP sequential formatted sequence file. #This program allows multiple file names to be called as arguments. #Last updated: 2015.8.12 import sys #for helping with command line interface import dante #main dante functions import traceback #for helping with errors try: for file_name in dante.makeFileList(sys.argv): file_clean = dante.fastaClean(file_name) file_name_phylip = dante.makeNewFileName ("Desktop/Output", file_clean, ".phylips") file_name_names = dante.makeNewFileName ("Desktop/Output", file_clean, ".dnames") with open(file_clean,'r') as f: #NEED TO GO THROUGH ALL THE LINES OF THE REFERENCE FILE AND WRITE THE HEADER LINE TO A NEW LIST #create phylip header (count number of sequences and the number of characters in first sequence) sequence_number = 0 character_number = 0 for line in f: if line [0] == ">": sequence_number = sequence_number + 1 else:
#Get search information from user.
#Every prompt is written to the log followed by the answer that was given.
rep_number = dante.getNumber("How many representatives do you want?")
dante.log("How many representatives do you want?")
dante.log(rep_number)
database = str(raw_input("Which database do you want to search?"))
dante.log("Which database do you want to search?")
dante.log(database) #the answer was previously missing from the log
blast_db_format = 5 #exports hits as XML
dante.log("Database format")
dante.log(blast_db_format)

#allows multiple inputs at command line or will ask for an input file
for filename in dante.makeFileList(sys.argv):
    dante.log("Input Filename")
    dante.log(filename)

    #The XML file name records the database and the number of hits requested.
    xml_suffix = ".BLAST." + database + ".top" + str(rep_number) + "hits.xml"
    file_name_xml = dante.makeNewFileName("Desktop/Output", filename, xml_suffix)
    dante.log("Filename of XML file")
    dante.log(file_name_xml)

    #The summary file name is derived from the XML file name.
    file_name_summary = dante.makeNewFileName("Desktop/Output", file_name_xml, ".summary.tsv")
    dante.log("Filename of Summary file")
    dante.log(file_name_summary)

    #Build the blastn command (remote=True is passed, so the search is not local),
    #log it, then run it; the hits are written to file_name_xml.
    blastn_cline = NcbiblastnCommandline(remote=True,
                                         query=filename,
                                         db=database,
                                         outfmt=blast_db_format,
                                         out=file_name_xml,
                                         max_target_seqs=rep_number)
    dante.log(blastn_cline)
    stdout, stderr = blastn_cline()

    #Summarise the XML hits into the summary file.
    dante.BLASTSummary(file_name_xml, file_name_summary)
    print("_____Summary_____")
#REQUIRES BIOPYTHON TO BE INSTALLED
#REQUIRES DANTE MODULE
#Runs dante.BLASTSummary on every input file and writes the result next to a
#".summary.tsv" name derived from the input name.

import sys #for helping with command line interface
import dante #main dante functions
import traceback #for helping with errors

try:
    dante.log("Running BLAST_Summary.py")
    #allows multiple inputs at command line or will ask for an input file
    for filename in dante.makeFileList(sys.argv):
        file_name_new = dante.makeNewFileName("Desktop/Output", filename, ".summary.tsv")
        dante.log("Original File Name: " + filename)
        dante.log("New File Name: " + file_name_new)
        dante.BLASTSummary(filename, file_name_new)
        #Single-argument print calls produce the same text under Python 2 and 3
        #(the two spaces reproduce the original comma-separated print output).
        print("_____Summary_____")
        print("For filename:  " + filename)
        print("Output file:  " + file_name_new)
except Exception:
    #Catch ordinary errors only, so Ctrl-C and sys.exit are not swallowed.
    #The traceback goes to both the screen and the log, and the script exits with
    #a non-zero status so a failed run is not reported as success.
    traceback.print_exc(file=sys.stdout)
    dante.log(traceback.format_exc())
    sys.exit(1)
print ("NCBI databse requires an e-mail address. ") Entrez.email = dante.getEmail() #allows multiple inputs at command line or will ask for an input file for filename in dante.makeFileList(sys.argv): name_set = set() name_list=[] result_handle = open(filename,'r') #open the xml file for reading blast_records = NCBIXML.parse(result_handle) #parses the file to a blast_records object total = 0 for record in blast_records: #go through every record generated for alignment in record.alignments: name_set.add(alignment.title.split('|')[1]) #adds id number to set (removes duplicates) new_file = dante.makeNewFileName ('Desktop/Output', filename, "seqs.fasta") with open(new_file,'w') as f: for value in name_set: #walks through every id number #Biopython for retreving fasta files handle = Entrez.efetch(db="nucleotide", id=value, rettype="fasta", retmode="text") f.write(handle.read()) dante.log("Program Ran: BlastXMLtoFasta.py") dante.log("Input file: " + filename) dante.log("Output file:" + new_file) except: traceback.print_exc(file=sys.stdout) exit(0)
second_file_name = sys.argv[2] else: first_file_name = str(raw_input('Enter the file to be changed: ')) second_file_name = str(raw_input('Enter the reference file that contains the correct header names: ')) #NEED TO GO THROUGH ALL THE LINES OF THE REFERENCE FILE AND WRITE THE HEADER LINE TO A NEW LIST headerDict = {} with open (first_file_name, 'r') as f: for line in f: if line[0] == ">": headerDict[line.split('|')[1]] = line #open file using user supplied name new_file = dante.makeNewFileName ('Desktop/Output', second_file_name, ".namesfixed.fasta") with open(second_file_name,'r') as f, open(new_file,'w') as g: print "\n" print ("Output File Name: %s") %(new_file) print "\n" for line in f: #if the line is a new header line, write instead the top of the list flag = True if line[0] == ">": #Header lines in fasta format all start with '>' test_id = line.split('|')[1] #This only works on NCBI formatted files for key in headerDict: if str(test_id) == str(key): #tests id against key g.write(headerDict[key]) flag = False if flag: g.write(line)