def fix_mate_pairs(fq1, fq2, f_suffix="/1", r_suffix="/2"):
    """Takes two FASTQ files (fq1 and fq2) of paired-end sequencing data
    and filters out reads without a mate pair."""
    fq1_out = append_stem(fq1, "fixed")
    fq2_out = append_stem(fq2, "fixed")
    fq1_single = append_stem(fq1, "singles")
    fq2_single = append_stem(fq2, "singles")
    # skip the work if all four output files already exist
    if all(map(file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    f_dict = SeqIO.index(fq1, "fastq",
                         key_function=get_read_name_function(f_suffix))
    r_dict = SeqIO.index(fq2, "fastq",
                         key_function=get_read_name_function(r_suffix))

    with open(fq1_out, 'w') as fq1_out_handle, \
         open(fq2_out, 'w') as fq2_out_handle, \
         open(fq1_single, 'w') as fq1_single_handle, \
         open(fq2_single, 'w') as fq2_single_handle:
        for key in f_dict:
            if key in r_dict:
                fq1_out_handle.write(f_dict.get_raw(key))
                fq2_out_handle.write(r_dict.get_raw(key))
            else:
                fq1_single_handle.write(f_dict.get_raw(key))
        for key in r_dict:
            if key not in f_dict:
                fq2_single_handle.write(r_dict.get_raw(key))
    return [fq1_out, fq2_out]
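# --- Hedged aside (not from the original source) ---
# A minimal standalone sketch of the two SeqIO.index features the function
# above relies on: a key_function that normalises read names, and get_raw()
# to copy records verbatim. "reads_1.fastq" and "subset.fastq" are hypothetical
# file names; note that in current Biopython get_raw() returns bytes, so the
# output file is opened in binary mode here.
from Bio import SeqIO

reads = SeqIO.index("reads_1.fastq", "fastq",
                    key_function=lambda name: name.split("/")[0])
with open("subset.fastq", "wb") as out_handle:
    for read_name in reads:
        out_handle.write(reads.get_raw(read_name))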
def parse_predicted_CDS_file(cds_file):
    """Parse the CDS file and index it.

    Takes in the CDS file and uses Biopython to index it."""
    # this is for TransDecoder names. May need to alter for other tools
    try:
        cds_database = SeqIO.index(cds_file, "fasta",
                                   key_function=strip_to_match_transcript_name)
        return cds_database
    except ValueError:
        outstr = ("WARNING: multiple CDS were predicted per transcript\n" +
                  "\t- cannot change names. Going to pick the longest representative\n" +
                  "\tCDS per transcript. Only do this if there are multiple CDS\n" +
                  "\tpredicted per transcript, otherwise this message is not shown\n")
        logger.info(outstr)
        cds_database = SeqIO.index(cds_file, "fasta")
        # basically there are duplicates for each transcript.
        # So, find the longest representative and
        # create a new cds_database based on that.
        # call function
        longest_rep = os.path.join("temp_fix_five_prime",
                                   "longest_representative_seq.fasta")
        cds_database_new = find_longest_components(cds_file, cds_database,
                                                   longest_rep)
        # return a seq_record object that can be
        # accessed in a dictionary-like manner
        return cds_database_new
def extract_nbs(fi, fo, fd_cds, fd_pro, fo_pro, fn_cds, fn_pro):
    cds_dict = SeqIO.index(fd_cds, "fasta")
    pro_dict = SeqIO.index(fd_pro, "fasta")
    fhi = open(fi, "r")
    fho = open(fo, "w")
    header = fhi.readline().strip("\n")
    print >> fho, header
    (nbs_cds_rcds, nbs_pro_rcds, pro_rcds) = ([], [], [])
    for line in fhi:
        line = line.strip("\n")
        gid, size, doms, e, domstr, tag, beg, end = line.split("\t")
        if tag == "":
            continue
        beg, end = int(beg), int(end)
        cds = str(cds_dict[gid].seq[(beg - 1) * 3 : end * 3])
        pro = str(pro_dict[gid].seq[(beg - 1) : end])
        cds_rcd = SeqRecord(Seq(cds), id=gid, description="")
        pro_rcd = SeqRecord(Seq(pro), id=gid, description="")
        nbs_cds_rcds.append(cds_rcd)
        nbs_pro_rcds.append(pro_rcd)
        pro_rcds.append(pro_dict[gid])
        print >> fho, line
    fho.close()
    SeqIO.write(nbs_cds_rcds, fn_cds, "fasta")
    SeqIO.write(nbs_pro_rcds, fn_pro, "fasta")
    SeqIO.write(pro_rcds, fo_pro, "fasta")
def __init__(self, ref_fpath, orf_seq_fpath):
    self.orf_suffix = '_orf_seq'
    self.ref_index = SeqIO.index(ref_fpath, 'fasta')
    self.orf_seq_index = SeqIO.index(orf_seq_fpath, 'fasta')
    blossum_path = join(DATA_DIR, 'blossum90.csv')
    self.blosum = _parse_blossum_matrix(blossum_path)
    self.conf = {}
def seq_getter(transcripome, proteins, cds, blast, trinotate, names_file, Output_prefix):
    """Function to get the nt_seq/pep of genes of interest (names), then open
    the annotation database file and get the corresponding protein seq which
    was predicted by TransDecoder."""
    wanted = wanted_genes(names_file)
    ##########################################################################
    # This is the Trinotate SQL database in tab format
    # COLUMN 0 IS THE GENE / COMPONENT ID
    # COLUMN 1 IS THE TRANSCRIPT ID
    # COLUMN 5 IS THE PROTEIN ID
    # gene_id	transcript_id	sprot_Top_BLASTX_hit	TrEMBL_Top_BLASTX_hit	RNAMMER	prot_id	prot_coords	sprot_Top_BLASTP_hit	TrEMBL_Top_BLASTP_hit	Pfam	SignalP	TmHMM	eggnog	gene_ontology_blast	gene_ontology_pfam	transcript	peptide
    ##########################################################################
    transcript_info_dict = tabular_file_to_info(trinotate)
    transcript_to_protein_dict = transcript_to_protein(trinotate)
    top_blast_hit_dict = top_blast_hit_database(blast)
    # nucleotide seq out
    nucleotide_out_file = Output_prefix + ".nt.fasta"
    protein_out_file = Output_prefix + "_cds.pep.fasta"
    cds_nt_file_out = Output_prefix + "_cds.nt.fasta"
    # nt out
    f_nt_out = open(nucleotide_out_file, 'w')
    # protein file out
    f_PROTEIN_out = open(protein_out_file, 'w')
    # cds file out
    f_cds_out = open(cds_nt_file_out, 'w')
    # index the fasta files
    transcriptome_record_db = SeqIO.index(transcripome, "fasta")
    cds_record_db = SeqIO.index(cds, "fasta")
    pep_record_db = SeqIO.index(proteins, "fasta")
    for i in wanted:
        if i in transcriptome_record_db:
            record = transcriptome_record_db[i]
            SeqIO.write(record, f_nt_out, "fasta")
            transdecoder_protein = transcript_to_protein_dict[i]
            if transdecoder_protein != ".":
                peprecord = pep_record_db[transdecoder_protein]
                try:
                    peprecord.description = (top_blast_hit_dict[transdecoder_protein]
                                             + "\t" + transcript_info_dict[i])
                except KeyError:
                    # no top BLAST hit: fall back to the annotation info only
                    peprecord.description = transcript_info_dict[i]
                SeqIO.write(peprecord, f_PROTEIN_out, "fasta")
                cds_of_interest = cds_record_db[transdecoder_protein]
                cds_of_interest.description = peprecord.description  # TODO - fill in descr
                SeqIO.write(cds_of_interest, f_cds_out, "fasta")
    f_nt_out.close()
    f_cds_out.close()
    f_PROTEIN_out.close()
    return True
def seq_getter(blast_file, cds_file, protein_file, known_seq_db,
               known_name_list, folder_name):
    """Function to convert a top BLAST analysis (query versus seq) to a file
    containing these sequences."""
    print("current cds_file is false")
    cds_file = False
    if cds_file:
        # if we have a nt file... then create the files associated
        # with what is in the file, and get a filename dict
        nhandles = generate_dict_of_files(known_name_list, folder_name, "cds")
    if protein_file:
        # if we have an AA file... then create the files associated
        # with what is in the file, and get a filename dict
        phandles = generate_dict_of_files(known_name_list, folder_name, "pep")
    if protein_file:
        # index
        protein_sequences = SeqIO.index(protein_file, "fasta")
    if cds_file:
        nucleotide_sequences = SeqIO.index(cds_file, "fasta")
    print("Starting output...")
    names_already_printed = set([])
    blast_hits_wanted = open_blast_file(blast_file)
    for line in blast_hits_wanted:
        if not test_line(line):
            continue
        gene, blast_hit_matches = parse_blast_tab_hit(line)
        if cds_file:
            # get the nt seq from the gene models file
            seq_record = nucleotide_sequences[blast_hit_matches]
            SeqIO.write(seq_record, nhandles[gene], "fasta")
        if protein_file:
            # get the AA seq from the gene models file
            seq_record = protein_sequences[blast_hit_matches]
            SeqIO.write(seq_record, phandles[gene], "fasta")
            if gene not in names_already_printed:
                # get the seq from the known db
                seq_record = known_seq_db[gene]
                SeqIO.write(seq_record, phandles[gene], "fasta")
                names_already_printed.add(gene)
    # loop to close all the open files. There could be many!!
    if cds_file:
        for gene in known_name_list:
            nhandles[gene].close()
    for gene in known_name_list:
        phandles[gene].close()
def simple_check(self, filename, format, alphabet):
    if format in SeqIO._BinaryFormats:
        mode = "rb"
    else:
        mode = "r"
    id_list = [rec.id for rec in
               SeqIO.parse(open(filename, mode), format, alphabet)]
    # Without key_function
    rec_dict = SeqIO.index(filename, format, alphabet)
    self.check_dict_methods(rec_dict, id_list, id_list)
    # Check with key_function
    key_list = [add_prefix(id) for id in id_list]
    rec_dict = SeqIO.index(filename, format, alphabet, add_prefix)
    self.check_dict_methods(rec_dict, key_list, id_list)
def FastaGeneIDExtract(inffile, ids):
    outFile = open("temp.fas", "w")
    records_all = SeqIO.index(inffile, "fasta")
    heads = []
    for id in ids:
        heads.append(id)
    count = 0
    for head in heads:
        outFile.write(str(">" + head + "\n" + records_all[head].seq + "\n"))
        count = count + 1
    outFile.close()
    print "Filtered fasta sequences extracted: ", count
    records_all = SeqIO.index("temp.fas", "fasta")
    os.system("rm temp.fas -f ")
    return records_all
def get_raw_check(self, filename, format, alphabet): if format in SeqIO._BinaryFormats: #This means SFF at the moment, which does not get #implement the get_raw method return handle = open(filename, "rU") raw_file = handle.read() handle.close() #Also checking the key_function here id_list = [rec.id.lower() for rec in \ SeqIO.parse(filename, format, alphabet)] rec_dict = SeqIO.index(filename, format, alphabet, key_function = lambda x : x.lower()) self.assertEqual(set(id_list), set(rec_dict.keys())) self.assertEqual(len(id_list), len(rec_dict)) for key in id_list: self.assert_(key in rec_dict) self.assertEqual(key, rec_dict[key].id.lower()) self.assertEqual(key, rec_dict.get(key).id.lower()) raw = rec_dict.get_raw(key) self.assert_(raw.strip()) self.assert_(raw in raw_file) if format in ["ig"]: #These have a header structure and can't be parsed #individually (at least, not right now). continue rec1 = rec_dict[key] rec2 = SeqIO.read(StringIO(raw), format, alphabet) self.assertEqual(True, compare_record(rec1, rec2))
def compute_scores(self): def score(record): seq = record.seq fracHelix = self.frac(seq, r'[HGI]') fracSheet = self.frac(seq, r'[BE]') fracBend = self.frac(seq, r'[STL]') fracLoop = self.frac(seq, r'C') return (fracHelix, fracSheet, fracBend, fracLoop) dtype=[('PrimaryID', '|S20'), ('fracHelix', float), ('fracSheet', float), ('fracBend', float), ('fracLoop', float), ] fastaPath = self.rawInputFilePath recordDict = SeqIO.index(fastaPath, "fasta") proteinIds = set([]) allScores = [] for key in recordDict.keys(): match = re.search(r'(\w+)\|.*dssp', key) if match: pid = match.group(1) if pid not in proteinIds: seqRec = recordDict.get(key) allScores.append((pid,) + score(seqRec)) proteinIds.add(pid) allScores = np.array(allScores, dtype=dtype) return allScores
def reduce(self, long_percent=10, merged_path=None, output_path=None):
    """Selects the longest 10% of proteins from the merged fasta file

    Parameters
    ----------
    long_percent : float, optional
        Determines the percentage of long proteins to be used for
        creating protein families
    merged_path : basestring, optional
        Path to merged faa file. If None, the path used by
        self.merge_genomes_files is used.
    output_path : basestring, optional
        Output path. If None, saves 'reduced.faa' in self.output_dir.
    """
    if output_path is None:
        output_path = os.path.join(self._output_dir, 'reduced.faa')
    if merged_path is None:
        if not hasattr(self, 'merged_path_'):
            raise ValueError('No merged fasta file')
        merged_path = self.merged_path_
    lens_and_ids = sorted([(len(rec), rec.id) for rec in
                           SeqIO.parse(merged_path, 'fasta')], reverse=True)
    ids = [id for (length, id) in lens_and_ids]
    del lens_and_ids
    # keep the requested percentage of the longest sequences
    ids = ids[: len(ids) * long_percent // 100]
    rec_index = SeqIO.index(merged_path, 'fasta')
    with open(output_path, 'wb') as out_file:
        for id in ids:
            SeqIO.write(rec_index[id], out_file, 'fasta')
    print 'Saving reduced proteins as {}'.format(output_path)
    self.reduced_path_ = os.path.abspath(output_path)
def fetch_fasta_db( table_name, download_url, fasta_filename=None, key_column='id', value_column='seq', subdir=None, version=1): """ Download a FASTA file from `download_url` and store it locally as a sqlite3 database. """ base_filename = normalize_filename(split(download_url)[1]) db_filename = "%s.%s.%s.db" % (base_filename, key_column, value_column) fasta_path = fetch_file( download_url=download_url, filename=fasta_filename, subdir=subdir, decompress=True) fasta_dict = SeqIO.index(fasta_path, 'fasta') table = DatabaseTable.from_fasta_dict( table_name, fasta_dict, key_column=key_column, value_column=value_column) db_path = build_path(db_filename, subdir) return _create_cached_db( db_path, tables=[table], version=version)
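# --- Hedged aside (not from the original source) ---
# Related built-in alternative to rolling a custom sqlite3 table as above:
# Biopython's SeqIO.index_db() persists the record offsets in an SQLite file
# itself, so the lookup table survives between runs (the same call is
# exercised in one of the test snippets further down). "example.fasta" and
# "example.idx" are hypothetical file names.
from Bio import SeqIO

records = SeqIO.index_db("example.idx", "example.fasta", "fasta")
print(len(records))   # number of indexed records
records.close()       # release the SQLite connection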
def find_longest_components(filename1, cds_database, out_filename):
    """This is a function to open up a fasta file and produce a list of the
    longest representative transcripts per gene. This is only called if there
    are duplicates found with the same prefix name.
    Returns a new cds database with the longest cds per transcript only."""
    # this is our list of so-called longest matches which we will
    # append to and remove from as applicable
    top_hits = []
    # current sequence length score value "to beat"
    current_length = int(0)
    # set up variables to assign the latest values to ...
    transcriptome_Genes_names = set([])
    last_gene_name = ""
    last_component = ""
    loop_count = 0
    for seq_record in SeqIO.parse(filename1, "fasta"):
        sequence_len = len(seq_record)
        sequence_name = seq_record.id
        component = strip_to_match_transcript_name(sequence_name)
        # first time we see any record, save the values:
        if loop_count == 0:
            loop_count = loop_count + 1
            last_gene_name = sequence_name
            current_length = sequence_len
            last_component = component
            top_hits.append(seq_record.id)
        #########################################
        # first block: if the names are the same,
        # is the new length of sequence longer?
        if component == last_component:
            # print ("yes:", component, "component", last_component,
            #        "last_component", seq_record.id)
            # print ("current_length", current_length)
            if sequence_len > current_length:
                # print ("sequence_len > current_length", sequence_len,
                #        current_length)
                del top_hits[-1]
                top_hits.append(seq_record.id)
        ##########################################################
        # second block: if the name is new, put it in the name set.
        # use this sequence length as the new one to "beat"
        else:
            top_hits.append(seq_record.id)
            last_gene_name = sequence_name
            current_length = sequence_len
            last_component = component
    outfile = open(out_filename, "w")
    for i in top_hits:
        seq_record = cds_database[i]
        SeqIO.write(seq_record, outfile, "fasta")
    outfile.close()
    cds_database_new = SeqIO.index(out_filename, "fasta",
                                   key_function=strip_to_match_transcript_name)
    return cds_database_new
def setUp(self): self.aln_file = [TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3, TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6] alns = [] for i in self.aln_file: if i[1] == 'parse': nucl = SeqIO.parse(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA()) prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein) with warnings.catch_warnings(): warnings.simplefilter('ignore') caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet) elif i[1] == 'index': nucl = SeqIO.index(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA()) prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein) with warnings.catch_warnings(): warnings.simplefilter('ignore') caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet, max_score=20) elif i[1] == 'id': nucl = SeqIO.parse(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA()) prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein) with open(i[0][2]) as handle: id = dict((i.split()[0], i.split()[1]) for i in handle) with warnings.catch_warnings(): warnings.simplefilter('ignore') caln = codonalign.build(prot, nucl, corr_dict=id, alphabet=codonalign.default_codon_alphabet) alns.append(caln) nucl.close() # Close the indexed FASTA file self.alns = alns
def blastclust_to_fasta(infname, seqfname, outdir):
    """Converts input BLASTCLUST output list to a subdirectory of FASTA files.

    Each individual FASTA file contains all sequences from a single cluster.
    The sequences matching the IDs listed in the BLASTCLUST output .lst file
    should all be found in the same file.

    Returns the output directory and a list of the files, as a tuple.
    """
    outdirname = os.path.join(outdir, "blastclust_OTUs")
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)
    seqdict = SeqIO.index(seqfname, 'fasta')
    outfnames = []
    with open(infname, 'r') as fh:
        otu_id = 0
        for line in fh:
            otu_id += 1
            outfname = os.path.join(outdirname,
                                    "blastclust_OTU_%06d.fasta" % otu_id)
            SeqIO.write((seqdict[key] for key in line.split()),
                        outfname, 'fasta')
            outfnames.append(outfname)
    return (outdirname, outfnames)
def rename_seq_id_to_bin_id(bin_fa_fns, header_prefix="scaffold",
                            sample_ids_abbrev={"GZ-Xyl_Y2": "GX2", "GZ-Xyl_Y1": "GX1",
                                               "GZ-Seed_Y0": "GS0", "GZ-Cell_Y1": "GC1",
                                               "GZ-Cell_Y2": "GC2", "SWH-Xyl_Y2": "SX2",
                                               "SWH-Xyl_Y1": "SX1", "SWH-Seed_Y0": "SS0",
                                               "SWH-Cell_Y1": "SC1", "SWH-Cell_Y2": "SC2",
                                               "SWH-Cell55_Y2": "S52"}):
    print("Combining bin files (" + str(len(bin_fa_fns)) + ")")
    bin_id_map = {}
    for bin_fa_fn in bin_fa_fns:
        print("Reading from " + bin_fa_fn)
        sample_id = os.path.basename(bin_fa_fn)
        bin_id = sample_id[::-1].split(".", 1)[1][::-1]
        sample_id = bin_id.split(".", 1)[0]
        if sample_id in sample_ids_abbrev.keys():
            # look up the abbreviation only for known sample ids, otherwise
            # the lookup itself would raise a KeyError before the check
            new_sample_id = sample_ids_abbrev[sample_id]
            print(sample_id + " to " + new_sample_id)
            bin_id = bin_id.replace(sample_id, new_sample_id)
            #bin_id = bin_id.replace(".", "_")
            print("bin_id=" + bin_id)
            seqs = SeqIO.index(bin_fa_fn, "fasta")
            seq_ids = list(seqs.keys())
            m = {seq_id.replace(header_prefix, new_sample_id): bin_id
                 for seq_id in seq_ids}
            bin_id_map.update(m)
        else:
            print(sample_id + " does not exist in the provided sample id list.")
    return bin_id_map
def extract_species_info_from_fasta(fa_fn, out_fn=None): import re from Bio import SeqIO print("Processing " + fa_fn) seqs = SeqIO.index(fa_fn, "fasta") seq_ids = list(seqs.keys()) unknown_n = 0 info_n = 0 if out_fn is None: out_fn = fa_fn + ".gi" info_map = {} for seq_id in seq_ids: gi = seq_id.split("|")[1] desc = seqs[seq_id].description species = re.findall("\[(.+)\]", desc) if len(species) == 1: species = species[0] info_n = info_n + 1 else: species = "Unknown" unknown_n = unknown_n + 1 info_map[gi] = species sorted_map = sorted(info_map.items(), key=lambda x:x[1]) OUT = open(out_fn, "w") for (gi, species) in sorted_map: OUT.write(gi+"\t"+species+"\n") OUT.close() print("Export: " + str(info_n) + " (Unknown:" + str(unknown_n) + ")")
def __main__(): #Parse Command Line parser = optparse.OptionParser() parser = optparse.OptionParser(usage="python %prog [options]\n\nProgram designed by Guillaume MARTIN : [email protected]\n\n" "This script create junctions between scaffolds using a tabulated file.\n" "The input tabulated file look as followed:\n" ">chr1\n" "scaffold1 FWD\n" "scaffold2 FWD\n" "scaffold3 REV\n" ">...\n") # Wrapper options. parser.add_option( '', '--table', dest='table', default='not_filled', help='The table file of scaffold to join') parser.add_option( '', '--fasta', dest='fasta', default='not_filled', help='The multi-fasta scaffold file') parser.add_option( '', '--out', dest='out', default='super_contig.fasta', help='The multi-fasta output file name, [default: %default]') parser.add_option( '', '--out_verif', dest='out_verif', default='contig2verif.txt', help='The output file to give to verif_fusion.py, [default: %default]') (options, args) = parser.parse_args() #verifying file verif(options.table) #creating the scaffolds dico_fait = scaff(options.table, options.fasta, options.out, options.out_verif) #printing the remaining scaffold record_dict = SeqIO.index(options.fasta, "fasta") outfile = open(options.out,'a') for n in record_dict: if not(n in dico_fait): SeqIO.write(record_dict[n], outfile, "fasta") outfile.close()
def simple_check(self, filename, format, alphabet) : id_list = [rec.id for rec in \ SeqIO.parse(open(filename), format, alphabet)] rec_dict = SeqIO.index(filename, format, alphabet) self.assertEqual(set(id_list), set(rec_dict.keys())) #This is redundant, I just want to make sure len works: self.assertEqual(len(id_list), len(rec_dict)) #Make sure boolean evaluation works self.assertEqual(bool(id_list), bool(rec_dict)) for key in id_list : self.assert_(key in rec_dict) self.assertEqual(key, rec_dict[key].id) self.assertEqual(key, rec_dict.get(key).id) #Check non-existant keys, try : rec = rec_dict[chr(0)] raise ValueError("Accessing a non-existant key should fail") except KeyError : pass self.assertEqual(rec_dict.get(chr(0)), None) self.assertEqual(rec_dict.get(chr(0), chr(1)), chr(1)) #Now check iteritems... for key, rec in rec_dict.iteritems() : self.assert_(key in id_list) self.assert_(isinstance(rec, SeqRecord)) self.assertEqual(rec.id, key) #Now check non-defined methods... self.assertRaises(NotImplementedError, rec_dict.values) self.assertRaises(NotImplementedError, rec_dict.popitem) self.assertRaises(NotImplementedError, rec_dict.pop, chr(0)) self.assertRaises(NotImplementedError, rec_dict.pop, chr(0), chr(1)) self.assertRaises(NotImplementedError, rec_dict.clear) self.assertRaises(NotImplementedError, rec_dict.__setitem__, "X", None) self.assertRaises(NotImplementedError, rec_dict.copy) self.assertRaises(NotImplementedError, rec_dict.fromkeys, [])
def run_single(self, debug=0): warnings.simplefilter('always') warnings.warn("Deprecated method: run_BLAST.run_single\nBLAST single sequence, slow!! ", DeprecationWarning) print("Running AmiGO:BLAST") temp_output = open(self.outfile + "_temp", "w") if self.record_index == None: self.record_index = SeqIO.index(self.infile, "fasta") all_orfs = dict() for key in self.record_index: print key this_seq = GoSequence(key, self.record_index[key].seq) # Bio.SeqRecord.SeqRecord this_seq.blast_AmiGO() this_seq.extract_ID() this_seq.parse_go_term(self.e_threshold) # seq.combined_terms self.results[key] = this_seq all_orfs[key] = this_seq.combined_terms # print this_seq # print this_seq.combined_terms temp_output.write("%s \t %s\n" % (key, this_seq.combined_terms)) # temp_output.flush() # temp_output.close() self.counter = self.create_counter(all_orfs) # new_outfile = self.init_output(self.counter,0) # self.sample = self.update_sample_from_counters(new_outfile, self.counter) # hasattr output_csv(self.outfile, self.header, self.counter)
def run(self, debug=False): print("Running AmiGO:BLAST_Batch") # temp_output = open(self.outfile + "_temp", "w") if self.record_index == None: self.record_index = SeqIO.index(self.infile, "fasta") print "BLAST infile:%s" % self.infile # print self.wdir self.tempfile = self.wdir + "/AmiGO_Record.temp" go = GOConnector(seq_record=self.record_index, max_query_size=self.batch_size, e_value_cut_off=self.e_threshold, tempfile=self.tempfile, debug=self.debug) go.amigo_batch_mode() all_seqs = go.all_seqs all_orfs = dict() for seq in all_seqs: key = seq.seq_id self.results[key] = seq all_orfs[key] = seq.combined_terms # print this_seq # print this_seq.combined_terms # temp_output.write("%s \t %s\n" % (key, seq.combined_terms)) # temp_output.flush() # temp_output.close() self.counter = self.create_counter(all_orfs) # new_outfile = self.init_output(self.counter,0) # self.sample = self.update_sample_from_counters(new_outfile, self.counter) # hasattr output_csv(self.outfile, self.header, self.counter)
def create_cdna_intron_annotator(genomic_db, genomic_seqs_fhand): 'It creates a function that annotates introns in cdna matching with genomic' genomic_seqs_fhand = get_fhand(genomic_seqs_fhand) genomic_seqs_index = SeqIO.index(genomic_seqs_fhand.name, guess_seq_file_format(genomic_seqs_fhand)) def annotate_intron(sequence): 'It adds the orf to the SeqFeatures' if sequence is None: return try: introns = infer_introns_for_cdna(sequence=sequence, genomic_db=genomic_db, genomic_seqs_index=genomic_seqs_index) except KeyError as error: error = str(error).lstrip('u').strip("'") if 'not found' in error: error += ' in seq file %s, but present in blast db %s' % \ (genomic_seqs_fhand.name, genomic_db) raise RuntimeError(error) for intron_pos in introns: feature = SeqFeature(location=FeatureLocation(intron_pos, intron_pos), type='intron', qualifiers={'genomic_db':genomic_db}) sequence.features.append(feature) return sequence return annotate_intron
def sort_name(source_file, source_file_type, direction=1): """ Sort sequences by name. 1 is ascending (default) and 0 is descending. """ direction_text = 'ascending' if direction == 1 else 'descending' logging.info("Indexing sequences by name: %s", direction_text) # Adapted from the Biopython tutorial example. # Sort on id ids = sorted((rec.id) for rec in SeqIO.parse(source_file, source_file_type)) if direction == 0: ids = reversed(ids) # SeqIO.index does not handle gzip instances if isinstance(source_file, gzip.GzipFile): tmpfile = tempfile.NamedTemporaryFile() source_file.seek(0) tmpfile.write(source_file.read()) tmpfile.seek(0) source_file = tmpfile record_index = SeqIO.index(source_file.name, source_file_type) for id in ids: yield record_index[id]
def main(argv):
    wd_dir = "."
    aln_fn_ext = "aln"
    aln_fns = glob.glob(wd_dir + "/*." + aln_fn_ext)
    aln_format = "fasta"
    export_fn_ext = "renamed"
    map_fn = "map.lst"
    id_map = {}
    for aln_fn in aln_fns:
        print("Processing " + aln_fn)
        seqs = SeqIO.index(aln_fn, aln_format)
        out_fn = aln_fn + "." + export_fn_ext
        with open(out_fn, "w") as OUT:
            for id in seqs.keys():
                cflag_existed = False
                while not cflag_existed:
                    new_id = generate_id(id)
                    if new_id not in id_map.keys():
                        cflag_existed = True
                        #print("Mapping " + id + " to " + new_id)
                        id_map[new_id] = id + "\t" + os.path.basename(aln_fn)
                        seq = seqs[id]
                        seq.id = new_id
                        seq.name = new_id
                        seq.description = ""
                        SeqIO.write(seq, OUT, "fasta")
    with open(map_fn, "w") as OUT:
        for id in id_map:
            OUT.write(id + "\t" + id_map[id] + "\n")
def create_biopython_iterator(self, **kwargs):
    from Bio import SeqIO
    print "Generating BioPython sequence index. This may take a moment...."
    self.fasta = SeqIO.index(kwargs['input'], kwargs['data_type'])
    self.readcount = len(self.fasta)
    self.db_values = zip(range(len(self.fasta)), sorted(self.fasta.keys()))
    self.read = iter(self.db_values)
def ById (self, event): box=wx.TextEntryDialog(None, "Enter Sequence ID\nFor Multiple IDs use comma (,) as separator", "Sequence ID", "ID") if box.ShowModal() == wx.ID_OK: IDlist = box.GetValue() inFile = self.globalFile self.logger.AppendText("Input file: "+inFile+"\n"+"Start Time: "+str(time.asctime())+"\n") start_time = time.time() FastaFile=SeqIO.index(inFile, "fasta") outFile=open(inFile+"_by_ID.fasta", "w") IDlist=IDlist.replace(" ","") IDs=IDlist.split(",") for ID in IDs: if ID in FastaFile: outFile.write(FastaFile.get_raw(ID)) self.logger.AppendText("Wrote "+inFile+"_by_ID.fasta"+" "+str(time.asctime())+"\n") if ID not in FastaFile: self.logger.AppendText(ID+" not present in the Fasta file\n") outFile.close() end_time=str(time.time() - start_time) self.logger.AppendText("Fasta file making by ID is completed\n") self.logger.AppendText("Finish Time: "+str(time.asctime())+"\nTime elapsed: "+end_time+" seconds\n") self.logger.AppendText("--------------------------------------------------------------------------------------------------------\n\n") wx.MessageBox("Fasta file making by ID is completed\nTime elapsed: "+end_time+" seconds")
def seq_getter(filename_in, wantedfile, threshold, outfile):
    "script to get sequences of interest from a file of wanted genes"
    f = open(outfile, 'w')
    wanted = open(wantedfile, "r")
    names = wanted.readlines()
    #print names
    wanted_data = [line.replace("\t", "").rstrip("\n") for line in names
                   if line.strip() != ""]
    name_set = set([])
    for i in wanted_data:
        if not i.startswith("#"):
            i = i.rstrip()
            name_set.add(i)
    #print wanted_data
    cds_database = SeqIO.index(filename_in, "fasta")
    #record = SeqIO.read(filename, "fasta")
    for i in name_set:
        if "\r\n" in i:
            i = i.replace("\r\n", "")
        #print i
        seq_record = cds_database[i]
        # percentage of gap characters in the aligned sequence
        dashes = seq_record.seq.count("-")
        print 100 * (float(dashes) / len(seq_record.seq))
        if 100 * (float(dashes) / len(seq_record.seq)) < int(threshold):
            #print 'boomshanka'
            SeqIO.write(seq_record, f, "fasta")
    f.close()
    return True
def shortrna_regions(mirna_gff, star_csv, seq_file): """Return miRNA sequences with corresponding guide and star regions. """ seq_index = SeqIO.index(seq_file, "fasta") mirna_seqs = dict() with open(star_csv) as in_handle: for name, guide, star in csv.reader(in_handle): mirna_seqs[name] = (guide.strip(), star.strip()) for rec in GFF.parse(mirna_gff): cur_seq = str(seq_index[rec.id].seq) for f in rec.features: name = f.qualifiers["ID"][0] start, end = (f.location.nofuzzy_start, f.location.nofuzzy_end) yield (rec.id, start, end, name) #guide, star = mirna_seqs.get(name, ("", "")) for seq_name, guide, star in [(n, g, s) for n, (g, s) in mirna_seqs.iteritems() if n.startswith(name)]: for find_seq, ext in [(guide, "guide"), (star, "star")]: if find_seq: if f.strand == -1: find_seq = str(Seq(find_seq).reverse_complement()) region = cur_seq[start:end] pos = region.find(find_seq) if pos > -1: yield (rec.id, start + pos, start + pos + len(find_seq), "%s_%s" % (seq_name, ext)) else: print f.strand, name, ext, pos, find_seq, region raise NotImplementedError
def extract_long_reads():
    """Filter fastq to longest reads."""
    parser = argparse.ArgumentParser(
        description='Extract longest reads from a fastq.')
    parser.add_argument('input', help='Input .fastq file.')
    parser.add_argument('output', help='Output .fastq file.')
    parser.add_argument('longest', default=10, type=int,
                        help='Percentage of longest reads to partition.')
    parser.add_argument('--others', default=None,
                        help='Write all other reads to file.')
    args = parser.parse_args()

    record_dict = SeqIO.index(args.input, "fastq")
    ids = list(record_dict.keys())
    lengths = np.fromiter(
        (len(record_dict[i]) for i in ids),
        dtype=int, count=len(ids)
    )
    # the number of reads to keep must be an integer for argpartition/indexing
    max_reads = int(len(ids) * (args.longest / 100))
    longest = np.argpartition(lengths, -max_reads)[-max_reads:]

    SeqIO.write(
        (record_dict[ids[i]] for i in longest),
        args.output, 'fastq'
    )
    if args.others is not None:
        longest = set(longest)
        SeqIO.write(
            (record_dict[ids[i]] for i in range(len(ids)) if i not in longest),
            args.others, 'fastq'
        )
def sort_length(source_file, source_file_type, direction=1):
    """
    Sort sequences by length. 1 is ascending (default) and 0 is descending.
    """
    direction_text = 'ascending' if direction == 1 else 'descending'

    logging.info('Indexing sequences by length: %s', direction_text)

    # Adapted from the Biopython tutorial example.

    # Get the lengths and ids, and sort on length
    len_and_ids = sorted((len(rec), rec.id)
                         for rec in SeqIO.parse(source_file, source_file_type))
    if direction == 0:
        ids = reversed([seq_id for (length, seq_id) in len_and_ids])
    else:
        ids = [seq_id for (length, seq_id) in len_and_ids]
    del len_and_ids  # free this memory

    # SeqIO.index does not handle gzip instances
    if isinstance(source_file, gzip.GzipFile):
        tmpfile = tempfile.NamedTemporaryFile()
        source_file.seek(0)
        tmpfile.write(source_file.read())
        tmpfile.seek(0)
        source_file = tmpfile

    record_index = SeqIO.index(source_file.name, source_file_type)

    for seq_id in ids:
        yield record_index[seq_id]
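# --- Hedged aside (not from the original source) ---
# Both sort_name() and sort_length() above are generators, so they can be fed
# straight into SeqIO.write without materialising the sorted records in memory.
# "input.fasta" and "sorted.fasta" are hypothetical file names, and a plain
# (non-gzip) file handle is assumed.
from Bio import SeqIO

with open("input.fasta") as src:
    count = SeqIO.write(sort_length(src, "fasta", direction=0),
                        "sorted.fasta", "fasta")
print("Wrote %i records sorted by descending length" % count)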
def simple_check(self, filename, format, alphabet, comp): """Check indexing (without a key function).""" if comp: h = gzip_open(filename, format) id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)] h.close() else: id_list = [ rec.id for rec in SeqIO.parse(filename, format, alphabet) ] with warnings.catch_warnings(): if "_alt_index_" in filename: # BiopythonParserWarning: Could not parse the SFF index: # Unknown magic number b'.diy' in SFF index header: # b'.diy1.00' warnings.simplefilter('ignore', BiopythonParserWarning) rec_dict = SeqIO.index(filename, format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() del rec_dict if not sqlite3: return # In memory, # note here give filenames as list of strings rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() del rec_dict # check error conditions self.assertRaises(ValueError, SeqIO.index_db, ":memory:", format="dummy") self.assertRaises(ValueError, SeqIO.index_db, ":memory:", filenames=["dummy"]) # Saving to file... index_tmp = self.index_tmp if os.path.isfile(index_tmp): os.remove(index_tmp) # To disk, # note here we give the filename as a single string # to confirm that works too (convience feature). rec_dict = SeqIO.index_db(index_tmp, filename, format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict # Now reload it... rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict # Now reload without passing filenames and format # and switch directory to check paths still work index_tmp = os.path.abspath(index_tmp) os.chdir(os.path.dirname(filename)) rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict os.remove(index_tmp)
import sys import os import os.path import shutil import subprocess from Bio import SeqIO from Bio.SeqRecord import SeqRecord Barrnap = "/home/j/jparkins/mobolaji/Tools/Barrnap/bin/barrnap" Infernal = "/home/j/jparkins/mobolaji/Tools/Infernal/infernal-1.1.2-linux-intel-gcc/binaries/cmsearch" Rfam = "/home/j/jparkins/mobolaji/Databases/Rfam_rRNA.cm" reference_file = sys.argv[1] reference_sequences = SeqIO.to_dict(SeqIO.parse(reference_file, "fastq")) dedeplicated_file = sys.argv[2] dedeplicated_sequences = SeqIO.index(dedeplicated_file, "fastq") cluster_file = sys.argv[3] cluster_map = {} reduplicated_file = sys.argv[4] reduplicated_ids = set() reduplicated_seqs = [] with open(cluster_file, "r") as clustr_read: rep = "" seq_id = "" for line in clustr_read: if line.startswith(">"): continue elif line.startswith("0"): rep = line[line.find(">") + 1:line.find("...")] seq_id = rep
#hg38_pairwise_part = record_dict_hg38["NC_000001.11"].seq[1622920:1634474].upper() #hg38_pairwise_part = record_dict_hg38["NC_000001.11"].seq[1626620:1642639].upper() #hg37 = SeqIO.parse("/proj/ncgenes2/src/ncgenes2-exome-pipeline/modules/apps/human-genome-for-alignment/grch37/Homo_sapiens.GRCh37.dna_sm.primary_assembly.refseqids.fa", "fasta") #hg37_pairwise_part = record_dict_hg37["NC_000001.10"].seq[1558300:1569850].upper() #hg37_pairwise_part = record_dict_hg37["NC_000001.10"].seq[1562000:1578000].upper() #alignments_g = pairwise2.align.globalxx(hg38_pairwise_part, hg37_pairwise_part) #print(format_alignment(*alignments_g[0])) #alignments_l = pairwise2.align.localxx(hg38_pairwise_part, hg37_pairwise_part) #print(format_alignment(*alignments_l[0])) data = pd.read_csv('data.csv') record_dict_hg38 = SeqIO.index( "/proj/ncgenes2/src/ncgenes2-exome-pipeline/modules/apps/human-genome-for-alignment/1405.15/GRCh38_no_alt_analysis_set.refseqids.fna", "fasta") record_dict_hg37 = SeqIO.index( "/proj/ncgenes2/src/ncgenes2-exome-pipeline/modules/apps/human-genome-for-alignment/grch37/Homo_sapiens.GRCh37.dna_sm.primary_assembly.refseqids.fa", "fasta") def hg37_fun(d, dict37): #print(d['hg37_chr'], d['hg37_start'], d['hg37_end']) for index, row in d.head(n=10).iterrows(): hg37_p = dict37[row['hg37_chr']].seq[ int(row['hg37_start']):int(row['hg37_end'])].upper() #print("this should be the hg37 sequence") return hg37_p
#count reads inputfile = sys.argv[1] outputfile = sys.argv[2] chunksize = int(str( sys.argv[3])) #number of reads in each division of the input file threads = int(sys.argv[4]) fmt = sys.argv[5] #input format subprocess.Popen("rm -rf ~/scripts/tmp2", shell=True).wait() subprocess.Popen("mkdir ~/scripts/tmp2", shell=True).wait() out1 = open('/OSM/HOME-MEL/all29c/scripts/tmp2/finaloutput.ublast', 'w') count = SeqIO.index(inputfile, fmt) c = len(count) #print c a = int(c) print "num reads=", a if chunksize > a: chunksize = a numchunks = (a / chunksize) #print"Number of chunks=",numchunks #sys.stdout.write(str(numchunks)) t = 0
def _make_db(self):
    """
    :return: an index of the sequences contained in protfile corresponding
             to the replicon
    """
    return SeqIO.index(self._prot_file, "fasta",
                       alphabet=Seq.IUPAC.extended_protein)
for line in handle: if line.strip() and not line.startswith("#"): field = line.rstrip("\n").split("\t")[col].strip() parts = field.split(None, 1) if len(parts) > 1 and not warn: warn = "WARNING: Some of your identifiers had white space in them, " + \ "using first word only. e.g.:\n%s\n" % field yield parts[0] handle.close() if warn: sys.stderr.write(warn) # Index the sequence file. # If very big, could use SeqIO.index_db() to avoid memory bottleneck... records = SeqIO.index(in_file, seq_format) print("Indexed %i sequences" % len(records)) if seq_format.lower() == "sff": # Special case to try to preserve the XML manifest try: from Bio.SeqIO.SffIO import SffWriter except ImportError: sys.exit("Requires Biopython 1.54 or later") try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: # Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
description= 'Extract a subset of unique sequences, starting from a FASTA file \ containing the parent sequences', ) parser.add_argument('--fasta_file', nargs=1, required=True, help='FASTA file to be processed') parser.add_argument('--output_file', nargs=1, required=True, help='Output file') args = parser.parse_args() sequences_dict = SeqIO.index(args.fasta_file[0], 'fasta') sequences = sorted(sequences_dict.keys()) print "Loaded {0:s} sequences".format(str(len(sequences))) unique_seqs = [] duplicated_seqs = [] i = 0 j = 0 # Iterate over sequences ids while i < len(sequences): # Extract master sequence for comparison master_seq = sequences_dict[sequences[i]] duplicated_seqs = []
def PPHMMSignatureTable_Constructor( SeqIDLists, GenBankFile, TranslTableList, HMMER_PPHMMDB, HMMER_hmmscanDir, HMMER_N_CPUs=7, HMMER_C_EValue_Cutoff=1E-3, HMMER_HitScore_Cutoff=0, SeqLength_Cutoff=0, ): #Load GenBank record #--------------------------------------------------------------------- _, file_extension = os.path.splitext(GenBankFile) if file_extension in [".fas", ".fasta"]: Records_dict = SeqIO.index(GenBankFile, "fasta") elif file_extension in [".gb"]: Records_dict = SeqIO.index(GenBankFile, "genbank") Records_dict = {k.split(".")[0]: v for k, v in Records_dict.iteritems()} N_Seq = len(SeqIDLists) #Specify PPHMMQueryFile and PPHMMScanOutFile #--------------------------------------------------------------------- PPHMMDB_Summary = HMMER_PPHMMDB + "_Summary.txt" N_PPHMMs = LineCount(PPHMMDB_Summary) - 1 PPHMMQueryFile = HMMER_hmmscanDir + "/QProtSeqs.fasta" PPHMMScanOutFile = HMMER_hmmscanDir + "/PPHMMScanOut.txt" PPHMMSignatureTable = np.empty((0, N_PPHMMs)) PPHMMLocMiddleBestHitTable = np.empty((0, N_PPHMMs)) Seq_i = 0.0 for SeqIDList, TranslTable in zip(SeqIDLists, TranslTableList): GenBankSeqList = [] GenBankIDList = [] GenBankDescList = [] for SeqID in SeqIDList: GenBankRecord = Records_dict[SeqID] GenBankSeqList.append(GenBankRecord.seq) GenBankIDList.append(GenBankRecord.id) GenBankDescList.append(GenBankRecord.description) #sort lists by sequence/segment lengthes #--------------------------------------------------------------------- (GenBankSeqLenList, GenBankSeqList, GenBankIDList, GenBankDescList) = zip( *sorted(zip(map(len, map(str, GenBankSeqList)), GenBankSeqList, GenBankIDList, GenBankDescList), reverse=True)) GenBankSeq = "" for seq in GenBankSeqList: GenBankSeq = GenBankSeq + seq if len( GenBankSeq ) >= SeqLength_Cutoff: #limit the sequence by length; 0=include sequences of all lengths GenBankID = "/".join(GenBankIDList) GenBankDesc = "/".join(GenBankDescList) ProtSeq1 = SeqRecord(GenBankSeq[0:].translate(table=TranslTable), id=GenBankID + '_+1') ProtSeq2 = SeqRecord(GenBankSeq[1:].translate(table=TranslTable), id=GenBankID + '_+2') ProtSeq3 = SeqRecord(GenBankSeq[2:].translate(table=TranslTable), id=GenBankID + '_+3') ProtSeqC1 = SeqRecord( GenBankSeq.reverse_complement()[0:].translate( table=TranslTable), id=GenBankID + '_-1') ProtSeqC2 = SeqRecord( GenBankSeq.reverse_complement()[1:].translate( table=TranslTable), id=GenBankID + '_-2') ProtSeqC3 = SeqRecord( GenBankSeq.reverse_complement()[2:].translate( table=TranslTable), id=GenBankID + '_-3') ProtSeq6frames = ProtSeq1 + ProtSeq2 + ProtSeq3 + ProtSeqC1 + ProtSeqC2 + ProtSeqC3 ProtSeq6frames.id = GenBankID with open(PPHMMQueryFile, "w") as PPHMMQuery_txt: SeqIO.write(ProtSeq6frames, PPHMMQuery_txt, "fasta") p = subprocess.Popen( "hmmscan --cpu %s --noali --nobias --domtblout %s %s %s" % (HMMER_N_CPUs, PPHMMScanOutFile, HMMER_PPHMMDB, PPHMMQueryFile), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) out, err = p.communicate() PPHMMIDList = [] PPHMMScoreList = [] FeatureFrameBestHitList = [] FeatureLocFromBestHitList = [] FeatureLocToBestHitList = [] FeatureDescList = [] with open(PPHMMScanOutFile, "r") as PPHMMScanOut_txt: for Line in PPHMMScanOut_txt: if Line[0] != "#": Line = Line.split() Line[22] = " ".join( Line[22:] ) #Concatenate the cluster description back Line = Line[:23] C_EValue = float(Line[11]) HitScore = float(Line[7]) OriAASeqlen = float(len(GenBankSeq)) / 3 if C_EValue < HMMER_C_EValue_Cutoff and HitScore > HMMER_HitScore_Cutoff: #Determine the frame and the location of the hit 
#------------------------------------------------------ ID = int(Line[0].split('_')[-1]) HitFrom = int(Line[17]) HitTo = int(Line[18]) HitMid = float(HitFrom + HitTo) / 2 if np.ceil(HitMid / OriAASeqlen) <= 3: Frame = int(np.ceil(HitMid / OriAASeqlen)) else: Frame = int(-(np.ceil(HitMid / OriAASeqlen) - 3)) LocFrom = int(HitFrom % OriAASeqlen) if LocFrom == 0: #if the hit occurs preciously from the end of the sequence LocFrom = int(OriAASeqlen) LocTo = int(HitTo % OriAASeqlen) if LocTo == 0: #if the hit occurs preciously to the end of the sequence LocTo = int(OriAASeqlen) if LocTo < LocFrom: #The hit (falsely) spans across sequences of different frames if np.ceil(HitFrom / OriAASeqlen) <= 3: HitFrom_Frame = int( np.ceil(HitFrom / OriAASeqlen)) else: HitFrom_Frame = int( -(np.ceil(HitFrom / OriAASeqlen) - 3)) if np.ceil(HitTo / OriAASeqlen) <= 3: HitTo_Frame = int( np.ceil(HitTo / OriAASeqlen)) else: HitTo_Frame = int( -(np.ceil(HitTo / OriAASeqlen) - 3)) if Frame == HitFrom_Frame: LocTo = int(OriAASeqlen) elif Frame == HitTo_Frame: LocFrom = int(1) elif HitFrom_Frame != Frame and Frame != HitTo_Frame: LocFrom = int(1) LocTo = int(OriAASeqlen) else: print( "Something is wrong with the his location determination" ) raw_input("Press any key to continue") if ID not in PPHMMIDList: Best_C_EValue = C_EValue PPHMMIDList.append(ID) PPHMMScoreList.append(HitScore) FeatureDescList.append(Line[22].split('|')[0]) FeatureFrameBestHitList.append(Frame) FeatureLocFromBestHitList.append(LocFrom * 3) FeatureLocToBestHitList.append(LocTo * 3) else: if C_EValue < Best_C_EValue: Best_C_EValue = C_EValue FeatureFrameBestHitList[-1] = Frame FeatureLocFromBestHitList[-1] = LocFrom * 3 FeatureLocToBestHitList[-1] = LocTo * 3 FeatureLocMiddleBestHitList = np.zeros(N_PPHMMs) FeatureLocMiddleBestHitList[PPHMMIDList] = np.mean( np.array([FeatureLocFromBestHitList, FeatureLocToBestHitList]), axis=0 ) * ( np.array(FeatureFrameBestHitList) / abs(np.array(FeatureFrameBestHitList)) ) #Absolute coordinate with orientation info encoded into it: +ve if the gene is present on the (+)strand, otherwise -ve PPHMMLocMiddleBestHitTable = np.vstack( (PPHMMLocMiddleBestHitTable, FeatureLocMiddleBestHitList)) FeatureValueList = np.zeros(N_PPHMMs) FeatureValueList[PPHMMIDList] = PPHMMScoreList PPHMMSignatureTable = np.vstack( (PPHMMSignatureTable, FeatureValueList)) Seq_i = Seq_i + 1.0 #Progress bar sys.stdout.write( "\033[K" + "Generate PPHMM signature and location profiles: [%-20s] %d/%d profiles" % ('=' * int(Seq_i / N_Seq * 20), Seq_i, N_Seq) + "\r") sys.stdout.flush() sys.stdout.write("\033[K") sys.stdout.flush() return (PPHMMSignatureTable, PPHMMLocMiddleBestHitTable)
start of the region - the start base will be included and this is 1-indexed
    (i.e. use start=1 to start at the beginning of the contig)
end of the region - the end is included in the output
ploidy (This script obviously only makes sense if variants are phased.
    One sequence per genome copy will be generated)
path to the output file. This will be in fasta format.

An 8th argument can optionally be supplied and should specify the path to a
file containing a list of individuals to be included. If not supplied, all
individuals will be used.'''

print 'Alignment-from-vcf, written by Stephan Kamrad ([email protected])'

if len(sys.argv) not in [8, 9]:
    print helpstr
    sys.exit('Invalid arguments supplied')

ref_path = sys.argv[1]
print 'Reading the reference genome: ' + ref_path
ref = SeqIO.index(ref_path, "fasta")

vcf_path = sys.argv[2]
print 'Reading the vcf: ' + vcf_path
vcffile = VariantFile(vcf_path)

contig = sys.argv[3]
start = int(sys.argv[4])
end = int(sys.argv[5])
print 'Getting variants in region %s:%i-%u' % (contig, start, end)
variants = list(vcffile.fetch(contig, start - 1, end))
ref_seq = ref[contig].seq[start - 1:end]

if len(variants) == 0:
    raise Exception('No variants in specified region. Terminating.')
def prepare_for_ParaAT(joined_df_file_name, id_file, out_dir): ''' input 1:joined_df_file_name input 2:id_file output 1:out_dir ''' global coding_gene_base, protein_base, gene_fasta_fl, protein_fasta_fl gene_dir_path = directory_creater(out_dir / "nucleotide") protein_dir_path = directory_creater(out_dir / "aminoacid") fasta_base_dir_path = directory_creater(out_dir / "gene_protein_base") homolog_dir_path = directory_creater(out_dir / "homolog") protein_base_file_path = fasta_base_dir_path / "protein_base.fasta" coding_gene_base_file_path = fasta_base_dir_path / "coding_gene_base.fasta" if coding_gene_base_file_path.is_file() is False: coding_gene_base_list = [ 'cat', '/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/70-15_refference_genome/magnaporthe_oryzae_70-15_8_transcripts.fasta', "/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/wangzhe2/New_add_ina168/ina168_CDS.fasta" ] protein_base_list = [ 'cat', '/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/70-15_refference_genome/magnaporthe_oryzae_70-15_8_proteins_T0.fasta', "/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/wangzhe2/New_add_ina168/ina168_protein.fasta" ] merge_to_one(coding_gene_base_list, id_file, "_CDS.fasta", fasta_base_dir_path / "coding_gene_list.txt", coding_gene_base_file_path, fasta_base_dir_path / "coding_gene_cat_err.txt") merge_to_one(protein_base_list, id_file, "_protein.fasta", fasta_base_dir_path / "protein_list.txt", protein_base_file_path, fasta_base_dir_path / "protein_cat_err.txt") coding_gene_base = SeqIO.index(str(coding_gene_base_file_path), "fasta", key_function=get_id_protein) protein_base = SeqIO.index(str(protein_base_file_path), "fasta", key_function=get_id_protein) base = importr("base") utils = importr("utils") ortholog_joined_df = utils.read_table(joined_df_file_name, sep="\t", header=True, **{'stringsAsFactors': False}, **{'check.names': False}) ortholog_joined_df_sub = ortholog_joined_df.rx(True, -1) for i in range(1, (int(base.nrow(ortholog_joined_df)[0]) + 1)): df_row = ortholog_joined_df_sub.rx(i, True) df_row_iter = iter(df_row) head_list = next(df_row_iter)[0].split() if len(head_list) == 1: gene_fasta = gene_dir_path / (head_list[0] + ".fasta") protein_fasta = protein_dir_path / (head_list[0] + ".fasta") homolog_file_path = homolog_dir_path / (head_list[0] + ".txt") if gene_fasta.is_file() is True: continue with gene_fasta.open('w') as gene_fasta_fl: with protein_fasta.open('w') as protein_fasta_fl: with homolog_file_path.open('w') as homolog_fl: extract_gene(head_list[0]) homolog_fl.write(head_list[0] + "\t") for homolog_id in one_head(df_row_iter): homolog_fl.write(homolog_id + "\t") homolog_fl.write("\n") else: two_head(head_list, df_row_iter)
outfile = open(str(file.split('.')[0]) + '_clean.fa', 'w')

for line in infile:
    if '>' in line:
        names = line.split()
        for name in names:
            if name[0] == '>':
                outfile.write(name + '\n')
    else:
        outfile.write(line)

outfile.close()
infile.close()

# "parses" a fasta file, creating a dictionary-like object of sequences.
# Not everything is kept in memory; instead it just records where each record
# is within the file and parses on demand.
# The key in the dictionary is the ">" line in the fasta file.
record_dict = SeqIO.index(argv[1], "fasta")

min_pro_len = int(argv[2])  # minimum protein length, within typical VSG protein length, in a.a.
contig_outfile = open(argv[3], 'w')  # initializing contig output file, reference
ORF_outfile = open(argv[4], 'w')  # initializing open reading frame output file, what we like

noduplicate = []  # initialize array/list
orf_dict = {}  # initialize dictionary
ANNOT_INFILE = sys.argv[5] if CSV: OUTFILE = './reference.csv' OUTFILE = './reference.fasta' hya_annots_dict = {} max_length_dict = {} avg_cov_dict = dict(line.strip().split('\t') for line in file(AVG_INFILE)) perc_cov_dict = dict( ((line.strip().split('\t'))[0], (line.strip().split('\t'))[3]) for line in file(ZERO_INFILE)) blastn_matches_dict = dict( (line.strip().split('\t'))[0:2] for line in file(BLASTN_INFILE)) apalm_annotations_dict = dict( (line.strip().split('\t'))[0:3:2] for line in file(ANNOT_INFILE)) # gene_name : SeqRecord of gene seq record_dict = SeqIO.index(REF_INFILE, 'fasta') def annotation(): for transcript in blastn_matches_dict: # get name format of transcript model that will match apalm annotation dict apalm_model = '_'.join(blastn_matches_dict[transcript].split('_')[1:]) if apalm_model in apalm_annotations_dict: annotation = apalm_annotations_dict[apalm_model] hya_annots_dict[transcript] = annotation else: hya_annots_dict[transcript] = 'NONE' def max_cont_length(): # split sequence by continuous sequence in genes (i.e. without Ns)
def readGenome(fasta):
    genome_dict = SeqIO.index(fasta, "fasta")
    print(len(genome_dict))
    return genome_dict
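# --- Hedged aside (not from the original source) ---
# Example of using the dictionary returned by readGenome() above: records are
# only loaded when accessed, so slicing one contig does not read the whole
# genome into memory. "genome.fasta" and the contig name are hypothetical.
genome = readGenome("genome.fasta")
if "chr1" in genome:
    fragment = genome["chr1"].seq[0:50]   # first 50 bases of the contig
    print(fragment)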
align_format = "fasta" #could be clustal prot_align_file = full_path_to_file nuc_fasta_file = "$HOME/scratch/BUSCO_cegma/gene_models/all_nt/all_nt.fasta" name_out = "./back_translated/%s.DNA_backtranslated.fasta" % ( name[:-5]) nuc_align_file = name_out table = 1 try: table = int(table) except: stop_err("Bad table argument %r" % table) prot_align = AlignIO.read(prot_align_file, align_format, alphabet=generic_protein) nuc_dict = SeqIO.index(nuc_fasta_file, "fasta") nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-", table=table) AlignIO.write(nuc_align, nuc_align_file, align_format) ##if len(sys.argv) == 4: ## align_format, prot_align_file, nuc_fasta_file = sys.argv[1:] ## nuc_align_file = sys.stdout ## table = 0 ##elif len(sys.argv) == 5: ## align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:] ## table = 0 ##elif len(sys.argv) == 6: ## align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:]
def downsample(in_metadata, out_metadata, in_fasta, out_fasta, max_diff, outgroup_file, downsample_date_excluded, downsample_included, downsample_lineage_size): original_num_seqs = 0 sample_dict = {} var_dict = {} count_dict, num_samples = get_count_dict(in_metadata) most_frequent = get_by_frequency(count_dict, num_samples, band=[0.05, 1.0]) very_most_frequent = get_by_frequency(count_dict, num_samples, band=[0.5, 1.0]) lineage_dict = get_lineage_dict(in_metadata, downsample_lineage_size) outgroups = parse_outgroups(outgroup_file) indexed_fasta = SeqIO.index(in_fasta, "fasta") with open(in_metadata, 'r', newline = '') as csv_in, \ open(out_fasta, 'w', newline = '') as fa_out, \ open(out_metadata, 'w', newline = '') as csv_out: reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect="unix") writer = csv.DictWriter(csv_out, fieldnames=reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect="unix") writer.writeheader() for row in reader: fasta_header = row["sequence_name"] if fasta_header not in indexed_fasta: continue if original_num_seqs % 1000 == 0: now = datetime.datetime.now() print("%s Downsampled from %i seqs to %i seqs" % (str(now), original_num_seqs, len(sample_dict))) original_num_seqs += 1 if fasta_header in outgroups or not should_downsample_row( row, downsample_date_excluded, downsample_included, downsample_lineage_size, lineage_dict): if fasta_header in outgroups: row["why_excluded"] = "" writer.writerow(row) if row["why_excluded"] in [None, "None", "" ] and fasta_header in indexed_fasta: seq_rec = indexed_fasta[fasta_header] fa_out.write(">" + seq_rec.id + "\n") fa_out.write(str(seq_rec.seq) + "\n") else: print(row["why_excluded"], fasta_header, (fasta_header in indexed_fasta)) continue muts = row["nucleotide_variants"].split("|") if len(muts) < max_diff: #if not row["why_excluded"]: # row["why_excluded"] = "downsampled with diff threshold %i" %max_diff writer.writerow(row) continue found_close_seq = False samples = set() low_frequency_muts = [ mut for mut in muts if mut not in most_frequent ] if len(low_frequency_muts) == 0: low_frequency_muts = [ mut for mut in muts if mut not in very_most_frequent ] if len(low_frequency_muts) == 0: low_frequency_muts = muts if len(low_frequency_muts) > max_diff + 1: low_frequency_muts = low_frequency_muts[:max_diff + 1] for mut in low_frequency_muts: if mut in var_dict: samples.update(var_dict[mut]) if downsample_lineage_size: samples = list(samples & set(lineage_dict[row["lineage"]])) for sample in samples: if num_unique(muts, sample_dict[sample]) <= max_diff: found_close_seq = True #if not row["why_excluded"]: # row["why_excluded"] = "downsampled with diff threshold %i" %max_diff writer.writerow(row) break if not found_close_seq: sample_dict[fasta_header] = muts for mut in muts: if mut not in var_dict: var_dict[mut] = [fasta_header] else: var_dict[mut].append(fasta_header) row["why_excluded"] = "" writer.writerow(row) if fasta_header in indexed_fasta: seq_rec = indexed_fasta[fasta_header] fa_out.write(">" + seq_rec.id + "\n") fa_out.write(str(seq_rec.seq) + "\n") now = datetime.datetime.now() print("%s Downsampled from %i seqs to %i seqs" % (str(now), original_num_seqs, len(sample_dict))) # return sample_dict.keys() # def main(): # args = parse_args() # subsample = downsample(args.in_metadata, args.out_metadata, args.in_fasta, args.out_fasta, args.diff, args.outgroups, args.downsample_date_excluded, args.downsample_included, args.downsample_lineage_size) # if __name__ == '__main__': # main()
def write_hash_dict(in_fasta, out_fasta, out_metadata, outgroup_file, in_metadata=None): outgroups = parse_outgroups(outgroup_file) print(outgroups) records = SeqIO.index(in_fasta, "fasta") hash_dict = {} for record_id in records: seq = str(records[record_id].seq) if seq in hash_dict: hash_dict[seq] = hash_dict[seq] + [record_id] else: hash_dict[seq] = [record_id] print("Found %i unique fasta sequences" % len(hash_dict)) date_dict = {} if in_metadata is not None: with open(in_metadata, 'r', newline='') as csv_in: reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect="unix") for row in reader: if row["epi_week"] not in [None, "None", ""]: date_dict[row["sequence_name"]] = int(row["epi_week"]) print("Found", len(date_dict), "epi_week dates") with open(out_fasta, "w") as fasta, open(out_metadata, "w") as metadata: metadata.write("tip,redundant\n") for key, value in hash_dict.items(): if len(value) == 1: r = records[value[0]] fasta.write(">" + r.id + "\n") fasta.write(str(r.seq) + "\n") elif len(value) > 1: r = None for id in value: if id in outgroups: r = records[id] fasta.write(">" + r.id + "\n") fasta.write(str(r.seq) + "\n") value.remove(id) if not r: index = 0 date = 0 if in_metadata is not None: for i, id in enumerate(value): if id in date_dict and date_dict[id] > date: index = i date = date_dict[id] r = records[value[index]] fasta.write(">" + r.id + "\n") fasta.write(str(r.seq) + "\n") value.remove(value[index]) metadata.write(r.id + ",") metadata.write("|".join(value) + "\n")
"WARNING! BigWig file does not allow overlapped items. A wiggle file was generated instead.\n" ) new_output_format = "wiggle" if output_format != "wiggle" and output_file is None: sys.stderr.write( "WARNING! An output filename is needed to save output as {}. " "The result is shown below:\n".format(output_format)) error.append( "WARNING! An output filename is needed to save output as {}. " "The result is shown above.\n".format(output_format)) new_output_format = "wiggle" output_format = new_output_format records = SeqIO.index(input_file, "fasta") records_num = len(records) write_content = generate_write_content() if records_num < 1: # No sequence in fasta file, corrupted sys.stdout.write( "WARNING! {} contains no sequence data.\n".format(input_file)) raise TypeError elif records_num == 1 or one_file: result = open_results_file() # one sequence in fasta file or one output file for all sequences for record in SeqIO.parse(input_file, "fasta"): write_title() generate_result() result.close() else:
"""Coronavirus Variant analysis""" import pkg_resources, os, pandas from Bio import SeqIO from Bio.Data.CodonTable import unambiguous_dna_by_id as codon_table # path to data files internally used by cova DATAPATH = pkg_resources.resource_filename('cova', 'data') # feature table FEATURETABLE = pandas.read_csv(os.path.join(DATAPATH, 'feature_table_modified.txt'), sep='\t', index_col=0) # proteins' sequence data PSEQS = SeqIO.index(os.path.join(DATAPATH, 'protein_extended.faa'), 'fasta') # CDS' sequence data CDS = SeqIO.index(os.path.join(DATAPATH, 'cds_extended.fna'), 'fasta') # reference genome sequence data GENOME = SeqIO.read(handle=os.path.join(DATAPATH, 'genome.fna'), format='fasta') # reference accession REF = GENOME.id.split('.')[0] # codon table CODONTABLE = codon_table[1] # proteins affected by ribosomal slippage RFS = pandas.read_csv(os.path.join(DATAPATH, 'ribosomal_slippage.csv'), index_col=0) # list of input/output file and directory names used and generated by CoVa with open(os.path.join(DATAPATH, 'cova_io_files.txt')) as flob: IOF = [i.strip('\n') for i in flob] # data for sequence typing
    elif miss_cleavage == 2:
        for j in range(0, len(cut_sites) - 3):
            peptides.append(proseq[cut_sites[j]:cut_sites[j + 1]])
            peptides.append(proseq[cut_sites[j]:cut_sites[j + 2]])
            peptides.append(proseq[cut_sites[j]:cut_sites[j + 3]])
        peptides.append(proseq[cut_sites[-3]:cut_sites[-2]])
        peptides.append(proseq[cut_sites[-3]:cut_sites[-1]])
        peptides.append(proseq[cut_sites[-2]:cut_sites[-1]])
    return peptides


handle1 = SeqIO.parse(sys.argv[1], 'fasta')  # All_COSMIC_Genes.fasta
handle2 = SeqIO.index(sys.argv[2], 'fasta')  # Cosmic_mutant_proteins.fasta
output = open(sys.argv[3], 'w')

peptidome = {}
for record in handle1:
    cds_seq = record.seq
    aa_seq = cds_seq.translate(to_stop=True, stop_symbol="")
    peptide_list = TRYPSIN(str(aa_seq), 0)
    for peptide in peptide_list:
        if 6 <= len(peptide) <= 40:
            # I and L are indistinguishable by mass, so collapse them before the
            # duplicate check; test membership on the same key that is stored.
            key = peptide.replace("I", "L")
            if key not in peptidome:
                peptidome[key] = 1

print(len(peptidome))
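# Toy check of the digestion behaviour (illustration only; TRYPSIN's cleavage
# rules are defined earlier in this script): miss_cleavage=0 returns fully
# cleaved peptides, while miss_cleavage=2 additionally returns peptides spanning
# up to two skipped cleavage sites, as in the branch shown above.
toy_protein = "MKWVTFISLLFLFSSAYSRGVFRRDAHK"
print(TRYPSIN(toy_protein, 0))
print(TRYPSIN(toy_protein, 2))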
from sys import argv
import re

from Bio import SeqIO

script, filename1, filename2, filename3 = argv

p = open(filename1)  # HMMTOP output file
record_dict = SeqIO.index(filename2, "fasta")  # FASTA file
outfile = open(filename3, "w+")  # Filename for extracted sequences


# keep the entire sequence if it has 3 < TM < 10
def seqextract(file):
    for line in file:
        line = line.strip()
        match = re.search(r'>HP:\s+\d+\s+(.*?)\s.*?[IN|OUT]\s+(\d+)', line)
        if match and 3 < int(match.group(2)) < 10:
            protein_id = match.group(1)
            print(protein_id + " " + match.group(2))
            current = record_dict[protein_id]
            SeqIO.write(current, outfile, "fasta")


def TMseqextract(file):
    for line in file:
        ids = []
        line = line.strip()
def codons(readfile, hmmerfile, gene): """ """ dblength = dblengths[gene] threshold = thresholds[gene] coords = _load_hxb2(gene) reads = SeqIO.index(readfile, "fasta") hmmer = SearchIO.read(hmmerfile, "hmmer3-text") counts = dict( (hxb2, {}) for hxb2 in range(coords.hxb2.iloc[0], coords.hxb2.iloc[-1] + 1, 3)) for hit in hmmer.hits: # Skip hits that contain stop codons (indicates wrong frame) if "*" in chain(hsp.aln[1].seq for hsp in hit.hsps): continue id, _, frame = hit.id.rpartition("-") count = int(id.partition("-")[2]) if frame.endswith("'"): seq = str(reads[id].seq.reverse_complement()) offset = int(frame[:-1]) else: seq = str(reads[id].seq) offset = int(frame) for hsp in hit.hsps: if math.log( (dblength * hsp.hit_span) / 2**hsp.bitscore) >= threshold: continue i = 0 # tracks position in the alignment (0-indexed) hmm = hsp.query_start + 1 # tracks position in the HMM reference sequence (1-indexed) read = 3 * hsp.hit_start + offset # tracks position in the read sequence (0-indexed) # read/reference sequences should have same length in alignment n = len(hsp.aln[0].seq) assert len(hsp.aln[1].seq) == n # alignment should not start or end with an insertion assert hsp.aln[0].seq[0] != "." assert hsp.aln[0].seq[n - 1] != "." while i < n: aa_frame = [] codon_frame = [] hxb2 = coords.loc[hmm, "hxb2"] aa = hsp.aln[1].seq[i] if aa != "-": aa_frame.append(aa) codon_frame.append(seq[read:read + 3]) read += 3 # Extend frame with insertions relative to HMM while i < (n - 1) and hsp.aln[0].seq[i + 1] == ".": aa_frame.append(hsp.aln[1].seq[i + 1]) codon_frame.append(seq[read:read + 3]) assert aa_frame[-1] != "-" read += 3 i += 1 # Extend frame with deletions relative to HXB2 for _ in range(coords.loc[hmm, "del"]): if i < (n - 1): aa = hsp.aln[1].seq[i + 1] if aa != "-": aa_frame.append(aa) codon_frame.append(seq[read:read + 3]) read += 3 i += 1 hmm += 1 while i < (n - 1) and hsp.aln[0].seq[i + 1] == ".": aa_frame.append(hsp.aln[1].seq[i + 1]) codon_frame.append(seq[read:read + 3]) assert aa_frame[-1] != "-" read += 3 i += 1 # Assign codons, adjusting coordinates relative to HXB2 insertions assert len(aa_frame) == len(codon_frame) for _ in range(coords.loc[hmm, "ins"] + 1): # Iterate through the codon frame if codon_frame: codon = codon_frame.pop(0) aa = aa_frame.pop(0) if aa != 'X' and 'N' not in codon: assert str(Seq.translate(codon)) == aa.upper() counts[hxb2][codon] = counts[hxb2].get(codon, 0) + count # Deletions occur when the codon frame is empty else: counts[hxb2][""] = counts[hxb2].get("", 0) + count hxb2 += 3 i += 1 hmm += 1 # output lines = [] for hxb2 in sorted(counts): for codon in sorted(counts[hxb2], key=counts[hxb2].get, reverse=True): lines.append("\t".join( [str(hxb2), codon, str(counts[hxb2][codon])])) return lines
if len(list_traits) > 0: print('\nGenomic traits being scanned = ' + str(len(list_traits)) + ':\n') for t in list_traits: print('\t- ' + t) else: print('No genomic annotation was found. Stopping mutation detection...\n') sys.exit() """ CODONS """ # Read original sequence file, and create list of codons dfCodons = pd.DataFrame(columns=['genomes'] + list_traits) # search for insertion in reference genome seq_index = SeqIO.index(alignment, 'fasta') ref_seq = seq_index[ref_genome].seq insert_pos = [pos for pos, nuc in enumerate(ref_seq, 1) if nuc == '-'] insertions = {} if len(insert_pos) > 0: print('\nInsertions detected in the following positions in \"' + ref_genome + '\":\n') cur_idx = 0 cur_ins = 0 lag = 0 print(str(insert_pos)) for i in insert_pos: scanned = [site for list_sites in insertions.values() for site in list_sites] if i > cur_ins + 1: # not consecutive lag = len(scanned)
from Bio import SeqIO

# Random access into a FASTA file: index it once, then slice one record's sequence.
(file, id, start, end) = ("secondround_merged_expanded.fasta", "C7136661:0-107", 1, 10)
record_dict = SeqIO.index(file, "fasta")
print(record_dict[id].seq[start:end])
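# The index above is lazy: SeqIO.index only records file offsets and parses a
# record when it is looked up, so it scales to FASTA files too large for memory
# (SeqIO.to_dict would load every record up front). For repeated runs the
# offsets can be persisted with SeqIO.index_db; the .idx file name below is
# illustrative.
record_db = SeqIO.index_db("secondround_merged_expanded.idx",
                           "secondround_merged_expanded.fasta", "fasta")
print(record_db[id].seq[start:end])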
def writeCDNAresults(self, target, target_folder, outf, contigf): """ This is ONLY called when a cDNA target is finished. When doing a cDNA type run, it is very useful to have both the following: 1) All contigs that belong to a gene (isogroup) - It would be particularly good to re-orient these if they are in RC. 2) Total number of reads assembled in each gene (isogroup) Additionally it would be excellent to some day also get the following: 3) Transcript (isotig) structure 4) Estimate of isotig specific reads. """ if self.params['assembler'] == 'newbler': contigf = os.path.join(self.params['working_dir'], target_folder, "assembly", "assembly", "454AllContigs.fna") isotigsf = os.path.join(self.params['working_dir'], target_folder, "assembly", "assembly", "454IsotigsLayout.txt") readstatusf = os.path.join(self.params['working_dir'], target_folder, "assembly", "assembly", "454ReadStatus.txt") else: logger.info( "WARNING writeCDNAresults called when assembler was not Newbler" ) return None if not (os.path.exists(contigf) and os.path.exists(isotigsf) and os.path.exists(readstatusf)): logger.info("CDNA WARNING MISSING FILE!! %s %s" % (target, self.params['sample'])) logger.info(contigf, os.path.exists(contigf)) logger.info(isotigsf, os.path.exists(isotigsf)) logger.info(readstatusf, os.path.exists(readstatusf)) return None #Storage data structures: isogroups = { } # A dict of isogroups which each contain an in-order list of contigs readcounts = Counter( ) # A dict of all contigs, these contain read counts (from ReadStatus) contig_orientation = {} contig_to_isogroup = {} contig_idx = SeqIO.index(contigf, "fasta") # Parse isotigsf: igroup = "" #print self.params['sample'], target, "Parsing isotigsf: %s" % isotigsf for l in open(isotigsf, 'r'): #Handle lines with only a '\n' if l == '\n': pass #Handle lines for isogroup: elif l[0:9] == '>isogroup': igroup = l.strip().split()[0].strip(">") #Handle lines containing all contigs: elif l.strip().split()[0] == 'Contig': l2 = l.strip().split() contigs = map(lambda x: "contig" + x, l2[2:-1]) isogroups[igroup] = contigs for contig in contigs: if contig not in contig_orientation: contig_orientation[contig] = '+' contig_to_isogroup[contig] = igroup else: raise exceptions.FatalError( 'Contig %s in %s more than once' % (contig, contigf)) #Handle lines containing contig orientation info: elif l[0:6] == 'isotig': l2 = l[l.find(" ") + 1:l.rfind(" ") - 1] l3 = [l2[i:i + 6] for i in range(0, len(l2), 6)] for i in range(len(l3)): if l3[i][0] == '<': # contig is in reverse orientation contig = isogroups[igroup][i] contig_orientation[contig] = '-' #print self.params['sample'], target, "Parsed isotigsf, contigs:", len(isogroups), "contig_to_isogroup", len(contig_to_isogroup), "contig_orientation", len(contig_orientation) #Now parse readstatus: inf = open(readstatusf, 'r') inf.readline() # discard first line for l in inf: l2 = l.strip().split('\t') #Determine if this read was assembled if len(l2) == 8: contig = l2[2] # Note that there are some built in limits to the number of contigs that can be in an isogroup: # http://contig.wordpress.com/2010/08/31/running-newbler-de-novo-transcriptome-assembly-i/ # These won't appear in the IsotigsLayout.txt, but ARE in the ReadStatus.txt file. 
if contig in contig_to_isogroup: readcounts[contig_to_isogroup[contig]] += 1 else: readcounts['ExceedsThreshold'] += 1 #print self.params['sample'], target, "Parse read status" #Finally, output all of this information appropriately: countsf = open( os.path.join(self.params['finished_dir'], "isogroup_read_counts.tsv"), 'a') sample = self.params['sample'] #First write out readcounts: sample \t target \t isogroup \t readcount for isogroup in readcounts: countsf.write('\t'.join( [sample, target, isogroup, str(readcounts[isogroup])]) + '\n') countsf.close() #print self.params['sample'], target, "Wrote readcounts" #Next write the contigs in proper order and orientation: ncontigs = 0 nisogroups = 0 for isogroup in isogroups: nisogroups += 1 for contig in isogroups[isogroup]: ncontigs += 1 seqrec = contig_idx[contig] #print self.params['sample'], target, seqrec if contig_orientation[contig] == '-': seqrec.seq = seqrec.seq.reverse_complement() #print self.params['sample'], target, seqrec seqrec.name = seqrec.id = sample + "_:_" + target + "_:_" + isogroup + "|" + contig #print self.params['sample'], target, seqrec SeqIO.write(seqrec, outf, "fasta") ## TODO: add support for the ExceedsThreshold contigs logger.info( "Sample: %s target: %s iteration: %s Finished writing %s contigs, %s isogroups " % (self.params['sample'], target, self.params['iteration'], ncontigs, nisogroups))
proteinID=row[2], PSMcount=int(row[3]), ratio=row[4], error=row[5], cluster=row[6]) peptide.length = len(peptide.seq) if gene not in gene_peparray: gene_peparray[gene] = [peptide] else: gene_peparray[gene].append(peptide) print 'there are', len(gene_peparray), 'genes in total' input_file.close() print 'mapping.....' record_dict = SeqIO.index('varseq.fa', 'fasta') handle = open('splicingvar.txt') gene_variant = {} variant_dic = {} variant_exon = {} for line in handle: row = line.strip().split("\t") exon = EXON(gene=row[1], chr="chr" + row[2], strand=row[3], variant=row[4], number=row[5], start=int(row[6]), end=int(row[7]), trans_start=int(row[8]),
if higher == lower: higher = lower + 1 #print('higher:'+str(higher)+',lower:'+str(lower)); if higher < 100: val = posweight[lower] * (nfrac - lower) + posweight[higher] * ( higher - nfrac) else: val = posweight[99] kweight += val rlenweight.append(kweight) bedfile = sys.argv[-2] reffile = sys.argv[-1] #ofastafile=sys.argv[-1]; # build reference seqref = SeqIO.index(reffile, 'fasta') refkeys = list(seqref.keys()) # read bed file, and ready for writing if bedfile != "-": fid = open(bedfile) else: fid = sys.stdin #ofid=open(ofastafile,'w'); ofid = sys.stdout nlines = 0 prevchr = '' previndex = ''
for u, v, d in g.edges_iter(data=True): if d['weight'] <= args.weight: g.remove_edge(u, v) for u in g.nodes(): if g.degree(u) == 0: g.remove_node(u) print 'Removed edges of {0} weight or less: {1}/{2}'.format( args.weight, g.order(), g.size()) if args.enzyme: RESTRICTION_BATCH = RestrictionBatch([args.enzyme]) ENZYME = RestrictionBatch.get(RESTRICTION_BATCH, args.enzyme, add=False) seqidx = SeqIO.index(args.fasta, 'fasta') attrib = {} for si in seqidx: if g.has_node(si): attr = {'length': len(seqidx[si])} if args.enzyme: seq_sites = count_sites(seqidx[si]) seq_sites = seq_sites if seq_sites > 0 else 1 attr['site'] = seq_sites with open(args.cover, 'r') as cov_h: for l in cov_h: if l.startswith('#ID'): continue
def strip_introns(fasta, verb=None, test=False, min_intron_len=35, max_intron_len=10000, multi_species=False, peptide=''): # want the chrom (refers to coordinates) intron_file = '{}_introns_1.FASTA'.format(fasta[:-6]) p_head = '' if peptide != '': peptide_dict = SeqIO.index(peptide, "fasta", key_function=get_pep_id) p_head = ' pep' headline = '# id chr beg end str n/m len gc ambig?{} seq\n'.format(p_head) enough_introns = False don_motif = {} acc_motif = {} dinuc_motif = {} dinuc_dist = {} with open(fasta) as handle: o = open(intron_file, 'w') o.write(headline) example = 0 don = {} acc = {} dinuc = {} for seq_record in SeqIO.FastaIO.FastaIterator(handle, title2ids=get_exon_id): if verb: print("Seq Record: " + seq_record.name) chrom = re.match('.+chr_name1="([^"]+)"', seq_record.description).group(1) if 'scaffold' in chrom: if verb: print('Scaffolding skipped!') continue exon_positions = {} pos = ['beg', 'end'] r = seq_record.name.split('|') for i in range(2): exon_positions[pos[i]] = [int(x) for x in r[i].split(';')] strand = int( re.match('.+gene_chrom_strand="([^"]+)"', seq_record.description).group(1)) species = re.match('.+organism_name="([^"]+)"', seq_record.description).group(1) if verb: print('strand: ', strand) start = int( re.match('.+transcript_chrom_start="([^"]+)"', seq_record.description).group(1)) intron_count = len(exon_positions['beg']) - 1 # Is this right? if verb: print('Exons:') for i in range(0, intron_count + 1): print('{} - b: {} e: {}'.format(i + 1, exon_positions['beg'][i], exon_positions['end'][i])) # print ('There should be {} introns.'.format(intron_count)) intron_positions = {'beg': [], 'end': []} if verb: print('Introns: ') for i in range(1, intron_count + 1): # Strand represented by 1 or -1 # if strand > 0: intron_positions['beg'].append(exon_positions['end'][i - 1] + 1) intron_positions['end'].append(exon_positions['beg'][i] - 1) # else: # intron_positions['beg'].append(exon_positions['end'][i] + 1) # intron_positions['end'].append(exon_positions['beg'][i-1]-1) if verb: for i in range(0, intron_count): print('{} - b: {} e: {}'.format( i + 1, intron_positions['beg'][i], intron_positions['end'][i])) # return intron_positions # Is this all I want? Won't work with # per transcript loop introns = [] for i in range(0, intron_count): # intron = '' if strand > 0: intron = seq_record.seq[intron_positions['beg'][i] - start:intron_positions['end'][i] - start] else: intron = seq_record.seq[intron_positions['beg'][i] - start:intron_positions['end'][i] - start] # intron = seq_record.seq[intron_positions['end'][i] - # start:intron_positions['beg'][i] - # start] intron = intron.reverse_complement() introns.append(intron) if verb: print('The introns of {} are '.format(seq_record.id)) for x in introns: print(str(x)) # Gather further info for output strand = int( re.match('.+gene_chrom_strand="([^"]+)"', seq_record.description).group(1)) if strand > 0: strand_sym = '+' else: strand_sym = '-' # Output s = 1 if species not in don: don[species] = [] acc[species] = [] dinuc[species] = [] dinuc_motif[species] = [] for x in introns: # If intron is not anomalous... if not (len(x) > max_intron_len or len(x) < min_intron_len): # Setting up donor and acceptor tables # upper is good??? 
don[species].append(x.upper()[:don_len]) acc[species].append(x.upper()[-acc_len:]) dinuc[species].extend(dinucs(x)) beg = intron_positions['beg'][s - 1] end = intron_positions['end'][s - 1] l = abs(end - beg) intron_set = '{}/{}'.format(s, intron_count) order = [ seq_record.id, species, chrom, str(beg), str(end), strand_sym, intron_set, str(l) ] order.extend(analyze_intron(x)) if peptide != '': pep_id = get_pep_id(seq_record.description) order.append(str(len(peptide_dict[pep_id]))) order.append(str(x)) o.write('\t'.join(order) + '\n') s += 1 example += 1 if example > 4 and test: break # delete output file if not enough_introns? o.close() for species in don: don_motif[species] = motifs.create(don[species]) acc_motif[species] = motifs.create(acc[species]) # dinuc_motif[species] = motifs.create(dinuc[species]) dinuc_dist[species] = {} for di in dinuc[species]: try: dinuc_dist[species][di] += 1 except KeyError: dinuc_dist[species][di] = 1 with open(intron_file) as out1: intron_file_2 = '{}_introns_2.FASTA'.format(fasta[:-6]) out2 = open(intron_file_2, 'w') if peptide != '': headline = '# id chr beg end str n/m len gc' +\ ' ambig? pep don acc 2mer seq\n' else: headline = '# id chr beg end str n/m len gc' + \ ' ambig? don acc 2mer seq\n' out2.write(headline) lines = out1.readlines() good_ones = 0 for line in lines[1:]: intron = line.split()[-1] if len(intron) > max_intron_len or len(intron) < min_intron_len: continue species = line.split()[1] good_ones += 1 d = score_site(Seq(intron[:don_len], don_motif[species].alphabet), don_motif[species]) a = score_site(Seq(intron[-acc_len:], acc_motif[species].alphabet), acc_motif[species]) di_score = score_dinucleotides(intron, dinuc_dist[species]) order = ('\t'.join(line.split()[:-1]), d, a, di_score, intron) out2.write('\t'.join(order) + '\n') out2.close() if len(lines) == 0: print('Requires Python 3 for additional processing') else: print('Processed {} good introns out of {}'.format( good_ones, len(lines) - 1))
for i in SeqIO.parse(inputfile, fmt): if len(i.seq) > 50000: seqlist = list(chunkstring(i.seq, 50000)) c = -1 for t in seqlist: c = c + 1 print i.id, c + 1 f2.write(">" + i.id + "-part-" + str(c + 1) + "\n" + str(t) + "\n") else: f2.write(">" + i.id + "\n" + str(i.seq) + "\n") f2.close() inputfile2 = open('/OSM/HOME-MEL/all29c/scripts/tmp2/inp%s.fa' % (r2), 'r') count = SeqIO.index('/OSM/HOME-MEL/all29c/scripts/tmp2/inp%s.fa' % (r2), fmt) c1 = len(count) if threads > c1: chunksize = c1 threads = c1 else: chunksize = int(c1 / threads) + 1 seqs = [] print "threads=", threads print "input file=", c1 print "Chunksize =", chunksize for i in SeqIO.parse(inputfile2, fmt):
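# chunkstring is called above but not defined in this fragment; a minimal
# generator consistent with that usage (successive fixed-length slices of a
# sequence, an assumption on my part) is sketched here as a standalone helper.
# In the original script it would need to be defined before the first loop.
def chunkstring(seq, length):
    for start in range(0, len(seq), length):
        yield seq[start:start + length]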
def pull_records(emp_fasta, parsing_list): """ Function to write sequences to new fasta file based on the blast coordinates retrieved for those sequences. """ print( "\n---------------------------------------------------------------------------\n" ) print("Writing sequences with updated coordinates...\n") #load the fasta file using the indexing function #will create a dictionary with accn as keys record_dict = SeqIO.index(emp_fasta, "fasta") #create set of accession numbers from empirical fasta emp_accs = set(list(record_dict.keys())) #initiate empty set to populate with accession for seqs extracted extracted_accs = set() #initiate output files autoname = "{}_extracted.fasta".format(emp_fasta.split('.')[0]) logname = "Log_File_{}.txt".format(emp_fasta.split('.')[0]) badname = "Log_BadSeqs_{}.fasta".format(emp_fasta.split('.')[0]) with open(logname, 'a') as fh_log: fh_log.write( "Accn\tOriginal_Length\tRetained_Length\tCoordinates_Used\n") with open(autoname, 'a') as fh_out: #initiate count for records sliced and written count = int(0) #iterate over list with accns and coordinates for item in parsing_list: extracted_accs.add(item[0]) #test if number of accns processed divisible by 100, if so print number #ie an on-screen progress report if count != int(0) and count % int(100) == 0: print("\t\tFinished writing {:,} updated records...".format( count)) #look up the sequence using the accn number as the dictionary key fullseq = record_dict[item[0]].seq seq_parts = [] #iterate over parsing list where coordinate lists would start (index 1) #if only one list, fine, but if multiple we need to examine them to #pull out the appropriate parts of the sequence of interest for coord in item[1:]: #trim sequence if necessary, convert seq object to string #which allows for concatenation later (if multiple intervals present) seqslice = str(fullseq[coord[0]:(coord[1] + int(1))]) seq_parts.append(seqslice) #if there are multiple sequence sections, join them here #if only one item in list will not throw an error newseq = "".join(seq_parts) #write information to log file with open(logname, 'a') as fh_log: fh_log.write("{0}\t{1}\t{2}\t{3}\n".format( item[0], len(fullseq), len(newseq), item[1:])) #write to updated fasta file fh_out.write(">{}\n{}\n".format(record_dict[item[0]].description, newseq)) #add to counter count += 1 #write file of sequences that failed badseqs = emp_accs - extracted_accs if len(badseqs) >= 1: with open(badname, 'a') as fh_out: for acc in badseqs: fh_out.write(">{}\n{}\n".format(record_dict[acc].description, (record_dict[acc].seq))) print("\nWrote a total of {0:,} sequences to {1}.".format(count, autoname)) if len(badseqs) >= 1: print( "{0:,} starting sequence(s) did not pass similarity filtering and are written to {1}.\n\n" .format(len(badseqs), badname)) elif len(badseqs) == int(0): print("All starting sequences passed similarity filtering!\n\n")
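# Hypothetical call to pull_records. Each parsing_list entry is expected to be
# [accession, (start, stop), ...] with 0-based starts and inclusive stops, to
# match the slicing fullseq[coord[0]:(coord[1] + 1)] above; the file name,
# accessions and coordinates here are made up.
example_parsing_list = [
    ["seq1_acc", (0, 499)],               # keep the first 500 bases
    ["seq2_acc", (10, 199), (300, 449)],  # join two trimmed intervals
]
pull_records("empirical_seqs.fasta", example_parsing_list)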