def read_write_and_compare(self, source_file, source_format, out_file,
                           out_format, **kwargs):
    """Round-trip a QueryResult through a file and assert it is unchanged.

    The result parsed from ``source_file`` is written to ``out_file`` in
    ``out_format``, parsed back, and compared with the original through
    ``compare_search_obj``. ``kwargs`` are forwarded to every SearchIO call.
    """
    original = SearchIO.read(source_file, source_format, **kwargs)
    SearchIO.write(original, out_file, out_format, **kwargs)
    round_tripped = SearchIO.read(out_file, out_format, **kwargs)
    matches = compare_search_obj(original, round_tripped)
    self.assertTrue(matches)
def fetch_identifier(cls, sequence=None, filename=None):
    """
    Resolve a protein sequence and/or a BLAST XML file to the first usable entry.

    With a sequence, a remote ``blastp`` search against ``nr`` is run; when a
    filename is also given the raw XML is saved there and parsed from disk.
    With only a filename, that existing XML file is parsed.

    :param sequence: a protein sequence, or an object exposing it as ``.seq``
    :param filename: the XML filename.
    :return: the first truthy result of ``cls._try_identifer``, or ``None``
        when no identifier yields one
    """
    assert sequence or filename, 'You need to specify at least a sequence or a filename'
    # Obtain the parsed BLAST result.
    if not sequence:
        xml_result = SearchIO.read(filename, 'blast-xml')
    else:
        query = sequence.seq if hasattr(sequence, 'seq') else sequence
        result_handle = NCBIWWW.qblast("blastp", "nr", query)
        if not filename:
            xml_result = SearchIO.read(result_handle, 'blast-xml')
        else:
            with open(filename, 'w') as w:
                w.write(result_handle.read())
            xml_result = SearchIO.read(filename, 'blast-xml')
    # Try each identifier in turn and stop at the first one that resolves.
    identifiers = cls._get_identifers(xml_result)
    cls.debugprint('There are {} identifiers: {}'.format(
        len(identifiers), identifiers))
    for identifier in identifiers:
        entry = cls._try_identifer(identifier)
        if not entry:
            continue
        cls.debugprint('This identifier {} has a hit'.format(identifier))
        cls._fetch_protein(identifier)
        return entry
def get_srch_file_info(search_result_path):
    """Identify the program, version and SearchIO format of a search result.

    The file is scanned line by line for a signature of one of the supported
    formats; it is then parsed once with SearchIO using that format to read
    the program name and version.

    Args:
        search_result_path: Path to a BLAST XML, HMMER3 tabular or HMMER3
            text output file.

    Returns:
        A list ``[program, version, format]`` where ``format`` is the
        SearchIO format string.

    Raises:
        AssertionError: If the format, the program name or the program
            version cannot be determined.
    """
    # Each pattern only matches a line that is specific to one format. See
    # the SearchIO documentation here:
    # http://biopython.org/DIST/docs/api/Bio.SearchIO-module.html
    fmt_expr_dict = {
        'blast-xml': [re.compile(r'^<!DOCTYPE BlastOutput')],
        'hmmer3-tab': [re.compile(r'^# Option settings: hmmsearch --tblout')],
        'hmmer3-text': [re.compile(r'^# HMMER ')],
    }

    # Stop scanning at the first line that matches any known signature.
    ident_fmt = None
    with open(search_result_path) as srh:
        for line in srh:
            ident_fmt = next(
                (fmt for fmt, patterns in fmt_expr_dict.items()
                 if any(p.search(line) is not None for p in patterns)),
                None)
            if ident_fmt is not None:
                break

    # Raised explicitly (rather than via ``assert``) so the checks are not
    # stripped when Python runs with -O; the exception type is unchanged.
    if ident_fmt is None:
        raise AssertionError(
            "Error: Could not identify format for search result file: %s"
            % search_result_path)

    # Parse the file once and read both attributes from the same result.
    with open(search_result_path) as srh:
        qresult = SearchIO.read(srh, ident_fmt)

    ident_prog = qresult.program
    if ident_prog is None:
        raise AssertionError(
            "Could not identify program name from file format type.")

    ident_vers = qresult.version
    if ident_vers is None:
        raise AssertionError(
            "Error: Could not identify version of program used to generate "
            "search result file: %s" % search_result_path)

    # Return a list defining the input file format.
    return [ident_prog, ident_vers, ident_fmt]
def append_known_inversions(ir):
    """BLAST an inverted-repeat sequence against the SWITCH database.

    Writes ``ir`` (upper-cased) to ``ir.fasta``, runs ``blastn`` into
    ``ir.xml`` and summarises the first HSP of the first hit.

    Returns:
        ``'<blast_id>: <hit_range> eval:<evalue> <strand note>'`` for the top
        hit, or ``'None detected'`` when the search produced no hits.
    """
    record = SeqRecord(seq=Seq(ir, generic_nucleotide).upper(), id='IRSEQ')
    with open('ir.fasta', 'w') as o:
        SeqIO.write(record, o, 'fasta')
    blast('ir.fasta', dbname='SWITCH', out_xml='ir.xml', prog='blastn')
    blast_result = SearchIO.read('ir.xml', 'blast-xml')

    if not blast_result.hits:
        return 'None detected'

    top_hit = blast_result.hits[0]
    top_hsp = top_hit.hsps[0]
    # Only a hit on the minus strand gets an annotation.
    strand_note = '(reverse strand)' if top_hsp.hit_strand == -1 else ''
    return '{0}: {1} eval:{3} {2}'.format(
        top_hit.blast_id, top_hsp.hit_range, strand_note, top_hsp.evalue)
def makeDb(self):
    """Rebuild the per-result sequence databases from stored BLAST XML output.

    Any existing directory at ``<newDb>/<dbName>`` (resolved through
    ``self.resourcePath``) is removed first. Every file found while walking
    ``self.filesPath`` is parsed as BLAST XML, its sequences are saved under
    a folder named after the file, and ``self.sc.makeBlastDb`` is called
    repeatedly until ``self.testDbFails`` stops reporting a failure. Finally
    the ``/Test/<dbName>`` resource directory is removed.
    """
    #self.sc.createFolder(self.intermediateDb)
    newAmbDb = self.resourcePath("/" + self.newDb + "/" + self.dbName)
    # Start from a clean slate: drop the previous database directory.
    if os.path.exists(newAmbDb) and os.path.isdir(newAmbDb):
        shutil.rmtree(newAmbDb)
    for bases, dirs, files in os.walk(self.filesPath):
        # If the ambiguous database is not created correctly, consider
        # uncommenting this; it used to create an extra, unused folder for
        # the test.
        # newFolderPath = self.newDb + "/" + bases
        #self.sc.createFolder(newFolderPath)
        #print ("creates the directory " + newFolderPath)
        for file in files:
            # print(file)
            # For each output file produced by the search, generate a new
            # FASTA sequence for each of the results obtained.
            outputName = bases + "/" + file
            blast_qresult = SearchIO.read(outputName, "blast-xml")
            sequences = self.getSequencesFromBlastResult(blast_qresult)
            sequencePath = self.resourcePath("/" + self.newDb + "/" + self.dbName + "/" + file)
            self.sc.createFolder(sequencePath)
            self.sc.saveSequencesInFile(sequencePath, sequences, file)
            db = sequencePath
            self.sc.setOutputFile(file)
            # Retry so that a badly generated database is caught here rather
            # than surfacing as an error in later stages.
            # NOTE(review): loops forever if testDbFails never succeeds.
            while (self.testDbFails(db, file)):
                self.sc.makeBlastDb(db)
    # NOTE(review): assumed to run once after the walk - confirm nesting.
    shutil.rmtree(self.resourcePath("/Test" + "/" + self.dbName))
def assign_class(self, seq_record):
    """Classifies sequence as BCR, TCR, or MHC

    Runs ``self.run_hmmscan`` into a temporary file, parses it as
    ``hmmer3-text`` and derives a receptor/chain type from the top
    descriptions. When either is missing, falls back to the B2M, IgNAR,
    MHC-I, MHC-II alpha and MHC-II beta checks, in that order.

    Args:
        seq_record: A biopython sequence record object

    Returns:
        A ``(receptor, chain_type, score)`` tuple. ``receptor`` and
        ``chain_type`` are ``None`` when no class could be assigned.
    """
    with tempfile.NamedTemporaryFile(mode="w") as hmm_out:
        receptor, chain_type = None, None
        self.run_hmmscan(seq_record, hmm_out)
        hmmer_query = SearchIO.read(hmm_out.name, 'hmmer3-text')
        hit_table, top_descriptions = self.parse_hmmer_query(hmmer_query)
        # Score of the first data row minus 100; falls back to -100 when that
        # lookup fails.
        # NOTE(review): the bare except also hides unrelated errors.
        try:
            score = int(hit_table[1][3] - 100)
        except:
            score = int(0 - 100)
        receptor, chain_type = self.get_chain_type(top_descriptions)

        # We have no hits so now we check for MHC and IgNAR
        # This avoids excessive computations
        if not receptor or not chain_type:
            if self.is_b2m(seq_record):
                return ("B2M", "-", 0)
            if self.is_ignar(seq_record):
                return ("BCR", "IgNAR", 0)
            mhc_I_score = None
            mhc_I_score = self.is_MHC(str(seq_record.seq), self.mhc_I_hmm)
            # NOTE(review): no None guard here, unlike the MHC-II checks below.
            if mhc_I_score >= self.hmm_score_threshold:
                return ('MHC-I', 'alpha',
                        int(mhc_I_score - self.hmm_score_threshold))
            else:
                mhc_II_alpha_score = None
                mhc_II_alpha_score = self.is_MHC(str(seq_record.seq),
                                                 self.mhc_II_alpha_hmm)
                if mhc_II_alpha_score and mhc_II_alpha_score >= self.hmm_score_threshold:
                    # NOTE(review): not wrapped in int() like the other
                    # branches - confirm whether that is intentional.
                    return ('MHC-II', 'alpha',
                            mhc_II_alpha_score - self.hmm_score_threshold)
                else:
                    mhc_II_beta_score = None
                    mhc_II_beta_score = self.is_MHC(
                        str(seq_record.seq), self.mhc_II_beta_hmm)
                    if mhc_II_beta_score and mhc_II_beta_score >= self.hmm_score_threshold:
                        return ('MHC-II', 'beta',
                                int(mhc_II_beta_score -
                                    self.hmm_score_threshold))
                    else:
                        # Nothing passed the threshold: report no class, with
                        # the better of the two MHC-II scores (or the hmmscan
                        # score when both are zero).
                        if mhc_II_alpha_score == 0 and mhc_II_beta_score == 0:
                            return (None, None, score)
                        if mhc_II_alpha_score >= mhc_II_beta_score:
                            return (None, None,
                                    int(mhc_II_alpha_score -
                                        self.hmm_score_threshold))
                        else:
                            return (None, None,
                                    int(mhc_II_beta_score -
                                        self.hmm_score_threshold))
        else:
            return (receptor, chain_type, score)
def blast():
    """BLAST ``assembly.fasta`` against Herpesviridae and log the top hits.

    The assembly is submitted to NCBI ``blastn`` (database ``nr``, restricted
    to Herpesviridae), the XML response is saved to ``blast.xml``, and a
    header plus one line per hit (at most ten) is appended to
    ``MiniProject.log``. Each line holds the hit description, hit length,
    number of HSPs and the identities, gaps, bit score and e-value of the
    first HSP.
    """
    with open("assembly.fasta") as fasta_handle:
        fasta = fasta_handle.read()

    # run blast against the assembled sequence
    handle = NCBIWWW.qblast("blastn", "nr", fasta,
                            entrez_query='"Herpesviridae"[organism]')
    with open("blast.xml", "w") as out_handle:
        out_handle.write(handle.read())

    blast_qresult = SearchIO.read("blast.xml", "blast-xml")

    max_blast_id = 10
    with open('MiniProject.log', 'a') as output:
        output.write('seq_title ' + 'align_len ' + 'number_HSPs ' +
                     'topHSP_ident ' + 'topHSP_gaps ' + 'topHSP_bits ' +
                     'topHSP_expect \n')
        # Slicing copes with fewer than ``max_blast_id`` hits on its own.
        for hit in blast_qresult.hits[:max_blast_id]:
            top_hsp = hit[0]
            fields = (hit.description, hit.seq_len, len(hit.hsps),
                      top_hsp.ident_num, top_hsp.gap_num, top_hsp.bitscore,
                      top_hsp.evalue)
            output.write(' '.join(str(field) for field in fields) + '\n')
def blaster(fasta_file):
    """
    BLAST the given FASTA file and save the hits from the target species.

    The query is submitted to NCBI ``blastn`` against ``nt``; the XML
    response is stored in ``my_blast.xml``. For every entry of the
    module-level ``target_species``, the hit sequence of the first HSP of
    each hit whose description contains that entry is collected.

    :param fasta_file: path of the FASTA file holding the query sequence
    :return: name of the FASTA file the selected hit sequences were written to
    """
    out_fasta = "blast_results.fasta"

    with open(fasta_file) as fasta_handle:
        fasta_string = fasta_handle.read()
    print("BLAST initiated...")
    # qblast opens up the BLAST function in NCBI.
    result_handle = NCBIWWW.qblast("blastn", "nt", fasta_string)
    print("BLAST search done.")

    # Results need to go into an XML file.
    with open("my_blast.xml", "w") as out_handle:
        out_handle.write(result_handle.read())
    blast_result = SearchIO.read("my_blast.xml", "blast-xml")

    print("Writing BLAST results to file..")
    records = []
    for species in target_species:
        # Iterate through the blast result hits.
        for hit in blast_result:
            print(hit)
            if species in hit.description:
                # If the target species is found, append.
                records.append(hit[0].hit)

    # The file is written under the same name that is reported and returned;
    # the hyphenated name used previously did not match the return value.
    SeqIO.write(records, out_fasta, "fasta")
    print("\nBLAST result file written to blast_results.fasta.")
    return out_fasta
def extract_faa_seqs(HMM_TO_USE):
    """Write one amino-acid sequence per bin for the hits of one HMM.

    Reads ``<TEMP_FOLDER>/<hmm stem>.out`` (HMMER3 tabular), matches hit IDs
    against the records of ``ORF_FILE`` and keeps, for each bin ID found in
    ``G2B_DICT``, the sequence of the hit with the highest (integer-truncated)
    bit score. Results go to ``<TEMP_FOLDER>/<hmm stem>.faa``; nothing is
    written when the parsed result is empty.
    """
    stem = TEMP_FOLDER + "/" + HMM_TO_USE.rsplit(".")[0]
    hmm_hits = SearchIO.read(stem + ".out", 'hmmer3-tab')
    faa_path = stem + ".faa"

    if not hmm_hits:
        return

    best_by_bin = dict()
    for seq_record in SeqIO.parse(ORF_FILE, "fasta"):
        for hit in hmm_hits:
            # Compare the seq record from the fasta file to the hit IDs.
            if seq_record.id != hit.id:
                continue
            bin_id = G2B_DICT[hit.id]['binID']
            candidate = {
                "sequence": seq_record.seq.rstrip('*'),
                "bitscore": hit.bitscore
            }
            if bin_id not in best_by_bin:
                # First hit seen for this bin.
                best_by_bin[bin_id] = candidate
                continue
            # Bin already present: keep whichever hit scores higher.
            print(bin_id + " has multiple hits for " + HMM_TO_USE)
            if int(hit.bitscore) > int(best_by_bin[bin_id]['bitscore']):
                best_by_bin[bin_id] = candidate

    with open(faa_path, 'w') as faa_out:
        for bin_id in best_by_bin:
            faa_out.write('>' + bin_id + '\n' +
                          str(best_by_bin[bin_id]['sequence']) + '\n')
def reciprocal_hmm_search(modelname, modelname_regex, filename, organism,
                          rev_inc_bitscore_percentage, out_filename=None):
    '''
    Performs reciprocal hmmer search against the proteome of given organism

    Runs ``phmmer`` with ``filename`` as query against
    ``<proteomes_dir><organism>.fasta`` and checks whether the description of
    the best hit matches ``modelname_regex``.

    :param modelname: model name, used to build the default output filename
    :param modelname_regex: regular expression the top hit must match
    :param filename: FASTA file with the query sequence(s)
    :param organism: proteome name (file stem inside ``proteomes_dir``)
    :param rev_inc_bitscore_percentage: currently unused; kept for callers
    :param out_filename: tabular output path; defaults to
        ``<modelname>.rechits_<organism>``
    :return: True when the best reciprocal hit matches, False otherwise
    '''
    print("# Reciprocal search...")
    if out_filename is None:
        out_filename = modelname + ".rechits_" + organism
    reciprocal_search_command = ("phmmer --noali --tblout " + out_filename +
                                 " " + filename + " " + proteomes_dir +
                                 organism + ".fasta > hmmer_res")
    os.system(reciprocal_search_command)

    # A missing, empty or unparsable result file simply means "no hits".
    # ``Exception`` keeps that best-effort behaviour without also swallowing
    # KeyboardInterrupt / SystemExit as a bare ``except`` would.
    try:
        hits = SearchIO.read(out_filename, "hmmer3-tab")
    except Exception:
        hits = []

    is_found = False
    if len(hits) > 0 and re.search(modelname_regex, hits[0].description):
        is_found = True

    if manual_mode:
        print(filename, is_found)
        input("Check reciprocal search results...")
    return is_found
def process_blastn(filename, loci_name):
    """Summarise the best hit of a BLASTN XML result in ``Output_<filename>``.

    The first two words of a hit description are taken as genus and species.
    The report lists the estimated species (from the best hit), the other
    species sharing the best hit's genus, whether the best HSP is reverse
    complemented, the HSP summary and its alignment.

    Args:
        filename: BLAST XML file to read; the part before the first ``_`` is
            reported as the sample name.
        loci_name: Locus name to include in the report.

    Returns:
        None. Prints ``No match found`` and writes nothing when the result
        has no hits.
    """
    blastn_qresult = SearchIO.read(filename, 'blast-xml')
    if not blastn_qresult.hits:
        print("No match found")
        return None

    best_hit = blastn_qresult[0]
    # Descriptions start with "<Genus> <species> ..."; the two names were
    # previously assigned the other way round.
    best_hit_words = best_hit.description.split(' ')
    best_hit_genus = best_hit_words[0]
    best_hit_species = best_hit_words[1]
    species_same_genus = {
        each_hit.description.split(' ')[0] + " " +
        each_hit.description.split(' ')[1]
        for each_hit in blastn_qresult.hits
        if best_hit_genus in each_hit.description
    }
    blast_hsp = blastn_qresult[0][0]

    rule = "--------------------------------------------------------------------\n"
    with open("Output_" + filename, 'w') as output_handle:
        output_handle.write(rule)
        output_handle.write("Result Summary\n")
        output_handle.write(rule)
        output_handle.write("Sample name : %s\nLoci name : %s\n" %
                            (filename.split('_')[0], loci_name))
        output_handle.write("Estimated species : %s\n" %
                            (best_hit_genus + " " + best_hit_species))
        output_handle.write("Other specie(s) with same genus : ")
        output_handle.write(str(species_same_genus) + '\n')
        if blast_hsp.hit_strand != blast_hsp.query_strand:
            output_handle.write("Reverse complemented? : Yes\n")
        else:
            output_handle.write("Reverse complemented? : No\n")
        output_handle.write(rule)
        output_handle.write("Hit summary\n")
        output_handle.write(rule)
        output_handle.write(str(blast_hsp))
        output_handle.write('\n')
        output_handle.write(rule)
        output_handle.write("Alignment detail\n")
        output_handle.write(rule)
        output_handle.write("%s - %s\n" % (blast_hsp.aln_all[0][0].seq,
                                           blast_hsp.aln_all[0][0].id))
        output_handle.write(blast_hsp.aln_annotation['similarity'] + '\n')
        output_handle.write("%s - %s\n" % (blast_hsp.aln_all[0][1].seq,
                                           blast_hsp.aln_all[0][1].id))
def generate_protein_model(self, query: str, template: str,
                           blast_xml_path: str, out_dir: str,
                           template_dir: str):
    """Build a PIR alignment for ``query`` on ``template`` and run the modeller script.

    The single BLAST hit whose ID equals ``template`` supplies the alignment
    (first HSP). The template row is passed through
    ``replace_missing_residues`` with ``<template_dir>/<template>.ent``, both
    rows are written to ``<out_dir>/<template>.pir``, and
    ``modeller_script.py`` (next to this file) is launched through
    ``self.modpysh``. The sixth character of ``template`` is used as the
    chain identifier.
    """
    qresult = SearchIO.read(blast_xml_path, 'blast-xml')
    hits = [hit for hit in qresult.hits if hit.id == template]
    assert len(hits) == 1
    best = hits[0].hsps[0].aln
    tseq = replace_missing_residues(str(best[1].seq),
                                    f'{template_dir}/{template}.ent')

    Path(out_dir).mkdir(parents=True, exist_ok=True)
    pir_file = f'{out_dir}/{template}.pir'

    query_record = SeqRecord(Seq(str(best[0].seq), generic_protein),
                             id=query,
                             name='',
                             description=f'sequence:{query}::::::::')
    chain = template[5].upper()
    template_record = SeqRecord(
        Seq(tseq, generic_protein),
        id=template,
        name='',
        description=f'structureX:{template}::{chain}::{chain}::::')
    SeqIO.write([query_record, template_record], pir_file, 'pir')

    script = Path(__file__).parent.resolve() / 'modeller_script.py'
    arg = [self.modpysh, 'python3', script, pir_file, template, query,
           template_dir]
    subprocess.run(arg,
                   stdout=subprocess.PIPE,
                   stderr=subprocess.PIPE,
                   universal_newlines=True)
def read_xml():
    """Collect per-hit statistics from ``blast_result.xml``.

    For every hit of the single BLAST query in the file, the values of its
    first HSP are gathered into parallel lists.

    Returns:
        Six lists, in order: bit scores, query coverage (%), identity (%),
        positives (%), e-values and hit accession codes. Percentages are
        rounded to one decimal.
    """
    def percent(part, whole):
        return round((float(part) / float(whole) * 100), 1)

    score, query_cover, identity = [], [], []
    positives, evalue, protein_codes = [], [], []

    blast_qresult = SearchIO.read('blast_result.xml', 'blast-xml')
    for hit in blast_qresult:
        top_hsp = hit[0]
        score.append(top_hsp.bitscore)
        # NOTE(review): coverage is span / query_end, not span / query
        # length - confirm this is intended.
        query_cover.append(percent(top_hsp.query_span, top_hsp.query_end))
        identity.append(percent(top_hsp.ident_num, top_hsp.hit_span))
        positives.append(percent(top_hsp.pos_num, top_hsp.hit_span))
        evalue.append(top_hsp.evalue)
        protein_codes.append(hit.accession)
    return score, query_cover, identity, positives, evalue, protein_codes
def blast_search(filename, blast_temp_path):
    '''
    Run (or reuse) a tblastn search and return the filtered hit locations.

    If ``<blast_temp_path>/<filename>.xml`` does not exist yet, ``filename``
    itself is written as the query file content and tblastn is run against
    the RefSeq genomic database. HSPs are kept when their e-value is below
    1e-10, their alignment span exceeds 209 and identities divided by the
    query length exceed 0.2.

    Returns a list of ``(accession, hit_start, hit_end)`` tuples, where the
    accession is the second-to-last ``|``-separated field of the hit ID.
    '''
    query_path = '%s/%s' % (blast_temp_path, filename)
    filepath = "%s/%s.xml" % (blast_temp_path, filename)
    if not os.path.exists(filepath):
        with open(query_path, 'w') as f:
            f.write(filename)
        search_cmd = NcbitblastnCommandline(
            query=query_path,
            db="/research/sequences/GenBank/blast/db/refseq_genomic",
            outfmt=5,
            out=filepath)
        subprocess.call(str(search_cmd), shell=True)

    result = SearchIO.read(filepath, 'blast-xml')
    iden_cutoff = 0.2
    return [
        (hsp.hit_id.split('|')[-2], hsp.hit_start, hsp.hit_end)
        for hsp in result.hsps
        if hsp.evalue < 1e-10                      # e-value filter
        and hsp.aln_span > 209                     # length filter
        and float(hsp.ident_num) / float(result.seq_len) > iden_cutoff
    ]
def analyze_BLAST_result(input_fasta_name_wo_path, result_handle):
    """Save a BLAST XML response and print the non-PREDICTED hit descriptions.

    The response is written to
    ``sample/output/retrieved_from_<name>.xml``, where ``<name>`` is the input
    file name without its last six characters (the ``.fasta`` suffix). The
    saved file is parsed and the description of every hit that does not
    contain ``PREDICTED`` is printed.

    Note: the working directory is changed to ``sample/output`` and left
    there, as before.

    Args:
        input_fasta_name_wo_path: Name of the query FASTA file, without path.
        result_handle: Readable handle holding the BLAST XML response.
    """
    show_header("Step 2. Analyzing the BLAST result.")
    output_file_name = "retrieved_from_" + str(
        input_fasta_name_wo_path)[:-6] + ".xml"

    if not os.path.exists('sample/output'):
        os.makedirs('sample/output')
    output_folder = os.path.join(os.getcwd(), "sample/output")
    os.chdir(output_folder)

    # 'w' overwrites an existing file ('a' would append to it instead).
    with open(output_file_name, "w") as output_file:
        output_file.write(result_handle.read())

    blast_qresult = SearchIO.read(output_file_name, "blast-xml")

    # ``"PREDICTED" in desc == False`` is a chained comparison that is always
    # False, which dropped every hit; ``not in`` expresses the intended test.
    def filter_for_no_predicted_hypothetical(hit):
        return "PREDICTED" not in hit.description

    filtered_qresult = blast_qresult.hit_filter(
        filter_for_no_predicted_hypothetical)
    for hit in filtered_qresult:
        print("%s" % (hit.description))
def xml2fasta(infile=None, outfile=None):
    """Convert a BLAST XML result into a FASTA file of unique hit sequences.

    Hit sequences are taken from every HSP of the single query in ``infile``.
    Only the first sequence seen for each hit ID is kept, because duplicates
    in the BLAST output produce an error when aligning with clustal.
    """
    print('\nConverting ' + infile +
          ' to fasta format, removing duplicates...')

    # Load the blast output file
    blast_qresult = SearchIO.read(infile, "blast-xml")

    # Insertion-ordered mapping of hit ID -> sequence; the first occurrence
    # of an ID wins.
    unique_hits = {}
    for hsp in blast_qresult.hsps:
        target = hsp.hit
        if target.id not in unique_hits:
            unique_hits[str(target.id)] = str(target.seq)

    # One FASTA entry per unique hit: header line, then the sequence line.
    with open(outfile, "w") as f:
        for hit_id, hit_seq in unique_hits.items():
            f.write('> ' + hit_id + '\n')
            f.write(hit_seq + '\n')
    print('\tDone: writing to ' + outfile)
def find_similar_region_for_vntr(sema, reference_vntr, ref_file, result_list):
    """Record a VNTR whose flanked pattern has a similar region in ``ref_file``.

    The query is the last 30 characters of the left flank, the pattern and
    the first 30 characters of the right flank. It is aligned with ``blat``
    and, when ``is_false_vntr_hit`` accepts the parsed result, the VNTR id is
    appended to ``result_list``. A result file that SearchIO cannot parse
    (``ValueError``) is treated as "no similar region".

    Args:
        sema: Semaphore-like object; ``release()`` is always called on exit,
            whether or not the search succeeded.
        reference_vntr: Object exposing ``id``, ``pattern``,
            ``left_flanking_region`` and ``right_flanking_region``.
        ref_file: Reference sequence file given to ``blat`` as the database.
        result_list: List that receives the ids of VNTRs with a similar region.
    """
    # ``finally`` guarantees the slot is handed back even when the search
    # fails with something other than ValueError (missing output file, ...),
    # which previously left the semaphore held.
    try:
        from Bio import SearchIO
        vntr_id = reference_vntr.id
        q = (reference_vntr.left_flanking_region[-30:] +
             reference_vntr.pattern +
             reference_vntr.right_flanking_region[:30])
        search_index = vntr_id
        qfile = settings.BLAST_TMP_DIR + str(vntr_id) + '_' + str(
            search_index) + '_query.fasta'
        with open(qfile, "w") as output_handle:
            my_rec = SeqRecord.SeqRecord(seq=Seq.Seq(q),
                                         id='query',
                                         description='')
            SeqIO.write([my_rec], output_handle, 'fasta')

        output = 'blat_out/output_%s_%s.psl' % (vntr_id, search_index)
        command = 'blat -q=dna -oneOff=1 -tileSize=8 -stepSize=3 -minIdentity=75 %s %s %s' % (
            ref_file, qfile, output)
        os.system(command)
        os.system('rm %s' % qfile)

        try:
            qresult = SearchIO.read(output, 'blat-psl')
            if is_false_vntr_hit(qresult, reference_vntr):
                print('there is similar sequence for %s' % vntr_id)
                result_list.append(vntr_id)
        except ValueError:
            # Raised by SearchIO.read when the PSL file holds no result.
            pass
    finally:
        sema.release()
def __init__(self, Maxicircle, out_file):
    """Annotate ``Maxicircle`` with blastx hits and write them as GFF rows.

    Every ``*.faa`` file in the protein directory is used as a blastx
    database for the query ``Maxicircle``. For each database with at least
    one hit, the first HSP of the first hit becomes one tab-separated
    ``exon`` row (query id, query range, strand, ``ID=<protein file stem>``).

    Args:
        Maxicircle: Path of the nucleotide query FASTA file.
        out_file: Path of the tab-separated output file to (over)write.
    """
    rows = []
    for protein in glob.glob("/Users/Said/Github/Maxicircle/DB/AA/*.faa"):
        protein_file = protein.split("/")[-1]
        output_file = protein_file + ".xml"
        blastx_cline = NcbiblastxCommandline(query=Maxicircle,
                                             db=protein,
                                             outfmt=5,
                                             out=output_file)
        blastx_cline()
        blast_qresult = SearchIO.read(output_file, 'blast-xml')
        if len(blast_qresult) > 0:
            best = blast_qresult[0][0]
            start, end = best.query_range
            query_strand = "+" if best.query_strand > 0 else "-"
            chromosome = best.query_id
            rows.append([chromosome, ".", "exon", start, end, ".",
                         query_strand, ".",
                         "ID=" + protein_file.split(".faa")[0]])
    print(str(len(rows)) + " exons found")

    # The context manager closes (and therefore flushes) the file, which was
    # previously left open; newline='' is what the csv module asks for.
    with open(out_file, 'w', newline='') as file_out:
        writer = csv.writer(file_out, delimiter="\t")
        writer.writerow(["##gff-version", "3"])
        writer.writerows(rows)
def blast(seq):
    """Run a remote blastx search for ``seq`` and return the top HSP.

    The XML response is stored in ``derde.xml``. ``****Alignment****`` is
    printed once for every HSP of the first record whose expect value is
    below 0.04. The first HSP of the first hit is printed and returned.
    """
    result_handle = NCBIWWW.qblast("blastx", "nr", seq)
    print(result_handle)

    with open("derde.xml", 'w') as xml_out:
        xml_out.write(result_handle.read())

    E_VALUE_THRESH = 0.04
    with open("derde.xml", 'r') as xml_in:
        first_record = next(NCBIXML.parse(xml_in))
        for alignment in first_record.alignments:
            for hsp in alignment.hsps:
                if not hsp.expect < E_VALUE_THRESH:
                    continue
                print("****Alignment****")

    # Only the information of the first hit is carried forward.
    top_hsp = SearchIO.read("derde.xml", "blast-xml")[0][0]
    print(top_hsp)
    return top_hsp
def get_locus(sequences, kir=False, verbose=False, refdata=None, evalue=10):
    """
    Gets the locus of the sequence by running blastn

    The locus is the part of a hit ID before the first ``*``. It is only
    reported when the top hits (at most three) all agree on it.

    :param sequences: sequences to blast
    :param kir: bool whether the sequences are KIR or not
    :param verbose: currently unused
    :param refdata: reference data providing ``blastdb``; a new
        ``ReferenceData`` is created when omitted
    :param evalue: e-value cutoff passed to blastn
    :return: the locus (prefixed with ``HLA-`` when needed), or ``''`` when
        there are no hits or the top hits disagree
    :rtype: ``str``

    Example usage:

        >>> from Bio.Seq import Seq
        >>> from seqann.blast_cmd import get_locus
        >>> sequence = Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC')
        >>> locus = get_locus(sequence)

    """
    if not refdata:
        refdata = ReferenceData()

    file_id = str(randomid())
    input_fasta = file_id + ".fasta"
    output_xml = file_id + ".xml"
    SeqIO.write(sequences, input_fasta, "fasta")
    blastn_cline = NcbiblastnCommandline(query=input_fasta,
                                         db=refdata.blastdb,
                                         evalue=evalue,
                                         outfmt=5,
                                         reward=1,
                                         penalty=-3,
                                         gapopen=5,
                                         gapextend=2,
                                         dust='yes',
                                         out=output_xml)
    blastn_cline()
    blast_qresult = SearchIO.read(output_xml, 'blast-xml')

    # Delete files
    cleanup(file_id)

    # Slicing instead of indexing 0..2 avoids an IndexError when the search
    # returned fewer than three hits.
    loci = [hit.id.split("*")[0] for hit in blast_qresult.hits[:3]]
    if not loci or len(set(loci)) != 1:
        return ''

    locus = loci[0]
    if has_hla(locus) or kir:
        return locus
    return "HLA-" + locus
def similarity_filter(inp_file, level):
    """Return the HSPs of a BLAST XML result above an identity threshold.

    An HSP is kept when its identities, as a percentage of the aligned query
    length, exceed ``int(level)``.
    """
    blast_qresult = SearchIO.read(inp_file, "blast-xml")
    return [
        hsp
        for hit in blast_qresult
        for hsp in hit
        if float(hsp.ident_num * 100 / len(hsp.query)) > int(level)
    ]
def read_hmmer_file(f_path):
    """
    Parses a HMMER3 text output file with Biopython's SearchIO.

    The path is first passed through ``_check_file``. Returns the object
    produced by ``SearchIO.read``, which can be iterated to get the hits.
    """
    checked_path = _check_file(f_path)
    return SearchIO.read(checked_path, 'hmmer3-text')
def read_hmmer_file(f_path):
    """
    Parses a HMMER3 text output file with Biopython's SearchIO.

    The path is first passed through ``_check_file``. Returns the object
    produced by ``SearchIO.read``, which can be iterated to get the hits.
    """
    checked_path = _check_file(f_path)
    return SearchIO.read(checked_path, "hmmer3-text")
def check_blast(blast_res):
    """Save a BLAST response to ``ncbi_result.xml`` and try to parse it.

    Args:
        blast_res: Readable handle holding the BLAST XML response.

    Returns:
        ``(result, True)`` with the parsed query result on success, or
        ``("", False)`` when saving or parsing fails for any reason.
    """
    # ``Exception`` keeps the "any failure means no result" contract while
    # letting KeyboardInterrupt / SystemExit propagate, unlike a bare except.
    try:
        with open("ncbi_result.xml", "w") as result_file:
            result_file.write(blast_res.read())
        result = SearchIO.read("ncbi_result.xml", "blast-xml")
    except Exception:
        return "", False
    return result, True
def get_efficiency(reads_file, genome, target):
    """Estimate integration efficiency from reads spanning the insertion site.

    The 30 bp window located 35-65 bp downstream of ``target`` in ``genome``
    is searched in ``reads_file``; reads matching it are counted as
    uninterrupted (wild-type). A second 30 bp window 5 kb away serves as a
    control. This data was not used in the paper, only for our own checks.

    Args:
        reads_file: Reads to search in (passed to ``do_blast``).
        genome: Record whose ``seq`` is the reference genome.
        target: Target sequence; when falsy nothing is computed.

    Returns:
        ``(uninterrupted_reads, other_site_reads, efficiency)``. All three
        are ``None`` without a target; ``efficiency`` is ``None`` when the
        control site has no matching reads.
    """
    def count_matching_reads(blast_xml):
        # A read counts when it has exactly one HSP with more than 28 of the
        # 30 query bases identical.
        res = SearchIO.read(blast_xml, 'blast-xml')
        return len([
            hit for hit in res.hits
            if len(hit.hsps) == 1 and hit.hsps[0].ident_num > 28
        ])

    uninterrupted_reads, other_site_reads, efficiency = None, None, None
    if target:
        site = genome.seq.upper().find(target)
        # ``find`` returns -1 when absent, so index 0 is a valid match on the
        # forward strand and must not fall through to the reverse strand.
        if site >= 0:
            strand_seq = genome.seq
        else:
            site = genome.seq.upper().reverse_complement().find(target)
            strand_seq = genome.seq.reverse_complement()
        # add length of target, and the span of +35 to +65
        window_start = site + len(target) + 35
        uninterrupted_insertion_site = strand_seq[window_start:window_start + 30]

        uninterrupted_fasta = 'tmp/uninterrupted.fasta'
        SeqIO.write(SeqRecord(uninterrupted_insertion_site, id="target"),
                    uninterrupted_fasta, 'fasta')
        blast_filename = 'tmp/uninterrupted_blastresults.xml'
        do_blast(uninterrupted_fasta, reads_file, blast_filename)
        # We are querying the 30bp sequence around the insertion site for WT
        # sequence
        uninterrupted_reads = count_matching_reads(blast_filename)

        # also do a blast for another random site in the genome to compare
        other_site = site - 5000 if site > 20000 else site + 5000
        SeqIO.write(
            SeqRecord(genome.seq[other_site:other_site + 30], id='site'),
            'tmp/other_site.fasta', 'fasta')
        do_blast('tmp/other_site.fasta', reads_file,
                 'tmp/other_site_blast.xml')
        other_site_reads = count_matching_reads('tmp/other_site_blast.xml')

        # Without control reads the ratio is undefined; report None instead
        # of failing with ZeroDivisionError.
        if other_site_reads:
            efficiency = (other_site_reads -
                          uninterrupted_reads) / other_site_reads
    return (uninterrupted_reads, other_site_reads, efficiency)
def test():
    """Write the top-HSP hit sequence of every hit to ``data/msa.fasta``.

    Reads the single query result stored in ``output/my_blast.xml``.
    """
    blast_qresult = SearchIO.read('output/my_blast.xml', 'blast-xml')
    records = [hit[0].hit for hit in blast_qresult]
    SeqIO.write(records, 'data/msa.fasta', 'fasta')
def find_frameshift(sbjct_dict, query_dict, pseudo_hits, temp_dir,
                    out_frameshift):
    """Split pseudogene hits into frameshifted and non-frameshifted ones.

    For each hit, the query protein is re-aligned with ``exonerate``
    (protein2dna) against the hit region extended by 1000 bp on both sides.
    A hit is a frameshift when the best alignment overlaps the original hit
    region and spans more than one reading frame.

    Args:
        sbjct_dict: Mapping of subject (chromosome) id to its record.
        query_dict: Mapping of query id to its record.
        pseudo_hits: Hits as indexable rows; item 0 is the subject id and
            item 8 starts with a ``key=<query id>`` attribute.
        temp_dir: Existing directory for the intermediate exonerate files.
        out_frameshift: Path of the FASTA file receiving frameshift regions.

    Returns:
        The list of hits that are not frameshifts.

    Raises:
        AssertionError: If ``temp_dir`` is not a directory.
    """
    assert (os.path.isdir(temp_dir))
    sys.stderr.write("finding frameshift mutations...\n")

    frameshifts = []
    non_frameshift = []
    exn_query = temp_dir + "/" + "exn_query.fa"
    exn_target = temp_dir + "/" + "exn_target.fa"
    align_file = temp_dir + "/" + "fshift_exonerate.exn"
    flank = 1000

    for hit in pseudo_hits:
        chrom = hit[0]
        record = sbjct_dict[chrom]
        qseqid = hit[8].split(";")[0].split("=")[1]
        SeqIO.write(query_dict[qseqid], exn_query, "fasta")
        flank_record = _get_hit_record(record, hit, flank)
        SeqIO.write(flank_record, exn_target, "fasta")

        # alignment using exonerate
        p = subprocess.Popen("exonerate -m protein2dna -n 1 -q " + exn_query +
                             " -t " + exn_target + ">" + align_file,
                             shell=True)
        p.wait()

        fshift = False
        hsp = None
        try:
            qresult = SearchIO.read(align_file, "exonerate-text")
            hsp = qresult[0][0]  # first hit, first hsp
            # query overlapping with the best-hit
            new_hit_start = flank + 1
            new_hit_end = len(flank_record.seq) - flank
            if hsp.hit_start + 1 <= new_hit_end and \
                    hsp.hit_end >= new_hit_start:
                # there are frameshifts
                if len(hsp.hit_frame_all) > 1:
                    fshift = True
        except Exception:
            # Best effort: an empty or unparsable alignment means "no
            # frameshift". Unlike a bare except, this lets Ctrl-C through.
            pass

        if fshift:
            fshift_seq = flank_record.seq[hsp.hit_start:hsp.hit_end]
            fshift_id = hit[0] + ":" + \
                str(hsp.hit_start + 1) + "-" + str(hsp.hit_end)
            frameshift_record = SeqRecord(fshift_seq,
                                          id=fshift_id,
                                          name=fshift_id,
                                          description="qseqid=" + qseqid)
            frameshifts.append(frameshift_record)
        else:
            non_frameshift.append(hit)

    SeqIO.write(frameshifts, out_frameshift, "fasta")
    sys.stderr.write("done.\n")
    return non_frameshift
def process_hmmscan():
    """Parse the hmmscan tabular output stored in ``HMMOUT_FILENAME``.

    Returns the parsed result (``hmmer3-tab`` format), or an empty list when
    SearchIO raises ``ValueError`` for the file.
    """
    try:
        parsed = SearchIO.read(HMMOUT_FILENAME, 'hmmer3-tab')
    except ValueError:
        return []
    return parsed
def get_gene_homologous(gene: str, output_dir: str, limit: int = 100):
    """Fetch homologous proteins of a gene, one FASTA file per organism.

    The first record of the FASTA file ``gene`` is searched with remote
    blastp against ``nr`` unless ``<output_dir>/results.xml`` already exists.
    For each hit, the organism is the first ``[bracketed]`` name in the
    description; hits without one, from an organism already seen, from
    ``synthetic construct`` or from an unclassified organism are skipped.
    Selected sequences are written to ``<output_dir>/fastas/<Organism>.fasta``
    and the organism names to ``<output_dir>/species.txt``.

    Returns the list of written FASTA paths (stops once ``limit`` organisms
    have been collected).
    """
    from Bio import SeqIO
    from Bio.Blast import NCBIWWW

    Path(output_dir).mkdir(exist_ok=True)
    blast_results_filename = f'{output_dir}/results.xml'
    if not Path(blast_results_filename).exists():
        rec = next(SeqIO.parse(gene, 'fasta'))
        print('[*] Blastp-ing HBA1, please wait...')
        result_handle = NCBIWWW.qblast('blastp',
                                       'nr',
                                       rec.seq,
                                       hitlist_size=1000)
        with open(blast_results_filename, 'w') as save_file:
            save_file.write(result_handle.read())
        print(f'[*] Blastp-ed, results at {blast_results_filename}')

    from Bio import SearchIO
    results = SearchIO.read(blast_results_filename, 'blast-xml')
    Path(f'{output_dir}/fastas').mkdir(exist_ok=True)

    organisms = set()
    seqs_files = []
    for hit in results.hits:
        found = re.findall(r"\[([A-Za-z ]+)\]", hit.description)
        if not found:
            continue
        org = found[0]
        # Skip repeats and unwanted sources.
        # (add `'bacter' in org` here to exclude bacteria)
        if (org in organisms or org == 'synthetic construct'
                or 'unclassified' in org):
            continue
        organisms.add(org)

        rec = hit.hsps[0].hit
        rec.description = ''
        rec.id = org.replace(' ', '_')
        print(f'{org} = {rec.id}')
        seq_file = f'{output_dir}/fastas/{rec.id}.fasta'
        SeqIO.write(rec, seq_file, 'fasta')
        seqs_files.append(seq_file)

        # limit species
        if len(organisms) == limit:
            break

    with open(f'{output_dir}/species.txt', 'w') as f:
        f.write('\n'.join(organisms))
    return seqs_files
def find_frameshift(sbjct_dict, query_dict, pseudo_hits, temp_dir,
                    out_frameshift):
    """Split pseudogene hits into frameshifted and non-frameshifted ones.

    For each hit, the query protein is re-aligned with ``exonerate``
    (protein2dna) against the hit region extended by 1000 bp on both sides.
    A hit is a frameshift when the best alignment overlaps the original hit
    region and spans more than one reading frame.

    Args:
        sbjct_dict: Mapping of subject (chromosome) id to its record.
        query_dict: Mapping of query id to its record.
        pseudo_hits: Hits as indexable rows; item 0 is the subject id and
            item 8 starts with a ``key=<query id>`` attribute.
        temp_dir: Existing directory for the intermediate exonerate files.
        out_frameshift: Path of the FASTA file receiving frameshift regions.

    Returns:
        The list of hits that are not frameshifts.

    Raises:
        AssertionError: If ``temp_dir`` is not a directory.
    """
    assert (os.path.isdir(temp_dir))
    sys.stderr.write("finding frameshift mutations...\n")

    frameshifts = []
    non_frameshift = []
    exn_query = temp_dir + "/" + "exn_query.fa"
    exn_target = temp_dir + "/" + "exn_target.fa"
    align_file = temp_dir + "/" + "fshift_exonerate.exn"
    flank = 1000

    for hit in pseudo_hits:
        chrom = hit[0]
        record = sbjct_dict[chrom]
        qseqid = hit[8].split(";")[0].split("=")[1]
        SeqIO.write(query_dict[qseqid], exn_query, "fasta")
        flank_record = _get_hit_record(record, hit, flank)
        SeqIO.write(flank_record, exn_target, "fasta")

        # alignment using exonerate
        p = subprocess.Popen("exonerate -m protein2dna -n 1 -q " + exn_query +
                             " -t " + exn_target + ">" + align_file,
                             shell=True)
        p.wait()

        fshift = False
        hsp = None
        try:
            qresult = SearchIO.read(align_file, "exonerate-text")
            hsp = qresult[0][0]  # first hit, first hsp
            # query overlapping with the best-hit
            new_hit_start = flank + 1
            new_hit_end = len(flank_record.seq) - flank
            if hsp.hit_start + 1 <= new_hit_end and \
                    hsp.hit_end >= new_hit_start:
                # there are frameshifts
                if len(hsp.hit_frame_all) > 1:
                    fshift = True
        except Exception:
            # Best effort: an empty or unparsable alignment means "no
            # frameshift". Unlike a bare except, this lets Ctrl-C through.
            pass

        if fshift:
            fshift_seq = flank_record.seq[hsp.hit_start:hsp.hit_end]
            fshift_id = hit[0] + ":" + \
                str(hsp.hit_start + 1) + "-" + str(hsp.hit_end)
            frameshift_record = SeqRecord(fshift_seq,
                                          id=fshift_id,
                                          name=fshift_id,
                                          description="qseqid=" + qseqid)
            frameshifts.append(frameshift_record)
        else:
            non_frameshift.append(hit)

    SeqIO.write(frameshifts, out_frameshift, "fasta")
    sys.stderr.write("done.\n")
    return non_frameshift
def getIndices(resultHandle):
    '''Return the hit start and end of the best BLAST hit.

    Used when the indices are not provided directly by the user. The first
    HSP of the first hit in the tabular BLAST result is printed, and its
    ``(hit_start, hit_end)`` is returned.
    '''
    top_hsp = SearchIO.read(resultHandle, 'blast-tab')[0][0]
    print(top_hsp)
    return top_hsp.hit_start, top_hsp.hit_end
def getIndices(resultHandle):
    """Return the hit start and end of the best BLAST hit.

    Used when the indices are not provided directly by the user. The first
    HSP of the first hit in the tabular BLAST result is printed, and its
    ``(hit_start, hit_end)`` is returned.
    """
    top_hsp = SearchIO.read(resultHandle, "blast-tab")[0][0]
    print(top_hsp)
    return top_hsp.hit_start, top_hsp.hit_end
def sequence_search(self):
    '''Search the sequence database for chains similar to the reference chain

    The reference chain's SEQRES sequence is written to ``query.fasta`` in
    the output directory, the configured search tool is run with ``-m 10``
    output into ``output.fasta``, and hits are filtered by the identity and
    similarity ranges given in ``self.args``. The reference chain itself is
    skipped.

    :returns: a list of dicts with ``pdb_id``, ``chain_id``, ``identity`` and
        ``similarity`` (percentages formatted with two decimals), at most
        ``self.args.maximum_sequences`` long
    :raises IndexError: when the reference chain has no SEQRES record
    '''
    # Locate the SEQRES record of the reference chain.
    sequence = None
    records = list(SeqIO.parse(self.reference_pdb_file, 'pdb-seqres'))
    for record in records:
        (record_pdb_id, record_chain_id) = (
            record.id.split(':') if ':' in record.id else ('', record.id))
        same_entry = record_pdb_id.lower() == self.reference_id.lower()
        if same_entry and record_chain_id == self.reference_chain_id:
            sequence = record
            break
    if sequence is None:
        raise IndexError('Sequence not found for protein chain {}'.format(
            self.reference_chain_id))

    query_path = os.path.join(self.output_dir, 'query.fasta')
    output_path = os.path.join(self.output_dir, 'output.fasta')

    # Write query sequence file
    with open(query_path, 'w') as output_handle:
        SeqIO.write(sequence, output_handle, 'fasta')

    # Run FASTA (a '-b <maximum_sequences>' option is not passed)
    args = ['-q', '-m', '10', query_path, self.args.sequence_file]
    with open(output_path, 'w') as out:
        call([self.args.sequence_search_tool] + args, stdout=out)

    # Read alignment output
    target_chains = []
    query_result = SearchIO.read(output_path, "fasta-m10")
    for hit in query_result:
        (target_pdb_id, target_chain_id) = self.extract_ids(hit.id)
        hsp = hit[0]
        is_reference = (target_pdb_id.lower() == self.reference_id.lower()
                        and target_chain_id == self.reference_chain_id)
        if is_reference:
            continue
        if not (self.args.minimum_identity <= hsp.ident_pct
                <= self.args.maximum_identity):
            continue
        if not (self.args.minimum_similarity <= hsp.pos_pct
                <= self.args.maximum_similarity):
            continue
        target_chains.append({
            'pdb_id': target_pdb_id,
            'chain_id': target_chain_id,
            'identity': '{:.2f}'.format(hsp.ident_pct),
            'similarity': '{:.2f}'.format(hsp.pos_pct)
        })
        if len(target_chains) >= self.args.maximum_sequences:
            break
    return target_chains
def hmmer(query):
    """Performs hmmsearch with a given gene.

    The profile ``profiles/<query>.hmm`` under ``dfo`` is searched against
    ``infile`` with an E-value cutoff of 1e-10 and the report is written to
    ``<args.output>/tmp/<sample_name>/<query>.hmmout``. ``dfo``, ``infile``,
    ``args`` and ``sample_name`` are read from module scope.

    The command is run as an argument list without a shell, so paths that
    contain spaces or shell metacharacters are passed through verbatim
    instead of being re-interpreted by ``/bin/sh``. The exit status of
    hmmsearch is not checked.

    returns: tuple with (gene_name, list of hmm hits [hmm1, hmm2 ... hmmn])
    """
    hmm_prof = str(Path(dfo, f'profiles/{query}.hmm'))
    report_path = f'{args.output}/tmp/{sample_name}/{query}.hmmout'
    cmd = ['hmmsearch', '-E', '1e-10', hmm_prof, str(infile)]
    # Redirect hmmsearch's stdout into the report file ourselves rather than
    # relying on a shell ">" redirect.
    with open(report_path, 'w') as report_handle:
        subprocess.run(cmd, stdout=report_handle)
    hits = [hit.id for hit in SearchIO.read(report_path, 'hmmer3-text')]
    return query, hits
def run_hmmsearch(in_file, hmm, threads):
    """Pipe sequence text through ``hmmsearch`` and parse its text report.

    :param in_file: sequence data as a string; it is encoded and fed to
        hmmsearch on stdin (the sequence-file argument is ``-``)
    :param hmm: HMM profile argument passed to hmmsearch
    :param threads: value for ``--cpu``; converted with ``str`` so an int
        is accepted as well
    :returns: the object returned by ``SearchIO.read(..., 'hmmer3-text')``,
        or ``None`` after printing a message when hmmsearch could not be
        run or exited with a non-zero status
    """
    if which('hmmsearch') is None:
        exit('[!] hmmsearch not found')
    print("[>] Running hmmsearch... ", end="", flush=True)
    cmd = ['hmmsearch', '--noali', '--cut_tc', '--cpu', str(threads), hmm, '-']
    child = Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=PIPE)
    try:
        # Let communicate() feed stdin while draining stdout/stderr; writing
        # to child.stdin by hand first can deadlock once a pipe buffer fills.
        stdout_data, _ = child.communicate(input=in_file.encode())
    except IOError:
        print('\n[!] hmmer failed to run, is your input DNA?')
        return None
    if child.returncode != 0:
        # communicate() hides a broken pipe, so detect a failed run through
        # the exit status instead.
        print('\n[!] hmmer failed to run, is your input DNA?')
        return None
    return SearchIO.read(StringIO(stdout_data.decode('utf-8')), 'hmmer3-text')
def runLocalBLAST(genome, genomeDict, genomeFastaF, genomeDatabaseF, workspaceRootDir, sourceIndex, neighbor_num, bitscore_cutoff):
    """BLAST every protein of a genome against its chromosomal neighbours.

    For each record in ``genomeFastaF`` the sequence is written to a query
    file, the identifiers returned by ``getNeighbors`` are written to a GI
    restriction list, ``blastp`` is run against ``genomeDatabaseF`` limited to
    that list, and the parsed alignments are stored on
    ``genome[chromIndex][geneIndex].alignments``.

    :param genome: nested sequence indexed as ``genome[chromIndex][geneIndex]``;
        updated in place
    :param genomeDict: maps a gene's ``gi`` to a sequence whose first two
        items are ``chromIndex`` and ``geneIndex``
    :param genomeFastaF: FASTA file with the protein sequences
    :param genomeDatabaseF: BLAST database name passed as ``db``
    :param workspaceRootDir: prefix for the temporary files (concatenated
        as-is, so it should end with a path separator)
    :param sourceIndex: value embedded in the temporary file names
    :param neighbor_num: neighbour count; also passed as ``dbsize`` and
        ``searchsp``
    :param bitscore_cutoff: forwarded to ``parseBLASTOut``
    """
    # auxiliary files
    queryF = workspaceRootDir + 'query_active' + str(sourceIndex) + '.fasta'
    giF = workspaceRootDir + 'girestrict_active' + str(sourceIndex) + '.txt'
    blastOutF = workspaceRootDir + 'active_blast' + str(sourceIndex) + '.xml'

    try:
        # load sequences
        reciter = SeqIO.parse(genomeFastaF, 'fasta')
        for aaRecord in reciter:
            # load information, find in genome
            thisGene = parseNCBIDescription(aaRecord.description)
            position = genomeDict[thisGene.gi]
            chromIndex = position[0]
            geneIndex = position[1]

            # write sequence
            with open(queryF, 'w') as queryFHandle:
                SeqIO.write(aaRecord, queryFHandle, 'fasta')

            # write neighbors
            neighbors = getNeighbors(chromIndex, geneIndex, neighbor_num, genome)
            with open(giF, 'w') as giFHandle:
                giFHandle.write('\n'.join(neighbors) + '\n')

            # run BLAST
            blastp_cline = NcbiblastpCommandline(query=queryF, db=genomeDatabaseF, evalue=0.1,
                                                 outfmt=5, out=blastOutF, gilist=giF,
                                                 dbsize=neighbor_num, searchsp=neighbor_num)
            blastp_cline()

            # parse BLAST output
            blastOutput = SearchIO.read(blastOutF, 'blast-xml')
            genome[chromIndex][geneIndex].alignments = parseBLASTOut(blastOutput, thisGene.gi, bitscore_cutoff)
        reciter.close()
    finally:
        # Remove the temporary files even when a step above failed. os.remove
        # avoids handing unquoted paths to a shell; a missing file is ignored,
        # matching the best-effort behaviour of the former `rm` calls.
        for tmp_path in (queryF, giF, blastOutF):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
def parseBLAST(blastXML):
    """Summarise the best hit of a single-query BLAST XML report.

    :param blastXML: the BLAST XML report as a string
    :returns: tuple ``(num_hits, num_hsps, evalue, qstart, qend, hit_seq)``
        describing the first HSP of the first hit; when the report has no
        hits, ``num_hits`` is 0 and every other field is the string ``"NA"``
    """
    qresult = SearchIO.read(StringIO(blastXML), 'blast-xml')

    hit_count = len(qresult)
    if not hit_count:
        return (0, "NA", "NA", "NA", "NA", "NA")

    first_hit = qresult[0]
    first_hsp = first_hit[0]
    return (
        hit_count,
        len(first_hit),                 # number of HSPs in the first hit
        first_hsp.evalue,
        first_hsp.query_start + 1,      # parsed start is the XML coordinate - 1
        first_hsp.query_end,
        str(first_hsp[0].hit.seq),      # first fragment's hit SeqRecord as text
    )
def read_blat(path):
    """Collect query ranges of filtered BLAT HSPs from PSL files.

    Every file matching the glob pattern ``path`` is parsed as ``blat-psl``;
    files that cannot be parsed are skipped. HSPs are kept when their query
    span is at least 200 and the query and hit spans are within a factor of
    1.2 of each other, and when query and hit IDs differ. Each kept HSP is
    recorded as ``"<query_start>,<length> "``.

    :param path: glob pattern of PSL files
    :returns: dict mapping query ID to the concatenated range strings
    """
    def keep_hsp(hsp):
        # long enough, and query/hit spans within 20 % of each other
        return hsp.query_span >= 200 and (hsp.hit_span <= hsp.query_span * 1.2 and
                                          hsp.query_span <= hsp.hit_span * 1.2)

    data = {}
    for name in glob.glob(path):
        try:
            blat_result = SearchIO.read(name, 'blat-psl')
        except Exception:
            # Best effort: skip files that do not parse, but let
            # KeyboardInterrupt / SystemExit propagate.
            continue

        for hit in blat_result:
            hit = hit.filter(keep_hsp)
            if not hit:
                continue

            # flat list of alternating query_start, query_end values
            hsp_hit = []
            query_id = None
            for hsp in hit:
                if hsp.query_id != hsp.hit_id:
                    query_id = hsp.query_id
                    hsp_hit.append(hsp.query_start)
                    hsp_hit.append(hsp.query_end)
                    print(hsp.query_id, hsp.hit_id, hsp.query_start, hsp.query_end, hsp.hit_start, hsp.hit_end)

            if hsp_hit:
                ranges = ["{},{} ".format(hsp_hit[i], hsp_hit[i + 1] - hsp_hit[i])
                          for i in range(0, len(hsp_hit), 2)]
                data[query_id] = data.get(query_id, "") + "".join(ranges)
                print(ranges)
    return data
def filter_forw_hmm_hits(modelname, forw_inc_bitscore_percentage, unique=True):
    '''
    Reads hmmer hits from standard hmmer tabular output (``<modelname>.hits``).
    Returns hits whose bitscore exceeds the given fraction of the first
    hit's bitscore, keeping only the first hit seen for each gene name.

    :param modelname: path prefix; ``modelname + ".hits"`` is parsed as
        ``hmmer3-tab``
    :param forw_inc_bitscore_percentage: multiplier applied to the first
        hit's bitscore to obtain the inclusion threshold
    :param unique: NOTE(review): never read in this function - the per-gene
        uniqueness filter is always applied; confirm whether it was meant to
        be switchable
    :returns: list of the hits that passed the filter
    '''
    gene_regex = re.compile(r'GN=([^=]+)')
    # Any failure to read/parse the hits file is treated as "no hits".
    try:
        hits = SearchIO.read(modelname + ".hits", "hmmer3-tab")
    except:
        hits = []
    # select the longest isoform
    filtered_hits = []
    # The first hit's bitscore serves as the reference maximum
    # (presumably hits are ordered best-first - TODO confirm).
    try:
        max_bitscore = hits[0].bitscore
    except:
        max_bitscore = 0
    unique_genes = []
    na_gene_count = 0
    for h in hits:
        # try to guess gene from hit description
        gene = None
        for m in gene_regex.finditer(h.description):
            # Drop the last three characters of the captured text
            # (presumably the start of the next "XX=" field - TODO confirm).
            gene = m.group(1)[:-3]
        if not gene:
            # No gene name found: assign a unique placeholder name.
            gene = "gene" + str(na_gene_count)
            na_gene_count += 1
        #print h.id, gene, na_gene_count
        if gene not in unique_genes:
            unique_genes.append(gene)
            if h.bitscore > max_bitscore * forw_inc_bitscore_percentage:
                filtered_hits.append(h)
    # manual_mode is not defined in this function; it is read from the
    # enclosing scope.
    if manual_mode:
        print unique_genes
        # raw_input("...")
    return filtered_hits
def parse_queries(query_database, query_species, query_subspecies, query_type):
    """Parse the BLAST XML results and run the downstream hit pipeline.

    Every file in the module-level ``xml_results`` is parsed; for each HSP
    the hit accession and its percent identity are appended to the
    module-level ``hit_accessions``. Afterwards duplicate hits are removed,
    GenBank is queried, the full hits are written out and the hits are
    sorted by phylogeny.

    ``query_name`` is the description of the first hit of the most recently
    parsed file (empty string when that file has no HSPs).

    :param query_database: forwarded to ``query_genbank`` and
        ``sort_by_phylogeny``
    :param query_species: forwarded to ``sort_by_phylogeny``
    :param query_subspecies: forwarded to ``sort_by_phylogeny``
    :param query_type: forwarded to ``sort_by_phylogeny``
    """
    print('\nparsing ...')

    for xml_path in xml_results:
        print('\nopening file ...')
        # Close the handle deterministically instead of leaking it.
        with open(xml_path) as result_handle:
            print('\nXML processing ...')
            # Result unused; kept because NCBIXML.read was part of the
            # original flow and validates the report before SearchIO reads it.
            NCBIXML.read(result_handle)

        query_name = ''
        hit_num = 0

        blast_qresult = SearchIO.read(xml_path, 'blast-xml')
        for hit in blast_qresult:
            for hsp in hit:
                if hit_num == 0:
                    query_name = hit.description

                # calculate the % identity of the hit to the query sequence
                hit_identity = ((1.0 * hsp.ident_num / hsp.aln_span) * 100)

                # add accession numbers to the hit_accessions array
                hit_accessions.append([hit.accession, hit_identity])
                hit_num += 1

    # remove duplicate hits from hit_accessions
    print('\nConsolidating duplicate hits ...')
    remove_duplicate_hits(hit_accessions)
    print('\nDone')

    # for each accession number in the hit_accessions:
    #   get the FASTA sequence
    #   get the phylogeny
    print('\nQuerying GenBank ...')
    all_accessions, full_hit_seqs = query_genbank(query_name, query_database)

    print('\nOutputting Results ...')
    output_full_hits(query_name, full_hit_seqs, all_accessions)

    # remove hits that are not full sequences (sequence length < 16000) TODO
    print('\nRemoving incomplete sequences ...')

    # ----------------------------------------- SORTS HITS BY PHYLOGENY
    # -----------------------------------------------------------------
    # sorts the hits by their phylogenetic relation to query
    # outputs final hit and query data to file directory
    print('\nSorting hits by Phylogeny ...')
    sort_by_phylogeny(query_species, query_subspecies, query_database, query_type)
def blast_it (input):
    """BLAST each gene of one genome against its neighbours and call clusters.

    ``input`` is indexed as:
      * ``input[0]`` - index into the list of FASTA paths,
      * ``input[1]`` - path of a CSV file whose first column lists the FASTA
        paths (reading stops at the first empty row),
      * ``input[2]`` - time string embedded in the output file names.

    For the selected FASTA file the genes are grouped by chromosome and sorted
    by location; every gene is BLASTed (blastp) against a GI list of its
    nearby genes; per-gene hit counts are filtered and thresholded into
    clusters; and three pickle files (gene table, BLAST summary, clusters)
    are written next to the FASTA file.

    Returns ``[name, 1, fasta_size, end_time, [name, genetable_f, hit_f,
    cluster_f]]`` on success and ``[name, error_string, 0, end_time, []]``
    when any exception was raised, where ``name`` is the second-to-last
    ``/``-separated component of the FASTA path.
    """
    try:
        t0 = datetime.datetime.now()
        # **** SETTINGS ****
        bitscore_cutoff = 100 # see calibration description in notes
        # find clusters using basic ordinal filter:
        # take max(min(max(left side neighbor genes), max(right side neighbor genes)), self)
        # then set hard cutoff (maybe >= 3 matches on edges, 6 on peak, 2 on length)
        # -> cuts off immediately once you're past the edge, but includes everything inside
        edge_thresh = 2.5 # must be greater than (e.g. 3 or more)
        peak_thresh = 5.5
        length_thresh = 1.5
        clust_extend = 0 # take N extra genes on either side just to show window
        # *********************
        # INPUT:
        sufi = input[0]
        sourcedir = input[1]
        timestr = input[2]
        # Obtain list of files
        suffix_f = open(sourcedir, 'rU')
        suffix_r = csv.reader(suffix_f)
        suffixpaths = []
        for row in suffix_r:
            if len(row) == 0:
                break
            suffixpaths.append(row[0])
            #print(row[0])
        suffix_f.close()
        suf = suffixpaths[sufi]
        # Strip a trailing '.fasta' so `suf` can serve as a common file prefix.
        if suf[-len('.fasta'):] == '.fasta':
            suf = suf[0:-len('.fasta')]
        # assemble list of genes on chromosome
        chromnames = []
        chromgenes = []
        reciter = SeqIO.parse(suf + '.fasta', 'fasta')
        for index, aarecord in enumerate(reciter):
            # Bracketed "[key=value]" fields of the description line.
            dparse = bracketparse(aarecord.description, '[]')
            newgeneobj = gene_obj()
            # gi = second '|'-separated field of the first whitespace token.
            newgeneobj.gi = aarecord.description.split(' ')[0].split('|')[1]
            for dp in dparse:
                if dp.split('=')[0] == 'chromosome':
                    newgeneobj.chromosome = dp.split('=')[1]
                    continue
                if dp.split('=')[0] == 'location':
                    newgeneobj.location = [int(bracketparse(dp.split('=')[1], '()')[0].split(',')[0]),
                                           int(bracketparse(dp.split('=')[1], '()')[0].split(',')[1])]
                    continue
                if dp.split('=')[0] == 'direction':
                    newgeneobj.direction = dp.split('=')[1]
                    continue
                if dp.split('=')[0] == 'description':
                    newgeneobj.description = '='.join(dp.split('=')[1:])
                    continue
                if dp.split('=')[0] == 'protein_id':
                    newgeneobj.ncbi_id = dp.split('=')[1]
                    continue
            if newgeneobj.chromosome not in chromnames:
                chromnames.append(newgeneobj.chromosome)
                chromgenes.append([])
            # NOTE(review): the gene is appended to the most recently created
            # chromosome list, so this presumably relies on records being
            # grouped by chromosome in the FASTA file - confirm.
            chromgenes[-1].append(newgeneobj)
        reciter.close()
        # sort genes by location
        for chromi in range(len(chromnames)):
            chromgenes[chromi].sort(key=lambda thsgi: thsgi.location[0])
        # assemble nearby genes
        neighbor_num = 10 #** move
        neighbor_wind = int(round(neighbor_num/2)) # window = 10000 could do this with nt
        #
        for chromi in range(len(chromgenes)):
            for ind, aaobj in enumerate(chromgenes[chromi]):
                #if suf.find('Streptomyces') == -1: # linear chromosome
                # gi values of the genes within neighbor_wind positions on
                # either side (clipped at the chromosome ends), self included.
                chromgenes[chromi][ind].nearls = [chromgenes[chromi][thsi].gi
                                                  for thsi in range(max([ind-neighbor_wind, 0]),
                                                                    min([ind+neighbor_wind+1, len(chromgenes[chromi])]))]
                #else: #circular chromosome
                #    chromgenes[chromi][ind].nearls = [chromgenes[chromi][thsi].gi
                #        for thsi in np.mod(range(ind-neighbor_wind,
                #            ind+neighbor_wind+1), len(chromgenes[chromi]))]
        # run local BLAST
        db_f = suf + 'BLASTdb'
        db_root = '/'.join(suf.split('/')[0:-1]) + '/'
        gis = []
        inds = []
        for chromi in range(len(chromgenes)):
            inds += zip([chromi]*len(chromgenes[chromi]), range(len(chromgenes[chromi])))
            gis += [thsgene.gi for thsgene in chromgenes[chromi]]
        # gi -> (chromosome index, gene index within the sorted chromosome)
        aadict = dict(zip(gis, inds))
        # Scratch files, rewritten for every gene and removed at the end.
        query_fname = db_root + 'query_active' + str(sufi) + '.fasta'
        gi_fname = db_root + 'girestrict_active' + str(sufi) + '.txt'
        out_fname = db_root + 'active_blast' + str(sufi) + '.xml'
        reciter = SeqIO.parse(suf + '.fasta', 'fasta')
        # One zero-filled array per chromosome, used as the template for the
        # per-gene summary arrays below.
        chrom_size_zeros = [[]]*len(chromgenes)
        for chromi in range(len(chromgenes)):
            chrom_size_zeros[chromi] = np.zeros(len(chromgenes[chromi]))
        median_hit_len = copy.deepcopy(chrom_size_zeros)
        hit_num = copy.deepcopy(chrom_size_zeros)
        pks_nrps_yn = copy.deepcopy(chrom_size_zeros)
        gi_nums = copy.deepcopy(chrom_size_zeros)
        for index, aarecord in enumerate(reciter):
            thsgi = aarecord.description.split(' ')[0].split('|')[1]
            thsgene = chromgenes[aadict[thsgi][0]][aadict[thsgi][1]]
            # Restrict the search to this gene's neighbourhood.
            f_gi = open(gi_fname, 'w')
            f_gi.write('\n'.join(thsgene.nearls))
            f_gi.write('\n')
            f_gi.close()
            query_f = open(query_fname, 'w')
            SeqIO.write(aarecord, query_f, 'fasta')
            query_f.close()
            blastp_cline = NcbiblastpCommandline(query=query_fname, db=db_f, evalue=0.1,
                                                 outfmt=5, out=out_fname, gilist=gi_fname,
                                                 dbsize=neighbor_num, searchsp=neighbor_num)
            stdout, stderr = blastp_cline()
            blast_hsp = SearchIO.read(out_fname, 'blast-xml')
            thsgene.alignls = []
            if blast_hsp.hits != []:
                for hits_i in blast_hsp:
                    for algns_i in hits_i:
                        if algns_i.bitscore > bitscore_cutoff:
                            thsalign = alignobj()
                            thsalign.query_range = algns_i.query_range
                            thsalign.hit_range = algns_i.hit_range
                            thsalign.evalue = algns_i.evalue
                            thsalign.bitscore = algns_i.bitscore
                            # Drop the first three characters of the hit id and
                            # left-pad the rest with zeros to nine characters.
                            thsalign.hit_gi = (9-len(algns_i.hit_id[3:]))*'0' + algns_i.hit_id[3:]
                            # reject full gene hit to itself
                            if thsalign.hit_gi == thsgene.gi and thsalign.hit_range == thsalign.query_range:
                                continue
                            thsgene.alignls.append(thsalign)
            # summary statistics
            if len(thsgene.alignls) > 0:
                median_hit_len[aadict[thsgi][0]][aadict[thsgi][1]] = (np.median(
                    np.array([thsa.hit_range[1] - thsa.hit_range[0] for thsa in thsgene.alignls])))
            hit_num[aadict[thsgi][0]][aadict[thsgi][1]] = len(thsgene.alignls)
            gi_nums[aadict[thsgi][0]][aadict[thsgi][1]] = thsgene.gi
            annotation_set = thsgene.description
            # Flag genes whose description mentions a synth(ase/etase) together
            # with polyketide, or with nonribosomal ... peptide.
            if (annotation_set.find('synth') >= 0 and
                ((annotation_set.find('polyketide') >= 0 or annotation_set.find('poly-ketide') >= 0) or
                 ((annotation_set.find('nonribosomal') >= 0 or annotation_set.find('non-ribosomal') >= 0) and
                  annotation_set.find('peptide') >= 0))):
                pks_nrps_yn[aadict[thsgi][0]][aadict[thsgi][1]] = 1.
            #if index % 200 == 0:
            #    print(suf.split('/')[-2] + ': ' + str(index))
        reciter.close()
        # ********* Identify Clusters *********
        clusters = []
        clusterft = {'ind': [], 'height': [], 'avg': [], 'medhitlen': [], 'len': [], 'pks_nrps': []}
        for chromi in range(len(chromgenes)):
            # 1. filter
            filter_counts = np.zeros(len(chromgenes[chromi]))
            for genei in range(len(chromgenes[chromi])):
                # Position of this gene inside its own neighbour list.
                neari = np.where(np.array(chromgenes[chromi][genei].nearls) == chromgenes[chromi][genei].gi)[0][0]
                # max( min( max(hits of left neighbours), max(hits of right
                # neighbours) ), own hits ); the "+[0.0]" keeps max() defined
                # when a side has no neighbours.
                filter_counts[genei] = max([min([max([hit_num[chromi][aadict[chromgenes[chromi][genei].nearls[ti]][1]]
                                                      for ti in range(neari)]+[0.0]),
                                                 max([hit_num[chromi][aadict[chromgenes[chromi][genei].nearls[ti]][1]]
                                                      for ti in range(neari+1, len(chromgenes[chromi][genei].nearls))]+[0.0])]),
                                            hit_num[chromi][genei]])
            # 2. break apart
            on = False
            proposed_cluster = []
            # NOTE(review): the last gene index is never visited, so a run that
            # is still "on" at the chromosome end is not closed - confirm this
            # is intended.
            for nhi in range(len(hit_num[chromi])-1):
                if on == False and filter_counts[nhi] > edge_thresh:
                    on = True
                    proposed_cluster.append(nhi)
                if on == True and filter_counts[nhi] < edge_thresh:
                    on = False
                    proposed_cluster.append(nhi)
                    # Keep the run only if it is long enough and its peak
                    # filtered count is high enough.
                    if (proposed_cluster[1] - proposed_cluster[0] > length_thresh and
                            max(filter_counts[proposed_cluster[0]:proposed_cluster[1]]) > peak_thresh):
                        #** adjust for circular
                        clust_rng = [max([proposed_cluster[0]-clust_extend, 0]),
                                     min([proposed_cluster[1]+clust_extend, len(chromgenes[chromi])])]
                        clusters.append([geneinfo for geneinfo in chromgenes[chromi][
                            clust_rng[0]:clust_rng[1]]])
                        clusterft['ind'].append([chromi, clust_rng])
                        clusterft['height'].append(max(hit_num[chromi][clust_rng[0]:clust_rng[1]]))
                        clusterft['avg'].append(np.mean(hit_num[chromi][clust_rng[0]:clust_rng[1]]))
                        clusterft['medhitlen'].append(np.median([item for sublist in
                            [[thsa.hit_range[1] - thsa.hit_range[0] for thsa in chromgenes[chromi][ind].alignls]
                             for ind in range(clust_rng[0], clust_rng[1])] for item in sublist]))
                        clusterft['len'].append(clust_rng[1] - clust_rng[0])
                        # 0. = no flagged genes, 1. = all flagged, 0.5 = mixed.
                        if sum(pks_nrps_yn[chromi][clust_rng[0]:clust_rng[1]] < 0.5) == clust_rng[1] - clust_rng[0]:
                            clusterft['pks_nrps'].append(0.)
                        elif sum(pks_nrps_yn[chromi][clust_rng[0]:clust_rng[1]] > 0.5) == clust_rng[1] - clust_rng[0]:
                            clusterft['pks_nrps'].append(1.)
                        else:
                            clusterft['pks_nrps'].append(0.5)
                    proposed_cluster = []
        # Several objects are dumped back-to-back into each pickle file; they
        # must be loaded in the same order.
        genetable_f = suf + '_genetable_blasted_' + timestr + '.pkl'
        out_blasted = open(genetable_f, 'wb')
        pickle.dump(chromgenes, out_blasted)
        pickle.dump(aadict, out_blasted)
        pickle.dump(chromnames, out_blasted)
        out_blasted.close()
        hit_f = suf + '_blast_summary_' + timestr + '.pkl'
        out_summary = open(hit_f, 'wb')
        # alignment length, in units of a.a.
        pickle.dump(median_hit_len, out_summary)
        pickle.dump(hit_num, out_summary)
        pickle.dump(gi_nums, out_summary)
        pickle.dump(pks_nrps_yn, out_summary)
        out_summary.close()
        cluster_f = suf + '_clusters_' + timestr + '.pkl'
        out_clusters = open(cluster_f, 'wb')
        pickle.dump(clusters, out_clusters)
        pickle.dump(clusterft, out_clusters)
        out_clusters.close()
        # Remove the scratch files.
        os.system('rm ' + query_fname)
        os.system('rm ' + gi_fname)
        os.system('rm ' + out_fname)
        ## Save summary figure
        #plt.figure(1, figsize=(10, 6))
        #plt.hold(True)
        #for chromi in range(len(hit_num)):
        #    plt.scatter(hit_num[chromi], median_hit_len[chromi], 40)
        #plt.xlabel('Number of Local Hits', fontsize=18.0)
        #plt.ylabel('Median Hit Length (a.a.)', fontsize=18.0)
        #plt.title(suf.split('/')[-2], fontsize=20.0)
        #plt.rc('xtick', labelsize=14.)
        #plt.rc('ytick', labelsize=14.)
        #plt.savefig(suf + '_scatter_' + timestr + '.png')
        t1 = datetime.datetime.now()
        logging.info('finished: ' + suf.split('/')[-2] + '; time: ' + str(t0) + ' to ' + str(t1) +
                     '(' + str(t1-t0) + ')')
        # output: name, error code, file size, time to run
        return [suf.split('/')[-2], 1, os.path.getsize(suf + '.fasta'), t1,
                [suf.split('/')[-2], genetable_f, hit_f, cluster_f]]
    except:
        #raise
        # NOTE(review): `suf` is only bound part-way through the try block, so
        # a failure before that point makes this handler itself raise
        # NameError. The scratch files are also left behind on failure.
        thserror = sys.exc_info()
        errorstr = 'Error: ' + str(thserror[0]) + ', ' + str(thserror[1])
        t1 = datetime.datetime.now()
        logging.debug('failed on: ' + suf.split('/')[-2] + '; time: ' + str(t0) + ' to ' + str(t1) +
                      '(' + str(t1-t0) + ')')
        logging.exception('')
        #raise
        return [suf.split('/')[-2], errorstr, 0, t1, []]
continue for each in read.cigar: if each[0] == 4 and each[1]/float(read.rlen) >= cutoff: # generating fasta file fa = open('temp.fa','w') print >> fa,'>'+read.qname print >> fa, read.seq fa.close() # run blat code = os.system('gfClient localhost 50000 '+sys.argv[1]+' temp.fa temp.psl >/dev/null 2>&1 ') if code != 0: print 'Execute gfClient failed!' sys.exit(1) try: blat = SearchIO.read('temp.psl','blat-psl') except: break hsps = blat.hsps hsps.sort(key=lambda k:k.score, reverse=True) if hsps[0].hit_id == bwa_bam.getrname(read.tid): # matching genomic coordinate if hsps[0].hit_start == read.pos or hsps[0].hit_end==read.aend: cigarstring, soft_len = psl2sam(hsps[0],blat.seq_len) if soft_len == 0: read.cigarstring, read.pos= cigarstring, hsps[0].hit_start break blat_bam.write(read) bwa_bam.close() blat_bam.close()
from Bio.Alphabet import IUPAC import re import sys #Get a contig sequence given contig name ############################## recs = SeqIO.index(genome, "fasta") def seq_for_contig(contig_name,recs): return recs[contig_name].seq ######################################################################## YR_blast_qresult = SearchIO.read(YR_xml, 'blast-xml') if len(YR_blast_qresult) == 0: raise Exception('No YR were found') #make fragments #print 'Preparing YR fragments' fragment_stores = [] for hit in YR_blast_qresult: hit_sequence = seq_for_contig(hit.id, recs) for hsp in hit.hsps: fragstart = 0 fragend = len(hit_sequence) if hsp.hit_start > extend: fragstart = hsp.hit_start - extend if fragend - hsp.hit_end > extend: fragend = hsp.hit_end + extend
def hits(self):
    """Return the parsed HMMER tabular results stored at ``self.out_path``.

    :returns: the result of ``SearchIO.read(self.out_path, 'hmmer3-tab')``
    :raises Exception: if ``self.out_path`` is not set, i.e. the search has
        not been run yet
    """
    if self.out_path:
        return SearchIO.read(self.out_path, 'hmmer3-tab')
    raise Exception("You can't access results from HMMER before running the algorithm.")
def blat_alignment(mapping, reference, scliplen_cutoff, lowqual_cutoff, min_percent_hq, mapq_cutoff, blat_ident_pct_cutoff, gfServer_port, hetero_factor, input, output):
    """Re-align soft-clipped reads of a BAM file with BLAT (via gfClient).

    Reads of ``input`` are copied to ``output`` (through a temporary BAM that
    is sorted and indexed with samtools). A read whose soft-clipped portion
    passes the length, base-quality and mapping-quality cutoffs may be sent to
    a gfServer; when the best BLAT HSP is on the same reference, shares a
    start or end coordinate with the original alignment and leaves no soft
    clip, the read's CIGAR string and position are replaced.

    :param mapping: when falsy, reads selected in the indel branches are
        written unchanged and not BLAT-aligned
    :param reference: mapping whose ``'blat'`` entry is passed to gfClient
    :param scliplen_cutoff: minimum soft-clip length / read length ratio
    :param lowqual_cutoff: base quality counted as high quality (>=)
    :param min_percent_hq: minimum fraction of high-quality soft-clipped bases
    :param mapq_cutoff: minimum mapping quality
    :param blat_ident_pct_cutoff: minimum BLAT identity, compared against
        ``ident_pct / 100``
    :param gfServer_port: port, concatenated into the gfClient command line
        (so it must be a string)
    :param hetero_factor: ``'a'`` sends every qualifying read to BLAT and
        skips the de novo FASTA; any other value is forwarded to
        ``prob_of_indel_with_error``
    :param input: path of the input BAM
    :param output: path of the output BAM; also the prefix of temporary files
    :returns: ``rlen`` of the first read of ``input``
    """
    bwa_bam = pysam.Samfile(input, 'rb')
    blat_bam = pysam.Samfile(output + '.temp.bam', 'wb', template=bwa_bam)
    # The de novo FASTA is only produced when hetero_factor is not 'a'.
    if hetero_factor != 'a':
        denovo = open(output+'.temp.fasta', 'w')
    # (chromosome, position) pairs already judged to be putative indel sites.
    putative_indel_cluster = set()
    for read in bwa_bam.fetch(until_eof=True):
        if read.is_secondary:  # secondary alignment
            continue
        if read.is_unmapped:
            # Unmapped reads go to the de novo FASTA and are not written to
            # the output BAM.
            if hetero_factor != 'a':
                print >> denovo, '>' + read.qname
                print >> denovo, read.seq
            continue
        soft_len, soft_qual, soft_pos = get_softclip_length(read)
        sclip_ratio = soft_len / float(read.rlen)
        # soft_pos == -1 is treated as "no usable soft clip".
        if soft_pos != -1:
            sclip_hq_ratio = len(soft_qual[soft_qual >= lowqual_cutoff]) / float(len(soft_qual))
        else:
            sclip_hq_ratio = 0
        if sclip_ratio >= scliplen_cutoff and sclip_hq_ratio >= min_percent_hq and read.mapq >= mapq_cutoff:
            blat_aln = False
            soft_chr = bwa_bam.getrname(read.rname)
            if hetero_factor == 'a':
                blat_aln = True
            elif (soft_chr, soft_pos) in putative_indel_cluster:
                blat_aln = True
                print >> denovo, '>' + read.qname
                print >> denovo, read.seq
                if not mapping:
                    blat_bam.write(read)
                    continue
            # estimate the probability of indels given the coverage and number of soft-clipping readss
            elif prob_of_indel_with_error(input, soft_chr, soft_pos, hetero_factor) < 0.05:
                putative_indel_cluster.add((soft_chr, soft_pos))
                blat_aln = True
                print >> denovo, '>' + read.qname
                print >> denovo, read.seq
                if not mapping:
                    blat_bam.write(read)
                    continue
            if blat_aln:
                # Write the single read to a temporary FASTA for gfClient.
                fa = open(output+'.temp.fa', 'w')
                print >> fa, '>' + read.qname
                print >> fa, read.seq
                fa.close()
                try:
                    subprocess.check_call('gfClient localhost ' + gfServer_port +' '+ reference['blat'] + ' ' + output + '.temp.fa ' + output + '.temp.psl >/dev/null 2>&1 ', shell=True)
                except subprocess.CalledProcessError as e:
                    print >> sys.stderr, 'Execution failed for gfClient:', e
                    sys.exit(1)
                # Any failure to parse the PSL is reported as "no BLAT hit";
                # the read is then written unchanged.
                try:
                    blat = SearchIO.read(output+'.temp.psl', 'blat-psl')
                    print >> sys.stderr, 'Blat aligned read:',read.qname
                except:
                    print >> sys.stderr, 'No blat hit for read:',read.qname
                    blat_bam.write(read)
                    continue
                # Best-scoring HSP first.
                hsps = blat.hsps
                hsps.sort(key=lambda k: k.score, reverse=True)
                if hsps[0].ident_pct / 100 >= blat_ident_pct_cutoff and hsps[0].hit_id == bwa_bam.getrname(read.tid):
                    # matching genomic coordinate
                    if hsps[0].hit_start == read.pos or hsps[0].hit_end == read.aend:
                        cigarstring, soft_len = psl2sam(hsps[0], blat.seq_len)
                        # Adopt the BLAT alignment only if it leaves no soft clip.
                        if soft_len == 0:
                            read.cigarstring, read.pos = cigarstring, hsps[0].hit_start
        blat_bam.write(read)
    bwa_bam.close()
    blat_bam.close()
    if hetero_factor != 'a':
        denovo.close()
    # Sort the temporary BAM, move it to the final path and index it.
    os.system('samtools sort ' + output + '.temp.bam ' + output + '.temp.sorted')
    os.system('mv ' + output + '.temp.sorted.bam ' + output)
    os.system('samtools index ' + output)
    # Read length of the first read of the input BAM.
    bwa_bam = pysam.Samfile(input, 'rb')
    readlen = bwa_bam.next().rlen
    bwa_bam.close()
    return readlen
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Print the single query result of a HMMER3 text report.
# Usage: pass the path of the report as the first command-line argument.
import sys
import os
import collections

from Bio import SearchIO

# Parse the report named on the command line and print the parsed result.
print(SearchIO.read(sys.argv[1], 'hmmer3-text'))
# - blast XML result # Output: # - STDOUT ##### ## MAIN ##### parser = argparse.ArgumentParser() parser.add_argument('xml', help="Reference genome in fasta") # read stdin if file not provided args = parser.parse_args() blast_qresult = SearchIO.read(args.xml, 'blast-xml') XMLname = splitext(basename(args.xml))[0] nameParts = XMLname.split("_") # INDEL related data chrom = nameParts[0] start = nameParts[1] len_indel = nameParts[4][3:] if (len(blast_qresult)): num_hits = len(blast_qresult) # HIT related data
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SearchIO

# Scratch script: blastn temp.fasta against the humiso database, then
# convert the XML report to tabular format.
humdb = "/mithril/Data/Pacbio/Aligned/151019_proc/blast/humiso_blast"

# blastn options, gathered in one place before building the command line
blastn_options = {
    'query': "temp.fasta",
    'db': humdb,
    'gapopen': 1,
    'gapextend': 2,
    'word_size': 9,
    'reward': 1,
    'evalue': 10,
    'outfmt': 5,
    'out': "try.xml",
}
blastn_cline = NcbiblastnCommandline(**blastn_options)
stdout, stderr = blastn_cline()

# Re-read the XML report and write it back out as a BLAST tabular file
bres = SearchIO.read("try.xml", 'blast-xml')
SearchIO.write(bres, 'try.tsv', 'blast-tab')
##ok - this was nice, but can't output because blast is pairwise, and I think we actually want a MAF
This script takes quite a long time and often interrupts, due to NCBI not responding in the alloted time. Has to be restarted several times when 350 sequences were analyzed, removing the sequences for which the results were obtained. Run as: python blast_annotation.py proteins.faa > output.tab """ from Bio import SeqIO from Bio import SearchIO from Bio.Blast import NCBIWWW from sys import argv sequences = open(argv[1], 'r') for sequence in SeqIO.parse(sequences, "fasta"): result_handle = NCBIWWW.qblast("blastp", "nr", str(sequence.seq), hitlist_size=10, expect=1e-03) save_file = open("my_blast.xml", "w") save_file.write(result_handle.read()) save_file.close() result_handle.close() blast_qresult = SearchIO.read('my_blast.xml', 'blast-xml') for i in range(0, len(blast_qresult)): blast_hsp = blast_qresult[i][0] evalue = blast_hsp.evalue desc = blast_hsp.hit_description hit_id = blast_hsp.hit_id print sequence.id, '\t', evalue, '\t', hit_id, '\t', desc