# For more information: http://emboss.sourceforge.net/download/ Entrez.email = "*****@*****.**" if __name__ == "__main__": with open('data/data.txt') as dataset: ids = dataset.read().split() handle = Entrez.efetch(db = 'nucleotide', id = ids, rettype = "fasta") records = list(SeqIO.parse(handle, 'fasta')) for i, r in enumerate(records): with open(ids[i], 'w') as f: SeqIO.write(r, f, 'fasta') needle_cline = NeedleCommandline() needle_cline.asequence = ids[0] needle_cline.bsequence = ids[1] needle_cline.outfile = "rosalind_need_output.txt" needle_cline.gapopen = 10 needle_cline.gapextend = 1 needle_cline.endopen = 10 needle_cline.endextend = 1 needle_cline.endweight = True needle_cline() with open('rosalind_need_output.txt') as f: output = f.readlines() for line in output: if 'Score:' in line:
def needle_alignment_emboss(s1, s2):
    """Globally align two sequences with EMBOSS ``needle``.

    Both sequences are passed inline to ``needle`` through the ``asis:``
    prefix and the alignment is read back from the tool's standard output.

    Args:
        s1: First sequence as a plain string.
        s2: Second sequence as a plain string.

    Returns:
        The alignment returned by ``AlignIO.read(..., "emboss")``.
    """
    import subprocess
    from Bio.Emboss.Applications import NeedleCommandline
    from Bio import AlignIO

    cline = NeedleCommandline(auto=True, sprotein=True, stdout=True,
                              gapopen=10, gapextend=1)
    cline.asequence = "asis:" + s1
    cline.bsequence = "asis:" + s2

    # NOTE(review): the sequences end up inside a shell command string;
    # callers should only pass trusted sequence text.
    # The context manager closes the pipe and waits for the child, so no
    # zombie process or open file descriptor is left behind.
    with subprocess.Popen(str(cline), shell=True, stdout=subprocess.PIPE,
                          universal_newlines=True) as process:
        return AlignIO.read(process.stdout, "emboss")
def needle_alignment(s1, s2):
    '''
DESCRIPTION

    Does a Needleman-Wunsch Alignment of sequence s1 and s2 (passed inline
    to EMBOSS needle with the "asis:" prefix) and returns the alignment
    object parsed by Bio.AlignIO from the tool's standard output.
    '''
    import subprocess
    from Bio.Emboss.Applications import NeedleCommandline
    from Bio import AlignIO

    cline = NeedleCommandline(auto=True, sprotein=True, stdout=True,
                              gapopen=10, gapextend=1)
    cline.asequence = "asis:" + s1
    cline.bsequence = "asis:" + s2

    # universal_newlines=True makes the pipe a text stream; the EMBOSS
    # parser works on text lines, not bytes.
    process = subprocess.Popen(str(cline), shell=True,
                               stdout=subprocess.PIPE,
                               universal_newlines=True)
    try:
        return AlignIO.read(process.stdout, "emboss")
    finally:
        # Always release the pipe and reap the child, even if parsing fails.
        process.stdout.close()
        process.wait()
def test_needle_needs_output(self):
    """needle without an output file, stdout or filter must be rejected."""
    cline = NeedleCommandline(
        cmd=exes["needle"],
        asequence="asis:ACCCGGGCGCGGT",
        bsequence="asis:ACCCGAGCGCGGT",
        gapopen=10,
        gapextend=0.5,
        auto=True,
    )
    # Only -auto is switched on; no output destination has been chosen.
    self.assertTrue(cline.auto)
    self.assertFalse(cline.stdout)
    self.assertFalse(cline.filter)
    self.assertEqual(cline.outfile, None)
    # Rendering the command line is what triggers the validation error.
    with self.assertRaises(ValueError):
        str(cline)
def main():
    """Align the sequences of two GenBank ids given on the command line.

    Expects exactly two arguments. Each id is passed to ``getFasta`` and the
    results are given to EMBOSS needle, which writes
    ``<id1>_<id2>_needle.txt``. Exits with status 1 on a usage error and 0
    after needle has run.
    """
    argv = sys.argv
    if len(argv) != 3:
        print('usage {0:s} genbank_id1 genbank_id2'.format(argv[0]))
        sys.exit(1)

    genbank_a, genbank_b = argv[1], argv[2]
    sequence_a = getFasta(genbank_a)
    sequence_b = getFasta(genbank_b)

    report_name = '{0:s}_{1:s}_needle.txt'.format(genbank_a, genbank_b)
    needle_cline = NeedleCommandline('needle',
                                     asequence=sequence_a,
                                     bsequence=sequence_b,
                                     gapopen=10,
                                     gapextend=1,
                                     endweight=True,
                                     endopen=10,
                                     endextend=1,
                                     outfile=report_name)
    stdout, stderr = needle_cline()
    sys.exit(0)
def _needle(fa, fb, needlefile, a, b, results):
    """
    Run a single needle job on files `fa` and `fb`, then append one
    tab-separated line (a, b, identity, score) to `results`.
    The two inputs and the needle output file are shredded afterwards.
    """
    from Bio.Emboss.Applications import NeedleCommandline

    command = NeedleCommandline(
        asequence=fa,
        bsequence=fb,
        gapopen=10,
        gapextend=0.5,
        outfile=needlefile,
    )
    stdout, stderr = command()

    # Parse the report before the files are removed.
    header = NeedleHeader(needlefile)
    FileShredder([fa, fb, needlefile], verbose=False)

    fields = (a, b, header.identity, header.score)
    results.extend(["\t".join(fields)])
def needle(args):
    """
    %prog needle pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them.
    """
    # The docstring above doubles as the OptionParser usage text, so the
    # implementation notes live in comments instead.
    #
    # For every whitespace-separated pair of ids in the pairs file, the two
    # records are written to temporary FASTA files, aligned with EMBOSS
    # needle, and one tab-separated line (id_a, id_b, identity, score) is
    # written to stdout. Temporary files are shredded after each pair.
    from Bio.Emboss.Applications import NeedleCommandline
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.base import FileShredder

    p = OptionParser(needle.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pairsfile, apep, bpep = args
    afasta = Fasta(apep)
    bfasta = Fasta(bpep)

    afile = pairsfile + "_a.fasta"
    bfile = pairsfile + "_b.fasta"
    needlefile = pairsfile + "_ab.needle"

    # 'with' guarantees the pairs file is closed (the original leaked it).
    with open(pairsfile) as fp:
        for row in fp:
            # Parse the row first so a malformed line cannot leave the
            # temporary FASTA handles open.
            a, b = row.split()
            a = afasta[a]
            b = bfasta[b]

            with open(afile, "w") as fa:
                SeqIO.write([a], fa, "fasta")
            with open(bfile, "w") as fb:
                SeqIO.write([b], fb, "fasta")

            needle_cline = NeedleCommandline(asequence=afile,
                                             bsequence=bfile,
                                             gapopen=10, gapextend=0.5,
                                             outfile=needlefile)
            stdout, stderr = needle_cline()
            # write() instead of the print statement: same output, and
            # valid on both Python 2 and Python 3.
            sys.stderr.write(stdout + stderr + "\n")

            nh = NeedleHeader(needlefile)
            sys.stdout.write(
                "\t".join((a.id, b.id, nh.identity, nh.score)) + "\n")
            FileShredder([afile, bfile, needlefile])
def needle_align(fa1, fa2, gapopen = 10.0, gapextend = 0.5):
    '''
    Pairwise-align two FASTA files with EMBOSS needle.

    fa1, fa2: filenames of the FASTA files to align; both must already
              exist on disk when the command runs.
    gapopen: gap open penalty [default = 10.0]
    gapextend: gap extend penalty [default = 0.5]

    needle is told to write FASTA-formatted output to /dev/stdout, and the
    captured text is parsed with AlignIO.

    Returns the alignment object read by AlignIO.read.
    '''
    options = dict(
        asequence=fa1,
        bsequence=fa2,
        outfile='/dev/stdout',
        aformat='fasta',
        gapopen=gapopen,
        gapextend=gapextend,
    )
    needle_cmd = NeedleCommandline(**options)

    # Element 0 of the call result is the captured standard output.
    run_result = needle_cmd()
    fasta_text = run_result[0]
    return AlignIO.read(StringIO(fasta_text), format='fasta')
def cluster_seq_support_nw(seq_dict, ident_thresh=0.90):
    """Count, for every sequence, how many sequences are near-identical to it.

    Every pair of records in `seq_dict` is globally aligned with EMBOSS
    needle (gap open 10, gap extend 0.5). Pairwise identity is the number of
    matching alignment columns divided by the alignment length.

    Args:
        seq_dict: mapping of identifier -> record; each record must expose
            a ``.seq`` attribute.
        ident_thresh: identity a pair must exceed (strictly) to count as
            support.

    Returns:
        dict mapping each identifier to the number of sequences whose
        identity to it is above `ident_thresh`. The diagonal of the identity
        matrix is 1.0, so a sequence counts itself whenever
        ``ident_thresh < 1``.
    """
    # list() so the pairs can be indexed on Python 3, where items() returns
    # a non-subscriptable view.
    items = list(seq_dict.items())
    ident_matrix = np.identity(len(items))

    # NOTE(review): the needle output path is fixed, so concurrent calls
    # sharing TEMP_DIR would overwrite each other's results.
    needle_out = TEMP_DIR + "/needle.txt"

    for ind1, (_, sr1) in enumerate(items):
        # Only the lower triangle is computed; the matrix is symmetric.
        for ind2, (_, sr2) in enumerate(items[:ind1]):
            needle_cline = NeedleCommandline(asequence="asis::" + sr1.seq,
                                             bsequence="asis::" + sr2.seq,
                                             gapopen=10, gapextend=0.5,
                                             outfile=needle_out)
            stdout, stderr = needle_cline()
            align = AlignIO.read(needle_out, "emboss")

            l1 = align[0].seq
            l2 = align[1].seq
            matches = sum(aa1 == aa2 for aa1, aa2 in zip(l1, l2))
            identity = matches / float(len(l1))

            ident_matrix[ind1, ind2] = identity
            ident_matrix[ind2, ind1] = identity

    # Crude clustering: support = number of entries in the row above the
    # threshold.
    support = dict()
    for i, (gi, _) in enumerate(items):
        support[gi] = int(np.count_nonzero(ident_matrix[i] > ident_thresh))
    return support
def call_emboss(emboss_tool, aseq, bseq, outfile):
    """Run an EMBOSS pairwise aligner and write its report to `outfile`.

    Args:
        emboss_tool: command name or path of the executable. It must contain
            'needle' (global alignment) or 'water' (local alignment);
            'needle' is checked first.
        aseq: first sequence argument passed to the tool.
        bseq: second sequence argument passed to the tool.
        outfile: path of the output file the tool should write.

    Returns:
        None.

    Raises:
        ValueError: if `emboss_tool` names neither needle nor water.
    """
    if 'needle' in emboss_tool:
        # global alignment
        commandline_cls = NeedleCommandline
    elif 'water' in emboss_tool:
        # local alignment
        commandline_cls = WaterCommandline
    else:
        # Previously this fell through and failed with an opaque
        # UnboundLocalError on `tool`.
        raise ValueError(
            "Unsupported EMBOSS tool %r: expected a 'needle' or 'water' "
            "executable" % (emboss_tool,))

    tool = commandline_cls(emboss_tool,
                           asequence=aseq,
                           bsequence=bseq,
                           gapopen=10,
                           gapextend=0.5,
                           outfile=outfile)
    stdout, stderr = tool()
    return None
def run_needle(needle_exe,aseq,bseq,outfile):
    """
    Run EMBOSS needle on two sequence files with gap open 10 and gap
    extend 0.5.

    Args:
        needle_exe (str): path to EMBOSS needle executable
        aseq (str): path to first sequence file
        bseq (str): path to second sequence file
        outfile (str): path to the output file to generate

    Returns:
        None; the alignment is left in ``outfile``.
    """
    needle_options = {
        "asequence": aseq,
        "bsequence": bseq,
        "gapopen": 10,
        "gapextend": 0.5,
        "outfile": outfile,
    }
    run = NeedleCommandline(needle_exe, **needle_options)
    # The captured output streams are not used by callers.
    stdout, stderr = run()
    return None
def test_needle_piped(self):
    """needle on two inline ("asis") sequences, alignment piped to stdout."""
    needle_exe = exes["needle"]
    cline = NeedleCommandline(
        cmd=needle_exe,
        asequence="asis:ACCCGGGCGCGGT",
        bsequence="asis:ACCCGAGCGCGGT",
        gapopen=10,
        gapextend=0.5,
        auto=True,
        filter=True,
    )
    expected_command = " ".join(
        [
            needle_exe,
            "-auto",
            "-filter",
            "-asequence=asis:ACCCGGGCGCGGT",
            "-bsequence=asis:ACCCGAGCGCGGT",
            "-gapopen=10",
            "-gapextend=0.5",
        ]
    )
    self.assertEqual(str(cline), expected_command)

    # Launch the tool; on Windows the command is run without a shell.
    use_shell = sys.platform != "win32"
    child = subprocess.Popen(
        str(cline),
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        shell=use_shell,
    )
    child.stdin.close()

    # The piped output must parse as a two-row EMBOSS alignment.
    align = AlignIO.read(child.stdout, "emboss")
    self.assertEqual(len(align), 2)
    first, second = align[0], align[1]
    self.assertEqual(first.seq, "ACCCGGGCGCGGT")
    self.assertEqual(second.seq, "ACCCGAGCGCGGT")

    # Nothing on stderr and a clean exit status.
    self.assertEqual(child.stderr.read(), "")
    self.assertEqual(0, child.wait())
    child.stdout.close()
    child.stderr.close()
def align_genbank(self, query, frames):
    """Aligns the chosen GenBank record to the query sequence with the
    'needle' program in EMBOSS.

    Args:
        query: record whose ``.seq`` is aligned against the GenBank
            sequence file (``self.genbank_seq.name``).
        frames: pair of BLAST hit frames, integers from
            {-3, -2, -1, 1, 2, 3}.

    The alignment is written to ``self.needle_out.name``; nothing is
    returned.
    """
    # First, we check the frames of the hits from the BLAST record. If
    # they are different, then we have to RC one of the sequences. We
    # choose to RC the query, since it is much simpler, and will not
    # introduce errors. If the frames have the same sign, don't RC; if
    # they differ, RC.
    #
    # The original test `frames[0] > 0 and frames[1] > 0` also
    # reverse-complemented when BOTH frames were negative, contradicting
    # the rule above; compare the signs directly instead.
    same_orientation = (frames[0] > 0) == (frames[1] > 0)
    if same_orientation:
        snp_seq = query.seq
    else:
        snp_seq = query.seq.reverse_complement()

    # Build the 'needle' command
    needle_cmd = NeedleCommandline(asequence=self.genbank_seq.name,
                                   bsequence='asis:' + str(snp_seq),
                                   gapopen=self.gapopen,
                                   gapextend=self.gapextend,
                                   outfile=self.needle_out.name)
    # Run it!
    needle_cmd()
    return
def needle(*id, gop=10, gex=0.5, out='emb.aln'):
    """Global alignment with the Needleman-Wunsch method (EMBOSS needle).

    The first id is written to 'seqa.fas' and the remaining ids to
    'seqb.fas' via ``mkfasx``; needle then aligns the two files and writes
    its report to `out`.

    Args:
        *id: sequence identifiers; the first is the reference, the rest are
            aligned against it.
        gop: gap open penalty.
        gex: gap extension penalty.
        out: path of the needle output file.

    Returns:
        The parsed alignment when fewer than three ids are given (a single
        pairwise alignment); otherwise None, and the results are only
        available in `out`.
    """
    tmp_files = ('seqa.fas', 'seqb.fas')
    try:
        mkfasx(tmp_files[0], id[0])
        mkfasx(tmp_files[1], *id[1:])
        needle_cline = NeedleCommandline(asequence=tmp_files[0],
                                         bsequence=tmp_files[1],
                                         gapopen=gop, gapextend=gex,
                                         outfile=out)
        stdout, stderr = needle_cline()
    finally:
        # Remove the temporary FASTA files even when needle fails.
        for path in tmp_files:
            if os.path.exists(path):
                os.remove(path)

    if len(id) < 3:
        return AlignIO.read(out, "emboss")
    return None
def alignment_filter(seqs, template, gapopen=10, gapextend=0.5,
                     lo_cutoff=300, hi_cutoff=1000, cleanup=True):
    """Filter `seqs` by their EMBOSS needle alignment against `template`.

    The sequences are written to a temporary file, aligned against the
    inline template, and the resulting alignments are handed to
    ``cull_alignments`` together with the two cutoffs.

    Args:
        seqs: sized collection of sequence records to filter.
        template: template sequence, passed inline to needle.
        gapopen: gap open penalty.
        gapextend: gap extension penalty.
        lo_cutoff: lower cutoff forwarded to ``cull_alignments``.
        hi_cutoff: upper cutoff forwarded to ``cull_alignments``.
        cleanup: remove the temporary files when True.

    Returns:
        Whatever ``cull_alignments`` returns (a sized collection).
    """
    text_logger = logging.getLogger(__name__+'.text_logger')
    text_logger.info('Started alignment-based filtering')
    start_n_seqs = len(seqs)

    # Save the sequences as a temporary file.
    # NOTE(review): the file is named *.fa but written in 'fastq' format,
    # and the name is fixed, so concurrent runs would collide - confirm.
    seqs_f_name = 'tempseq.fa'
    with open(seqs_f_name, 'w') as sh:
        SeqIO.write(seqs, sh, 'fastq')

    # Generate alignment command, run the alignment.
    # %s (not %i) so fractional penalties such as 0.5 are not logged as 0.
    text_logger.info("""Began EMBOSS needle routine with settings:\ngapopen: %s\ngapextend: %s\nlo_cutoff: %i\nhi_cutoff: %i""",
                     gapopen, gapextend, lo_cutoff, hi_cutoff)
    ofilen = 'temp_'+str(uuid.uuid4())+'.needle'
    needle_cline = NeedleCommandline(asequence='asis::{}'.format(template),
                                     bsequence=seqs_f_name,
                                     gapopen=gapopen,
                                     gapextend=gapextend,
                                     outfile=ofilen)
    needle_cline()
    text_logger.info('Finished EMBOSS needle routine')

    # Parse inside a 'with' so the handle is closed before the file is
    # deleted (the original left it open, which also breaks os.remove on
    # Windows).
    with open(ofilen) as aln_handle:
        aln_data = AlignIO.parse(aln_handle, "emboss")
        new_seqs = cull_alignments(aln_data, lo_cutoff=lo_cutoff,
                                   hi_cutoff=hi_cutoff)

    # Exit routine
    if cleanup:
        text_logger.info('Cleaning up temp files')
        os.remove(seqs_f_name)
        os.remove(ofilen)
    text_logger.info("""Finished alignment-based filtering. Kept %i of %i sequences.""",
                     len(new_seqs), start_n_seqs)
    return new_seqs
def global_align(self, aseq, bseq):
    """
    Perform a global alignment using EMBOSS needle with input string
    sequences aseq and bseq.

    needle writes its report to a temporary file, which is parsed with
    ``self.read_needle_out`` and deleted when the function returns.

    Returns the value of ``self.read_needle_out`` on success, or
    ``('', '', '')`` when needle exits with a non-zero status.

    TODO: maybe combine with Query.global_align
    """
    with NamedTemporaryFile(mode='w+') as temp:
        needle_cline = NeedleCommandline(asequence="asis:"+aseq,
                                         bsequence="asis:"+bseq,
                                         datafile='EBLOSUM62',
                                         gapopen=self.needle_gapopen,
                                         gapextend=self.needle_gapextend,
                                         outfile=temp.name)
        child = subprocess.Popen(str(needle_cline), shell=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
        # communicate() drains both pipes while waiting; wait() alone can
        # deadlock once the child fills an OS pipe buffer.
        child.communicate()
        ret = child.returncode
        if ret == 0:
            # needle wrote through its own handle; rewind ours to read it.
            temp.seek(0)
            needle_res = self.read_needle_out(temp.readlines())
        else:
            print('ERROR: Non-zero return code from needle alignment (generate)')
            needle_res = ('', '', '')
    return needle_res
def test_needle_file(self):
    """needle with the asis trick, output to a file."""
    # Setup,
    cline = NeedleCommandline(cmd=exes["needle"])
    cline.set_parameter("-asequence", "asis:ACCCGGGCGCGGT")
    cline.set_parameter("-bsequence", "asis:ACCCGAGCGCGGT")
    cline.set_parameter("-gapopen", "10")
    cline.set_parameter("-gapextend", "0.5")
    # EMBOSS would guess this, but let's be explicit:
    cline.set_parameter("-snucleotide", "True")
    cline.set_parameter("-outfile", "Emboss/temp with space.needle")
    # repr() must round-trip to an equivalent command line.
    self.assertEqual(str(eval(repr(cline))), str(cline))
    # Run the tool; text-mode pipes so the output compares against str.
    child = subprocess.Popen(str(cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform!="win32"))
    out, err = child.communicate()
    return_code = child.returncode
    # Check it worked,
    errors = err.strip()
    self.assertTrue(errors.startswith("Needleman-Wunsch global alignment"),
                    errors)
    self.assertEqual(out.strip(), "")
    if return_code != 0:
        # Show the failing command line to help debugging.
        sys.stderr.write("\n%s\n" % cline)
    self.assertEqual(return_code, 0)
    filename = cline.outfile
    self.assertTrue(os.path.isfile(filename))
    # Check we can parse the output; close the handle before deleting.
    with open(filename) as handle:
        align = AlignIO.read(handle, "emboss")
    self.assertEqual(len(align), 2)
    self.assertEqual(str(align[0].seq), "ACCCGGGCGCGGT")
    self.assertEqual(str(align[1].seq), "ACCCGAGCGCGGT")
    # Clean up,
    os.remove(filename)
def get_hist_ss(test_seq, type='Unknown', debug=0):
    """Map template secondary-structure elements onto a histone sequence.

    All positions are 0-based indices into the sequence (first residue has
    number 0), NOT PDB residue numbers.

    Args:
        test_seq: query sequence; it is wrapped in a SeqRecord and written
            to a temporary FASTA file.
        type: 'Unknown' to identify the histone type with a local blastp
            search against the four templates, otherwise one of the
            template keys 'H3', 'H4', 'H2A', 'H2B'.
        debug: when truthy, intermediate results are printed.

    Returns:
        (hist_identified, ss_test): the template key used, and a dict
        mapping each element name of that template to [start, end] in
        test_seq, or [-1, -1] when the element could not be placed.

    Side effects: writes temporary files under CONFIG.TEMP_DIR, runs
    makeblastdb/blastp (only for type 'Unknown') and EMBOSS needle, and
    deletes the temporary files with a shell ``rm``.
    """
    #Let's define 1kx5 sequences
    templ_H3 = Seq(
        "ARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRPGTVALREIRRYQKSTELLIRKLPFQRLVREIAQDFKTDLRFQSSAVMALQEASEAYLVALFEDTNLCAIHAKRVTIMPKDIQLARRIRGERA",
        IUPAC.protein)
    templ_H4 = Seq(
        "SGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKRISGLIYEETRGVLKVFLENVIRDAVTYTEHAKRKTVTAMDVVYALKRQGRTLYGFGG",
        IUPAC.protein)
    templ_H2A = Seq(
        "SGRGKQGGKTRAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERVGAGAPVYLAAVLEYLTAEILELAGNAARDNKKTRIIPRHLQLAVRNDEELNKLLGRVTIAQGGVLPNIQSVLLPKKTESSKSKSK",
        IUPAC.protein)
    templ_H2B = Seq(
        "AKSAPAPKKGSKKAVTKTQKKDGKKRRKTRKESYAIYVYKVLKQVHPDTGISSKAMSIMNSFVNDVFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK",
        IUPAC.protein)
    #'element_name':[start,stop]; start and stop are inclusive, as in a PDB file.
    #Numbering differs between symmetrical chains and between 1kx5 and 1aoi.
    #We simply take the minimum length of alpha helices over all chains in 1kx5.
    #1 is subtracted from the PDB values, because these values are array
    #indices starting from 0.
    #Docking domain: amino acids 80 – 119 in the paper by Luger (1aoi);
    #however, in the JMB paper we defined it as 80-118, probably to be at the
    #trypsin cleavage site KK, so we stick with that here. HistoneDB uses the
    #Luger convention (albeit with a bug - it started with 81 - that has been
    #fixed in code now).
    ss_templ_H3 = {
        'alphaN': [43, 56],
        'alpha1': [62, 76],
        'alpha2': [84, 113],
        'alpha3': [119, 130],
        'loopL1': [78, 83],
        'loopL2': [114, 118],
        'beta1': [82, 83],
        'beta2': [117, 118],
        'mgarg1': [62, 62],
        'mgarg2': [82, 82],
        'mgarg3': [48, 48]
    }
    ss_templ_H4 = {
        'alpha1ext': [23, 28],
        'alpha1': [29, 40],
        'alpha2': [48, 75],
        'alpha3': [81, 92],
        'loopL1': [41, 47],
        'loopL2': [76, 81],
        'beta1': [44, 45],
        'beta2': [79, 80],
        'beta3': [95, 97],
        'mgarg1': [44, 44]
    }
    #New definition of docking domains as in Suto Luger 2000 (an earlier
    #version used 'docking domain':[91,107] plus 'docking tail':[108,116]).
    ss_templ_H2A = {
        'alpha1ext': [15, 21],
        'alpha1': [25, 36],
        'alpha2': [45, 72],
        'alpha3': [78, 88],
        'alpha3ext': [89, 96],
        'loopL1': [37, 44],
        'loopL2': [73, 77],
        'beta1': [41, 42],
        'beta2': [76, 77],
        'beta3': [99, 101],
        'docking domain': [80, 118],
        'mgarg1': [41, 41],
        'mgarg2': [76, 76]
    }
    ss_templ_H2B = {
        'alpha1': [33, 45],
        'alpha2': [51, 80],
        'alpha3': [86, 98],
        'alphaC': [99, 119],
        'loopL1': [46, 50],
        'loopL2': [81, 85],
        'beta1': [49, 50],
        'beta2': [84, 85],
        'mgarg1': [29, 29]
    }
    ss_templ = {
        'H3': ss_templ_H3,
        'H4': ss_templ_H4,
        'H2A': ss_templ_H2A,
        'H2B': ss_templ_H2B
    }
    templ = {
        'H3': templ_H3,
        'H4': templ_H4,
        'H2A': templ_H2A,
        'H2B': templ_H2B
    }
    #Let's use blast and see what histone is our query
    my_records = [
        SeqRecord(templ_H3, id='H3', name='H3'),
        SeqRecord(templ_H4, id='H4', name='H4'),
        SeqRecord(templ_H2A, id='H2A', name='H2A'),
        SeqRecord(templ_H2B, id='H2B', name='H2B')
    ]
    # Random names keep the temporary files of separate calls apart.
    n1 = str(uuid.uuid4())
    n2 = str(uuid.uuid4())
    faa_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".faa")
    fastan2_file = os.path.join(CONFIG.TEMP_DIR, n2 + ".fasta")
    fastan1_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".fasta")
    db_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db")
    xml_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".xml")
    txt_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".txt")
    phr_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db.phr")
    pin_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db.pin")
    psq_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db.psq")
    SeqIO.write([SeqRecord(test_seq, id='Query', name='Query')], fastan2_file, 'fasta')
    if (type == 'Unknown'):
        # Build a small protein BLAST database from the four templates and
        # search it with the query.
        SeqIO.write(my_records, faa_file, "fasta")
        os.system('makeblastdb -dbtype prot -in %s -out %s > /dev/null' % (faa_file, db_file))
        blastp_cline = NcbiblastpCommandline(query=fastan2_file, db=db_file, evalue=100, outfmt=5, out=xml_file)
        stdout, stderr = blastp_cline()
        # NOTE(review): this file handle is never closed explicitly.
        blast_record = NCBIXML.read(open(xml_file, 'r'))
        sname = list()
        evalue = list()
        hsp_list = list()
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                sname.append(alignment.title)
                evalue.append(hsp.expect)
                hsp_list.append(hsp)
        # The hit with the smallest e-value wins; the template key is taken
        # from the second whitespace-separated word of its title.
        hist_identified = sname[evalue.index(min(evalue))].split()[1]
        hsp = hsp_list[evalue.index(min(evalue))]
    else:
        hist_identified = type
    if (debug): print('Most likely this is histone:')
    if (debug): print(hist_identified)
    if (debug): print('Blast alignment')
    #We need to determine secondary structure according to the template using the alignment
    SeqIO.write([
        SeqRecord(
            templ[hist_identified], id=hist_identified, name=hist_identified)
    ], fastan1_file, 'fasta')
    #Now we will redo it with Needleman-Wunsch - the global alignment
    needle_cline = NeedleCommandline(asequence=fastan1_file, bsequence=fastan2_file, gapopen=20, gapextend=1, outfile=txt_file)
    stdout, stderr = needle_cline()
    align = AlignIO.read(txt_file, "emboss")
    if (debug): print(align)
    #Now we will get correspondence
    ss_test = dict()
    hist = templ[hist_identified]
    # corrsp_hist[j] = alignment column holding the j-th template residue
    # (k counts the gap characters seen so far in the template row).
    corrsp_hist = list(range(len(hist)))
    k = 0
    for a, i in zip(align[0], range(len(align[0]))):
        if (a == '-'):
            k = k + 1
        else:
            corrsp_hist[i - k] = i
    if (debug): print(corrsp_hist)
    # corrsp_test[j] = alignment column holding the j-th query residue.
    corrsp_test = list(range(len(test_seq)))
    k = 0
    for a, i in zip(align[1], range(len(align[1]))):
        if (a == '-'):
            k = k + 1
        else:
            corrsp_test[i - k] = i
    if (debug): print(corrsp_test)
    for key, value in ss_templ[hist_identified].items():
        if (debug): print('Checking %s' % key)
        start_in_aln = corrsp_hist[value[0]]
        if (debug): print('Start in aln %d' % start_in_aln)
        end_in_aln = corrsp_hist[value[1]]
        if (debug): print('End in aln %d' % end_in_aln)
        # Move the start to the right, one column at a time, until a column
        # that holds a query residue is found; .index() raises when the
        # column is absent from corrsp_test.
        # NOTE(review): the bare except also hides unrelated errors.
        for k in range(len(align[0])):
            try:
                start_in_test_seq = corrsp_test.index(start_in_aln + k)
            except:
                start_in_test_seq = -1
                if (debug): print("Trying to move start"),
                continue
            break
        # Same search for the end, moving to the left.
        for k in range(len(align[0])):
            try:
                end_in_test_seq = corrsp_test.index(end_in_aln - k)
            except:
                end_in_test_seq = -1
                if (debug): print('Trying to move end'),
                continue
            break
        # Elements that could not be placed, or whose ends crossed, are
        # reported as [-1, -1].
        if ((start_in_test_seq == -1) | (end_in_test_seq == -1) | (start_in_test_seq > end_in_test_seq)):
            ss_test[key] = [-1, -1]
        else:
            ss_test[key] = [start_in_test_seq, end_in_test_seq]
        if (debug): print(ss_test[key])
    # Remove the temporary files; the BLAST files only exist when the type
    # had to be identified.
    if (type == 'Unknown'):
        os.system("rm %s %s %s %s %s %s %s %s"%\
        (faa_file,phr_file,pin_file,psq_file,fastan2_file,xml_file,txt_file,fastan1_file))
    else:
        os.system("rm %s %s %s" % (fastan2_file, txt_file, fastan1_file))
    return hist_identified, ss_test
def alignedvariants(self, threshold=0.9):
    """Collect reconstructed haplotypes whose posterior passes `threshold`.

    Every record in ``self.seq_obj`` whose description carries
    ``posterior=`` and ``ave_reads=`` values is considered. Records with a
    posterior below `threshold` or fewer than one average read are skipped.
    The remaining ones are aligned to ``self.ref`` with EMBOSS needle and
    rebuilt on the reference coordinates: a gap in the variant is filled
    with the reference base, and bases inserted relative to the reference
    are dropped. Identical rebuilt sequences have their ``ave_reads``
    summed.

    Args:
        threshold: minimum posterior for a record to be used.

    Returns:
        ``self.dna_seqs`` after one SeqRecord per distinct rebuilt sequence
        has been appended to it.
    """
    import subprocess
    import re
    import hashlib
    from Bio.Emboss.Applications import NeedleCommandline
    from pythonlib import Alignment

    files = []
    var_dict = {}
    for i, s in enumerate(self.seq_obj):
        m_obj = re.search(r'posterior=(.*)\s*ave_reads=(.*)', s.description)
        post, ave_reads = map(float, (m_obj.group(1), m_obj.group(2)))
        if post < threshold or ave_reads < 1.:
            continue
        if post > 1.0:
            print('WARNING: posterior=', post, file=sys.stderr)

        outfile = 'tmp%d.needle' % i
        files.append(outfile)
        needle_cline = NeedleCommandline(
            asequence='asis:%s' % self.ref,
            bsequence='asis:%s' % str(s.seq).strip('-'),
            outfile=outfile, gapopen=10.0, gapextend=0.5,
            aformat='markx10')
        needle_cline.auto = True
        try:
            retcode = subprocess.call(str(needle_cline), shell=True)
        except OSError as ee:
            # The original referenced `ee` without binding it, so this
            # path died with a NameError instead of the intended message.
            sys.exit('Execution of needle failed: %s' % ee)
        if retcode < 0:
            sys.exit('Child needle was terminated by signal %d' % -retcode)

        tal = Alignment.alignfile2dict([outfile], 'support_seqs%d' % i,
                                       10.0, 0.5, Verbose=False)
        os.remove(outfile)
        ka = list(tal.keys())[0]
        this = tal[ka]['asis']

        this_seq = []
        for ref_base, var_base in zip(this.seq_a, this.seq_b):
            if var_base == '-':
                # Gap in the variant: fall back to the reference base.
                assert ref_base != '-', 'gap-gap?'
                this_seq.append(ref_base)
            elif ref_base != '-':
                this_seq.append(var_base)
            # else: insertion relative to the reference, dropped.
        ws = ''.join(this_seq)
        var_dict[ws] = var_dict.get(ws, 0) + ave_reads

    for k, v in var_dict.items():
        ts = Seq(k, IUPAC.unambiguous_dna)
        # hashlib needs bytes; encode the sequence text before hashing.
        tsr = SeqRecord(ts, id=hashlib.sha224(k.encode()).hexdigest(),
                        name='Reconstructed local hap')
        tsr.description = 'ave_reads=%f' % v
        self.dna_seqs.append(tsr)

    print('%d haplotypes have support >=%f' % (len(files), threshold),
          file=sys.stderr)
    return self.dna_seqs
def GetExec(self, optList, frame):
    """Build the needle command line from the current GUI values.

    Stores `frame`, the output path and the output type on the instance,
    reads sequences and penalties from the parameter widgets, and returns
    the command line as a string. Nothing is executed here.
    """
    # Respond to the "embossn" type command.
    self.frame = frame
    plugin_exe = r"C:/mEMBOSS/needle.exe"
    self.outfile = r"C:\Users\francis\Documents\Monguis\BioGui\plugins\needle.txt"
    self.outtype = "fasta"

    first_sequence = str(self.frame.paramBoxes[1].GetValue())
    second_sequence = str(self.frame.paramBoxes[3].GetValue())
    cline = NeedleCommandline(plugin_exe,
                              asequence=first_sequence,
                              bsequence=second_sequence)
    cline.outfile = self.outfile
    cline.gapopen = self.param[7].GetValue()
    cline.gapextend = self.param[9].GetValue()
    cline.similarity = True if self.param[10].GetValue() else False

    # Tell needle which alphabet the sequences use.
    alphabet = self.frame.abet
    if alphabet == "AA":
        cline.snucleotide = False
        cline.sprotein = True
    elif alphabet in ("DNA", "RNA"):
        cline.snucleotide = True
        cline.sprotein = False

    # An optional scoring-matrix file, only when the options panel is on.
    if self.frame.options:
        matrix_file = self.boxList[3].GetValue()
        if matrix_file != '':
            cline.datafile = str(matrix_file)
    return str(cline)
def binomial(self):
    """Score sequence pairs of every cluster with a binomial test.

    For each cluster in ``self.bestWin`` and each pair of its members whose
    name prefixes (text before the first '_') differ:

    * the two full sequences are looked up in ``self.fastaFile`` and
      aligned with needle; the percent identity is read from 'needle.txt';
    * the two window sequences are looked up in 'windows_sequence.fasta'
      and aligned; matches, alignment length and gaps are read from
      'needle1.txt';
    * the p-value is ``stats.binom.cdf(differences, self.sizeWin,
      1 - identity/100)``.

    Pairs with p-value <= 1e-20 are written to 'reversa_result.txt'; all
    pairs are saved to 'results.csv'. Nothing is returned.

    NOTE(review): the nesting below was reconstructed from a collapsed
    source - confirm against the original file. Variables such as `aseq`,
    `identity`, `iwin` and `gap` are only bound when the matching record or
    report line is found.
    """
    patt = re.compile(r'_')
    # Percent identity, e.g. the number inside "(...%)" on the Identity line.
    pattern = re.compile('\#\sIdentity\:\s+[0-9\/]+\s+\(([0-9\.]+)\%\)')
    # Matching columns / alignment length on the Identity line.
    patt1 = re.compile('\#\sIdentity\:\s+(\d+)\/(\d+)\s+\([0-9\.]+\%\)')
    # Number of gap columns on the Gaps line.
    patt2 = re.compile('\#\sGaps\:\s+(\d+)\/\d+')
    out = open("reversa_result.txt", "w")
    results = []
    for j, i in self.bestWin.items():
        out.write('cluster %s \n' % j)
        for a, b in itertools.combinations(i, 2):
            seq1 = patt.split(a)[0]
            seq2 = patt.split(b)[0]
            # Only compare windows that come from different sequences.
            if seq1 != seq2:
                for fasta in SeqIO.parse(self.fastaFile, "fasta"):
                    if fasta.id == seq1:
                        aseq = str(fasta.seq)
                    if fasta.id == seq2:
                        bseq = str(fasta.seq)
                # Whole-sequence alignment gives the background identity.
                needle_cline = NeedleCommandline(
                    asequence="asis:%s" % str(aseq),
                    bsequence="asis:%s" % str(bseq),
                    gapopen=10,
                    gapextend=0.5,
                    outfile="needle.txt")
                needle_cline()
                for line in open('needle.txt'):
                    ident = pattern.search(line)
                    if ident is not None:
                        identity = ident.group(1)
                for seq in SeqIO.parse("windows_sequence.fasta", "fasta"):
                    if seq.id == a:
                        awin = str(seq.seq)
                    if seq.id == b:
                        bwin = str(seq.seq)
                # Window alignment gives the observed differences.
                ncline = NeedleCommandline(asequence="asis:%s" % str(awin),
                                           bsequence="asis:%s" % str(bwin),
                                           gapopen=10,
                                           gapextend=0.5,
                                           outfile="needle1.txt")
                ncline()
                for line1 in open('needle1.txt'):
                    identwin = patt1.search(line1)
                    if identwin is not None:
                        iwin = identwin.group(1)
                        ilen = identwin.group(2)
                    gapPatt = patt2.search(line1)
                    if gapPatt is not None:
                        gap = gapPatt.group(1)
                # Differences = alignment length minus matches and gaps.
                difWin = int(ilen) - (int(iwin) + int(gap))
                size = self.sizeWin
                totalDif = 1 - (float(identity) / 100)
                pvalue = stats.binom.cdf(difWin, size, totalDif)
                results.append({'seq1': a, 'seq2': b, 'pvalue': pvalue})
                if pvalue <= 1.00e-20:
                    out.write('combination: %s %s pvalue: %s \n' % (a, b, pvalue))
    out.close()
    dfResults = pd.DataFrame(results)
    dfResults.to_csv('results.csv')
a[i:j].replace('-', ''), b[i:j].replace('-', '')])) # too slow ### use linux server with EMBOSS pairwise alignment programs installed from Bio import SeqIO from Bio.Emboss.Applications import NeedleCommandline from Bio import AlignIO seqfiles = [] for i in SeqIO.parse('rosalind_laff.txt', 'fasta'): seq_file = i.id + '.txt' SeqIO.write(i, seq_file, 'fasta') seqfiles.append(seq_file) needle_cline = NeedleCommandline(asequence=seqfiles[0], bsequence=seqfiles[1], gapopen=11, gapextend=1, outfile="needle.txt") needle_cline() aln = AlignIO.read('needle.txt', "emboss") a, b = [str(i.seq) for i in aln] # need to find the score in needle output file 'needle.txt' for ln in open('needle.txt'): if 'Score' in ln: print(ln) score = int(float(ln.rstrip().split()[-1])) break open('rosalind_laff_sub.txt', 'wt').write('\n'.join([str(int(score)), a.replace('-', ''), b.replace('-', '')]))
def test_needle_file(self):
    """Run needle on two inline ("asis") sequences, writing to a file."""
    # Setup,
    cline = NeedleCommandline(cmd=exes["needle"])
    parameters = (
        ("-asequence", "asis:ACCCGGGCGCGGT"),
        ("-bsequence", "asis:ACCCGAGCGCGGT"),
        ("-gapopen", "10"),
        ("-gapextend", "0.5"),
        # EMBOSS would guess this, but let's be explicit:
        ("-snucleotide", "True"),
        ("-outfile", "Emboss/temp with space.needle"),
    )
    for option, value in parameters:
        cline.set_parameter(option, value)
    # repr() must round-trip to an equivalent command line.
    self.assertEqual(str(eval(repr(cline))), str(cline))

    # Run the tool,
    stdout, stderr = cline()

    # Check it worked,
    banner = "Needleman-Wunsch global alignment"
    self.assertTrue(stderr.strip().startswith(banner), stderr)
    self.assertEqual(stdout.strip(), "")
    filename = cline.outfile
    missing_message = "Missing output file %r from:\n%s" % (filename, cline)
    self.assertTrue(os.path.isfile(filename), missing_message)

    # Check we can parse the output...
    align = AlignIO.read(filename, "emboss")
    self.assertEqual(len(align), 2)
    first_row, second_row = align[0], align[1]
    self.assertEqual(str(first_row.seq), "ACCCGGGCGCGGT")
    self.assertEqual(str(second_row.seq), "ACCCGAGCGCGGT")

    # Clean up,
    os.remove(filename)
print "continue" continue else: print "good length = %i" % seqlength print "break" break newrefrec = temprec SeqIO.write(newrefrec, refoutfile, "fasta") x=0 for item in protlist: x = x + len(item.seq) avg = x/len(protlist) print avg needle_cline = NeedleCommandline() needle_cline.asequence=refoutfile needle_cline.bsequence=prot_outfile needle_cline.gapopen=10 needle_cline.gapextend=0.5 needle_cline.outfile=alignment_out print needle_cline #stdout, stderr = needle_cline() #logstring = stdout+stderr #logout = open(logfile, "w") #logout.write(logstring) #logout.close()
def renumber_noInputAlign(pdbfile,refseqfile,selection="protein",\
                          outfile="renumbered.pdb",newAA=None,first=1):
    '''
    Renumber pdb file (pdbfile) according to reference sequence in
    refseqfile. Pdb sequence is extracted and aligned with reference
    sequence using needle from EMBOSS.

    - refseqfile: .fasta file containing the reference sequence by which
                  to renumber
    - selection:  atom selection(s) in the structure file to renumber.
                  Either a comma separated string or a list of selection
                  strings; each selection is renumbered in turn.
    - pdbfile:    original structure file
    - outfile:    output structure file
    - newAA:      comma separated list of unrepresented amino acids
                  XXXYCA: XXX = three letter abbreviation as in pdbfile
                          Y   = one letter code in the alignment
                          CA  = atom to use as CA if different from "CA",
                                eg C1 in PVL of 1JEN
    - first:      forwarded to gpdb.renumber_aln

    Exits with status 1 if either input file does not exist.
    '''
    # A plain string used to be iterated character by character (the
    # default "protein" became 'p', 'r', 'o', ...). Split it on commas as
    # documented; lists/tuples of selections are accepted unchanged.
    if hasattr(selection, "split"):
        selections = [sel.strip() for sel in selection.split(",")
                      if sel.strip()]
    else:
        selections = list(selection)

    tmp=tempfile.gettempdir()
    tmp_refseqfile="%s/refseq.fasta"%tmp
    pdbID = re.search(r"\w+\.\w+", pdbfile).group(0)
    tmp_pdbseqfile="%s/%s.fasta"%(tmp,pdbID)
    tmp_needle="%s/needle.out"%tmp

    if os.path.exists(refseqfile):
        refseqRec = SeqIO.read(refseqfile,"fasta",alphabet=IUPAC.protein )
        # Fixed id so the reference row can be found in the alignment.
        refseqRec.id = "refseq"
        SeqIO.write(refseqRec,tmp_refseqfile,"fasta")
    else:
        print ("ERROR, no such file: %s"%refseqfile)
        raise SystemExit(1)

    if os.path.exists(pdbfile):
        structure=parsePDB("%s"%pdbfile)
        updateAA(structure,newAA)
    else:
        print ("ERROR, no such file: %s"%pdbfile)
        raise SystemExit(1)

    modified_selections = []
    for polymer in selections:
        currentSel = structure.select("protein and name CA and %s"%polymer)
        if currentSel:
            # One-letter sequence of the selection, taken from its CA atoms.
            pdbseq_str=''.join([oneletter[i] for i in currentSel.getResnames()])
            pdbseqRec=SeqRecord(Seq(pdbseq_str,IUPAC.protein),id=pdbID)
            SeqIO.write(pdbseqRec,tmp_pdbseqfile,"fasta")

            needle_cli = NeedleCommandline(asequence=tmp_pdbseqfile,bsequence=tmp_refseqfile,\
                                           gapopen=10,gapextend=0.5,outfile=tmp_needle)
            needle_cli()
            aln = AlignIO.read(tmp_needle, "emboss",alphabet=IUPAC.protein )

            gpdb.renumber_aln(aln,"refseq",pdbID,first)
            pdbRenSeq = gpdb.seqbyname(aln, pdbID)
            gpdb.renumber_struct(structure, pdbRenSeq,polymer)
            # seems to be the only way to store per-residue annotations
            pdbRenSeq.annotations["resnum"]=str(pdbRenSeq.letter_annotations["resnum"])
            modified_selections.append(polymer)
        else:
            print ('ERROR: Selection \"%s\" has zero CA atoms'%polymer)

    if writePDB(outfile, structure):
        print ("Wrote renumbered %s selections from %s to %s"%\
               (str(modified_selections),pdbfile,outfile))
    os.remove(tmp_refseqfile)
for i in records: id = i.id with open(f"{id}.fasta", "w") as output_handle: SeqIO.write(i, output_handle, "fasta") files_created.append(f"{id}.fasta") # pairwise alignment output_file = "_".join(search) output_file = output_file[1:] output_file = output_file + ".txt" pairwise = NeedleCommandline( asequence=f"{files_created[0]}", bsequence=f"{files_created[1]}", gapopen=10, gapextend=1, endopen=10, endextend=1, endweight=True, # emboss.sourceforge.net/apps/cvs/emboss/apps/needle.html outfile=output_file) subprocess.run([str(pairwise)], shell=True, check=True) with open(output_file, "r") as f: for line in f.readlines(): if "Score" in line: print(line)
def GetExec(inF, outF):
    """Run the needle executable and wait for it to finish.

    Args:
        inF: value passed to the command line as ``infile``.
        outF: path of the output file needle should write.

    NOTE(review): needle is normally driven with asequence/bsequence;
    confirm that NeedleCommandline accepts an ``infile`` option.
    """
    plugin_exe = r"C:/mEMBOSS/needle.exe"
    cline = NeedleCommandline(plugin_exe, infile=inF, outfile=outF)
    # Use the local command line; this is a plain function, so the original
    # `self.cline` raised NameError.
    p = subprocess.Popen(str(cline))
    p.wait()
def each_needle_run(pair_gene_dir, tmp_gene_converted_dir,
                    pair_gene_alignment_dir, og_id, strain_dict):
    """
    This function is used to call Needle program to do pairwise sequence alignment
    :param pair_gene_dir: each homologous gene directory
    :param tmp_gene_converted_dir: used to put some temporary files and will be deleted in the end
    :param pair_gene_alignment_dir: each orthologous gene pair-wised alignment directory
    :param og_id: each orthologous gene id
    :param strain_dict: inherit from load_strains_label function with strain information
    :return: the alignment result of each gene, as one tab separated line
             (empty string when the needle report has no identity line)
    """
    if not os.path.exists(pair_gene_dir):
        logger.error("There is no directory contains gene file, please check.")
        logger.error(last_exception())
        sys.exit(1)
    tmp_gene_fasta = os.path.join(pair_gene_dir, og_id + '.fasta')
    converted_records = []
    re_pattern = re.compile(r'fig\|(\d+\.\d+)\.peg\.(\d+)\s(.*)')
    in_pattern = re.compile(r'Identity.*\((\d+\.\d+)%\)')
    annotation = ''
    og_list = []
    # Re-label every record with the strain name looked up in strain_dict and
    # remember the original gene ids for the report line.
    for record in SeqIO.parse(tmp_gene_fasta, 'fasta'):
        m = re_pattern.search(record.description)
        strain_id = m.group(1)
        gene_id = '{0}.peg.{1}'.format(strain_id, m.group(2))
        og_list.append(gene_id)
        annotation = m.group(3)
        record.id = strain_dict[strain_id][0]
        final_record = SeqRecord(record.seq, record.id, description='')
        converted_records.append(final_record)
    the_strain_fasta = os.path.join(tmp_gene_converted_dir, 'a.fasta')
    other_strain_fasta = os.path.join(tmp_gene_converted_dir, 'b.fasta')
    SeqIO.write(converted_records[0], the_strain_fasta, 'fasta')
    SeqIO.write(converted_records[1], other_strain_fasta, 'fasta')
    result_file = os.path.join(pair_gene_alignment_dir,
                               "{0}.txt".format(og_id))
    needle_cline = NeedleCommandline()
    needle_cline.asequence = the_strain_fasta
    needle_cline.bsequence = other_strain_fasta
    needle_cline.gapopen = 10
    needle_cline.gapextend = 0.5
    needle_cline.outfile = result_file
    # The null device is opened in a with-block so the handle is always
    # released, including on the sys.exit() path below.
    with open(os.devnull, 'w') as devnull:
        try:
            subprocess.call(str(needle_cline), shell=True,
                            stdout=devnull, stderr=devnull)
        except OSError:
            logger.info(
                'Try to call Needle program failed, please check if Needle has been installed successfully.'
            )
            logger.error(last_exception())
            sys.exit(1)
    os.remove(the_strain_fasta)
    os.remove(other_strain_fasta)
    gene_alignment_result = ''
    with open(result_file, 'r') as f:
        for a_line in f:
            if 'Identity' not in a_line:
                continue
            m = in_pattern.search(a_line.strip())
            if m is None:
                # Mentions "Identity" but carries no percentage: not the
                # summary line, so ignore it rather than fail on None.
                continue
            identity = m.group(1)
            gene_alignment_result = '{0}\t[{1}|{2}]\t{3}\t{4}\n'.format(
                og_id, og_list[0], og_list[1], identity, annotation)
    return gene_alignment_result
def f2():
    """Run needle once per name in the hard-coded protein list.

    For each name, "<name>referance.fasta" is aligned against "<name>.fasta"
    with the EPAM40 data file, the score-format result goes to
    "<name>needlePAM40.txt", and the tool's stdout and stderr are printed.
    """
    protein_names = (
        'NSP1', 'NSP2', 'NSP3', 'NSP4', 'NSP5', 'NSP6', 'NSP7', 'NSP8',
        'NSP9', 'NSP10', 'NSP11', 'NSP12', 'NSP13', 'NSP14', 'NSP15',
        'NSP16', 'Spike', 'NS3', 'E', 'M', 'NS6', 'NS7a', 'NS7b', 'NS8', 'N',
    )
    for protein in protein_names:
        options = dict(
            asequence=f'{protein}referance.fasta',
            bsequence=f'{protein}.fasta',
            gapopen=10,
            gapextend=0.5,
            datafile='EPAM40',
            outfile=f'{protein}needlePAM40.txt',
            aformat='score',
            nobrief=True,
        )
        run_needle = NeedleCommandline(**options)
        out_text, err_text = run_needle()
        print(out_text + err_text)
len_seq = len(read.seq) length += float(len_seq) length2 += float(len_seq * len_seq) # readdict[read.] = [seq,len_seq] n += 1. meanlr = length / n stdlr = math.sqrt((n * length2 - length * length) / (n * n - n)) allowed_length = [meanlr - acclength * stdlr, meanlr + (1 + acclength) * stdlr] print >> sys.stderr, 'Allowed interval for length is', allowed_length if not os.path.isfile('tmp_align_f.needle'): print >> sys.stderr, 'Aligning back...' cmline_forw = NeedleCommandline(asequence=options.ref, bsequence=f_fasta_forward_filename, outfile='tmp_align_f.needle', gapopen=6.0, gapextend=3.0, aformat='markx10') child_process_forw = subprocess.call(str(cmline_forw), shell=True) if not os.path.isfile('tmp_align_r.needle'): print >> sys.stderr, '...and forth' cmline_rev = NeedleCommandline(asequence=options.ref, bsequence=f_fasta_reverse_filename, outfile='tmp_align_r.needle', gapopen=6.0, gapextend=3.0, aformat='markx10') child_process_rev = subprocess.call(str(cmline_rev), shell=True) diff_ident = []
def getIdentity(self):
    """Summarise needle identities for every ortholog pair file.

    For each entry of self.listFiles the matching "<name>.txt" pairs file in
    self.workPath is read, every pair is aligned with needle, and the
    identity fractions found in the needle reports are collected. One tab
    separated row (name, number of identities, mean, std, number divided by
    the count of sequences in self.queryProt) is written per entry to
    'orthologs_probabilities.csv'.

    Returns self.listFiles.
    """
    self.getFiles()
    needleOut = self.workPath + 'needle.txt'
    dfResult = 'orthologs_probabilities.csv'
    patt = re.compile(r'.*\|(.*)\s+.*\|(.*)\s+\d+')
    pattFile = re.compile(r'\.')
    identPatt = re.compile(r'.*Identity\:\s+(\d+)\/(\d+)')
    hostSeq = SeqIO.to_dict(SeqIO.parse(self.queryProt, "fasta"))
    hostCds = len(hostSeq)
    with open(dfResult, 'w') as handle:
        handle.write('name\torthologs\tmean\tstd\tPr_orthologs\n')
        for fileName in self.listFiles:
            qSeq = os.path.join(self.protFolder, fileName)
            queryGen = SeqIO.to_dict(SeqIO.parse(qSeq, "fasta"))
            name = str(pattFile.split(fileName)[0])
            pairsFile = self.workPath + name + '.txt'
            ident = []
            # Both input files are opened in with-blocks so their handles are
            # closed after each pass instead of being left to the collector.
            with open(pairsFile, 'r') as pairs:
                for line in pairs:
                    if not line.strip():
                        # A blank (e.g. trailing) line carries no pair.
                        continue
                    objM = patt.match(line)
                    # Whichever captured id belongs to the host proteome is
                    # the host protein; the other one is the query protein.
                    if objM.group(1) in hostSeq:
                        hostProt = objM.group(1)
                        qProt = objM.group(2)
                    else:
                        hostProt = objM.group(2)
                        qProt = objM.group(1)
                    needle_cline = NeedleCommandline(
                        asequence='asis:%s' % str(hostSeq[hostProt].seq),
                        bsequence='asis:%s' % str(queryGen[qProt].seq),
                        gapopen=10,
                        gapextend=0.5,
                        outfile=needleOut)
                    needle_cline()
                    with open(needleOut, 'r') as needleReport:
                        for liFile in needleReport:
                            objIdent = identPatt.match(liFile)
                            if objIdent is not None:
                                scoreIdent = float(objIdent.group(1)) / float(
                                    objIdent.group(2))
                                ident.append(scoreIdent)
            # Identities of each organism
            orthoTotal = len(ident)
            a = np.array(ident)
            meanQuery = np.mean(a)
            stdQuery = np.std(a)
            orthoPr = float(orthoTotal) / hostCds
            handle.write('%s\t%s\t%s\t%s\t%f\n' %
                         (name, orthoTotal, meanQuery, stdQuery, orthoPr))
    # The scratch file only exists when at least one alignment was run.
    if os.path.exists(needleOut):
        os.remove(needleOut)
    return self.listFiles
def global_align(seq_record1, seq_record2):
    """Globally align two sequence records with EMBOSS needle.

    Both sequences are upper-cased and written to temporary FASTA files.
    The _verify_alphabet function from the Bio.Alphabet module is then used
    to check whether both sequences are valid ambiguous DNA or, failing
    that, valid protein; in either case needle is run on the temporary
    files and its "srspair" output is parsed.

    Note that the ``seq`` (and its alphabet) of both records is modified.

    Returns the alignment read by AlignIO.
    Raises ValueError when the sequences fit neither alphabet.
    """
    from Bio.Alphabet import IUPAC
    from Bio.Alphabet import _verify_alphabet

    seq_record1.seq = seq_record1.seq.upper()
    seq_record2.seq = seq_record2.seq.upper()

    # The with-block closes (and thereby deletes) both temporary files on
    # every exit path.
    with NamedTemporaryFile() as seq1_file, NamedTemporaryFile() as seq2_file:
        SeqIO.write(seq_record1, seq1_file, "fasta")
        seq1_file.flush()
        SeqIO.write(seq_record2, seq2_file, "fasta")
        seq2_file.flush()

        # Try DNA first, then protein; the needle call is the same for both.
        for alphabet in (IUPAC.ambiguous_dna, IUPAC.protein):
            seq_record1.seq.alphabet = alphabet
            seq_record2.seq.alphabet = alphabet
            if not (_verify_alphabet(seq_record1.seq)
                    and _verify_alphabet(seq_record2.seq)):
                continue
            needle_cline = NeedleCommandline(asequence=seq1_file.name,
                                             bsequence=seq2_file.name,
                                             stdout=True,
                                             gapopen=10,
                                             gapextend=0.5,
                                             auto=True,
                                             aformat="srspair")
            stdout, stderr = needle_cline()
            return AlignIO.read(StringIO.StringIO(stdout), "emboss")

    # Raising a bare string is itself a TypeError; raise a real exception.
    raise ValueError("unknown alphabet!")
from Bio import SeqIO, Entrez Entrez.email = "*****@*****.**" if __name__ == "__main__": with open(os.path.join('data', 'rosalind_need.txt')) as dataset: ids = dataset.read().split() handle = Entrez.efetch(db='nucleotide', id=ids, rettype="fasta") records = list(SeqIO.parse(handle, 'fasta')) for i, r in enumerate(records): with open(ids[i], 'w') as f: SeqIO.write(r, f, 'fasta') needle_cline = NeedleCommandline() needle_cline.asequence = ids[0] needle_cline.bsequence = ids[1] needle_cline.outfile = "need.txt" needle_cline.gapopen = 11 needle_cline.gapextend = 1 needle_cline.endopen = 11 needle_cline.endextend = 1 needle_cline.endweight = True needle_cline() with open('need.txt') as f: output = f.readlines() for line in output:
def transfer_features_from_template_to_query(template_features,
                                             query_file,
                                             save_dir="",
                                             save_not_found=False):
    """Transfer features from template to query.

    Positions are defined in the template and we use needle to find the
    corresponding positions in the query.

    Parameters:
    -----------
    template_features: QuerySet of Feature django models
        The features that relate to the template.
    query_file: str
        Path to FASTA file containing query sequence
    save_dir: str
        Path to save temp files.
    save_not_found: bool
        Add Features even if they weren't found. Indices will be (-1, -1)

    Yields:
    -------
    A Feature django model with the name of the feature and position relative
    to the query
    """
    if len(template_features) == 0:
        return

    n2 = str(uuid.uuid4())
    template = template_features.first().template
    template_file = template.path()
    needle_results = os.path.join(save_dir, "needle_{}.txt".format(n2))
    # Prefer a needle binary next to the running interpreter, otherwise rely
    # on the one found on PATH.
    cmd = os.path.join(os.path.dirname(sys.executable), "needle")
    if not os.path.isfile(cmd):
        cmd = "needle"
    needle_cline = NeedleCommandline(cmd=cmd,
                                     asequence=template_file,
                                     bsequence=query_file,
                                     gapopen=10,
                                     gapextend=1,
                                     outfile=needle_results)
    try:
        needle_cline()
        align = AlignIO.read(needle_results, "emboss")
    finally:
        # Cleanup. The alignment is fully in memory once read, so the file is
        # removed right away; doing it after the last yield would leak it
        # whenever the consumer stops iterating early or an error occurs.
        if os.path.exists(needle_results):
            os.remove(needle_results)

    core_histone = align[0]
    query = align[1]

    # Map each ungapped template position to its alignment column.
    corresponding_hist = list(range(len(template.get_sequence())))
    k = 0
    for i, core_histone_postion in enumerate(core_histone):
        if core_histone_postion == "-":
            k += 1
        else:
            corresponding_hist[i - k] = i

    # Same mapping for the query sequence.
    corresponding_test = list(
        range(len(next(SeqIO.parse(query_file, "fasta")))))
    k = 0
    for i, query_position in enumerate(query):
        if query_position == "-":
            k = k + 1
        else:
            corresponding_test[i - k] = i

    for feature in template_features:
        start = feature.start
        stop = feature.end
        start_in_aln = corresponding_hist[start]
        end_in_aln = corresponding_hist[stop]

        start_in_test_seq = -1
        end_in_test_seq = -1

        # Walk right from the start column until a column that holds a query
        # residue is found.
        for offset in range(len(core_histone)):
            try:
                start_in_test_seq = corresponding_test.index(
                    start_in_aln + offset)
                break
            except ValueError:
                continue
        # Walk left from the end column in the same way.
        for offset in range(len(core_histone)):
            try:
                end_in_test_seq = corresponding_test.index(
                    end_in_aln - offset)
                break
            except ValueError:
                continue

        feature_id = "{}_{}".format(
            os.path.splitext(query_file)[0], feature.id)
        not_found = (start_in_test_seq == -1 or end_in_test_seq == -1
                     or start_in_test_seq > end_in_test_seq)
        if not_found:
            if save_not_found:
                yield Feature(
                    id=feature_id,
                    name=feature.name,
                    description=feature.description,
                    start=-1,
                    end=-1,
                    color=feature.color,
                )
        else:
            yield Feature(
                id=feature_id,
                name=feature.name,
                description=feature.description,
                start=start_in_test_seq,
                end=end_in_test_seq,
                color=feature.color,
            )
def test_needle_file(self):
    """needle with the asis trick, output to a file."""
    #Setup,
    cline = NeedleCommandline(cmd=exes["needle"])
    cline.set_parameter("-asequence", "asis:ACCCGGGCGCGGT")
    cline.set_parameter("-bsequence", "asis:ACCCGAGCGCGGT")
    cline.set_parameter("-gapopen", "10")
    cline.set_parameter("-gapextend", "0.5")
    #EMBOSS would guess this, but let's be explicit:
    cline.set_parameter("-snucleotide", "True")
    cline.set_parameter("-outfile", "Emboss/temp with space.needle")
    # repr() of the command line must evaluate back to an equivalent object.
    self.assertEqual(str(eval(repr(cline))), str(cline))
    #Run the tool,
    result, out, err = generic_run(cline)
    #Check it worked,
    errors = err.read().strip()
    # assertTrue replaces the removed assert_ alias.
    self.assertTrue(errors.startswith("Needleman-Wunsch global alignment"),
                    errors)
    self.assertEqual(out.read().strip(), "")
    if result.return_code != 0:
        # Same text the old "print >> sys.stderr" statement produced, written
        # in a form that is valid on both Python 2 and Python 3.
        sys.stderr.write("\n%s\n" % cline)
    self.assertEqual(result.return_code, 0)
    filename = result.get_result("outfile")
    self.assertEqual(filename, "Emboss/temp with space.needle")
    # A unittest assertion is not stripped under "python -O".
    self.assertTrue(os.path.isfile(filename))
    #Check we can parse the output...
    with open(filename) as handle:
        align = AlignIO.read(handle, "emboss")
    self.assertEqual(len(align), 2)
    self.assertEqual(str(align[0].seq), "ACCCGGGCGCGGT")
    self.assertEqual(str(align[1].seq), "ACCCGAGCGCGGT")
    #Clean up,
    os.remove(filename)
def test_needle_file(self):
    """needle with the asis trick, output to a file."""
    #Setup,
    cline = NeedleCommandline(cmd=exes["needle"])
    cline.set_parameter("-asequence", "asis:ACCCGGGCGCGGT")
    cline.set_parameter("-bsequence", "asis:ACCCGAGCGCGGT")
    cline.set_parameter("-gapopen", "10")
    cline.set_parameter("-gapextend", "0.5")
    #EMBOSS would guess this, but let's be explicit:
    cline.set_parameter("-snucleotide", "True")
    cline.set_parameter("-outfile", "Emboss/temp with space.needle")
    # The repr of the command line has to round-trip through eval().
    self.assertEqual(str(eval(repr(cline))), str(cline))
    #Run the tool,
    result, out, err = generic_run(cline)
    #Check it worked,
    errors = err.read().strip()
    # assert_ is a removed alias; assertTrue is the supported spelling.
    self.assertTrue(
        errors.startswith("Needleman-Wunsch global alignment"), errors)
    self.assertEqual(out.read().strip(), "")
    if result.return_code != 0:
        # Writes exactly what "print >> sys.stderr" used to emit, without
        # relying on the Python 2 print statement.
        sys.stderr.write("\n%s\n" % cline)
    self.assertEqual(result.return_code, 0)
    filename = result.get_result("outfile")
    self.assertEqual(filename, "Emboss/temp with space.needle")
    # Use a unittest assertion so the check survives "python -O".
    self.assertTrue(os.path.isfile(filename))
    #Check we can parse the output...
    # The handle is closed before the file is removed below.
    with open(filename) as handle:
        align = AlignIO.read(handle, "emboss")
    self.assertEqual(len(align), 2)
    self.assertEqual(str(align[0].seq), "ACCCGGGCGCGGT")
    self.assertEqual(str(align[1].seq), "ACCCGAGCGCGGT")
    #Clean up,
    os.remove(filename)
) # Output the input sequence restructured as Dunlop reference else: print(sequence.id + " lacks the origin of replication") continue # NCCR BLAST block = NcbiblastnCommandline(query="target_sequence", subject="source/NCCR_BKTyper.fasta", outfmt=6, word_size=12, perc_identity=75, evalue=0.05)()[0] ### # VP1 Needleman and Wunch a = NeedleCommandline(asequence="target_sequence", \ bsequence="source/VP1_Dunlop.fasta", \ gapopen=10, \ gapextend=0.5, \ outfile="needle_fname") a() # execute the alignment # Export the alignment back to Python VP1_alignment = AlignIO.read("needle_fname", "emboss") # Call functions based on mode NCCR = NCCR_complex = subgroup = subgroup_detail = 'NA' # definition of table objects motif_list = (open("source/motif_list.txt", "r")) vp1_db_file = (open("source/VP1_BKTyper_MLtree_list.fasta", "r")) vp1_db = vp1_db_file.read() if sys.argv[2] == 'VP1': (subgroup, subgroup_detail) = VP1_classification(VP1_alignment)
def test_needle_file(self):
    """Run needle with the asis trick, output to a file."""
    # Setup: options are applied one by one, in this order.
    needle_options = (
        ("-asequence", "asis:ACCCGGGCGCGGT"),
        ("-bsequence", "asis:ACCCGAGCGCGGT"),
        ("-gapopen", "10"),
        ("-gapextend", "0.5"),
        # EMBOSS would guess this, but let's be explicit:
        ("-snucleotide", "True"),
        ("-outfile", "Emboss/temp with space.needle"),
    )
    cline = NeedleCommandline(cmd=exes["needle"])
    for option_name, option_value in needle_options:
        cline.set_parameter(option_name, option_value)
    rebuilt = eval(repr(cline))
    self.assertEqual(str(rebuilt), str(cline))

    # Run the tool and check that it reported success on stderr only.
    stdout, stderr = cline()
    banner = stderr.strip()
    self.assertTrue(
        banner.startswith("Needleman-Wunsch global alignment"), stderr)
    self.assertEqual(stdout.strip(), "")

    filename = cline.outfile
    missing_message = "Missing output file %r from:\n%s" % (filename, cline)
    self.assertTrue(os.path.isfile(filename), missing_message)

    # Check we can parse the output...
    align = AlignIO.read(filename, "emboss")
    self.assertEqual(len(align), 2)
    expected_rows = ("ACCCGGGCGCGGT", "ACCCGAGCGCGGT")
    for index, expected in enumerate(expected_rows):
        self.assertEqual(str(align[index].seq), expected)

    # Clean up the file needle wrote.
    os.remove(filename)
def GetExec(self, optList, frame):
    """Assemble the needle command line from the GUI state and return it.

    Stores frame, the output file path and the output type on self, reads
    the two sequences, gap penalties, similarity flag and (optionally) the
    data file from the widgets, and returns the command as a string. The
    command is built only; it is not executed here.
    """
    # Respond to the "embossn" type command.
    self.frame = frame
    plugin_exe = r"C:/mEMBOSS/needle.exe"
    self.outfile = r"C:\Users\francis\Documents\Monguis\BioGui\plugins\needle.txt"
    self.outtype = "fasta"

    first_sequence = str(self.frame.paramBoxes[1].GetValue())
    second_sequence = str(self.frame.paramBoxes[3].GetValue())
    cline = NeedleCommandline(plugin_exe,
                              asequence=first_sequence,
                              bsequence=second_sequence)
    cline.outfile = self.outfile
    cline.gapopen = self.param[7].GetValue()
    cline.gapextend = self.param[9].GetValue()
    cline.similarity = bool(self.param[10].GetValue())

    # Tell needle which kind of sequence it is given; any other alphabet
    # value leaves both switches untouched.
    alphabet = self.frame.abet
    if alphabet == "AA":
        cline.snucleotide, cline.sprotein = False, True
    elif alphabet in ("DNA", "RNA"):
        cline.snucleotide, cline.sprotein = True, False

    if self.frame.options:
        data_file = self.boxList[3].GetValue()
        if data_file != '':
            cline.datafile = str(data_file)
    return str(cline)