def test_split_fasta_diff_num_seqs_per_file_alt(self):
    """split_fasta never loses sequences, whatever seqs_per_file is.

    A 59-sequence FASTA input is split with every seqs_per_file value in
    range(1, 1000); the sequences read back from the split files must form
    a collection equal to the one built from the input lines.
    """
    # start with 59 seqs (b/c it's prime, so should make more
    # confusing splits)
    in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
    infile = in_seqs.toFasta().split('\n')

    # test seqs_per_file from 1 to 999
    for i in range(1, 1000):
        # NOTE(review): the file created by mkstemp itself is not removed
        # anywhere in this test -- confirm whether split_fasta cleans it up.
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)

        actual = split_fasta(infile, i, filename_prefix)

        actual_seqs = []
        try:
            for fp in actual:
                # close each split file promptly instead of leaking handles
                with open(fp) as split_file:
                    actual_seqs.extend(split_file)
        finally:
            # remove the files now, so if the test fails they still get
            # cleaned up
            remove_files(actual)

        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(LoadSeqs(data=infile, aligned=False),
                         LoadSeqs(data=actual_seqs, aligned=False))
def test_alignadd(self):
    """Adding two alignments joins the sequences that share a name."""
    left = LoadSeqs(data={'a': 'AAAA', 'b': 'TTTT', 'c': 'CCCC'})
    right = LoadSeqs(data={'a': 'GGGG', 'b': '----', 'c': 'NNNN'})
    expected = {'a': 'AAAAGGGG', 'b': 'TTTT----', 'c': 'CCCCNNNN'}
    combined = left + right
    self.assertEqual(combined.todict(), expected)
def get_paralinear_distances(gene, data_directory=None, third_position=False,
                             **kw):
    """Return pairwise paralinear distances for the alignment of `gene`.

    Exactly one file matching '<gene>.fasta*' must exist in
    `data_directory`; it may be plain ('.fasta') or gzipped ('.fasta.gz').
    When `third_position` is true only every third position (index 2, 5,
    8, ...) is kept. Columns containing anything outside the DNA
    characters are filtered out before the distances are computed.

    Returns a dict mapping frozenset(key) -> distance for every key
    reported by ParalinearPair.getPairwiseDistances(). Extra keyword
    arguments are accepted and ignored.
    """
    pattern = os.path.join(data_directory, gene + '.fasta*')
    matches = glob.glob(pattern)
    assert len(matches) == 1, 'Wrong number of alignment files for ' + gene

    path = matches[0]
    if path.endswith('.fasta'):
        opener = open
    elif path.endswith('.fasta.gz'):
        opener = GzipFile
    else:
        raise RuntimeError(gene + ' file could not be read')
    with opener(path) as handle:
        fastadata = handle.read()

    sequences = LoadSeqs(data=fastadata)
    if third_position:
        spans = [(start, start + 1) for start in range(2, len(sequences), 3)]
        third_sites = sequences.addFeature('pos3', 'pos3', spans)
        sequences = third_sites.getSlice()

    def only_dna(column):
        return set(''.join(column)) <= set(DNA)

    sequences = sequences.filtered(only_dna)

    calculator = ParalinearPair(moltype=DNA, alignment=sequences)
    calculator.run(show_progress=False)
    pairwise = calculator.getPairwiseDistances()
    return dict((frozenset(pair), dist) for pair, dist in pairwise.items())
def test_split_fasta_diff_num_seqs_per_file(self):
    """split_fasta works when files receive different numbers of seqs.

    Three sequences split two-per-file must yield two files named
    '<prefix>.0.fasta' and '<prefix>.1.fasta' whose combined content is
    equivalent to the input.
    """
    fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                  prefix='split_fasta_tests',
                                  suffix='')
    close(fd)
    infile = [
        '>seq1', 'AACCTTAA',
        '>seq2', 'TTAACC', 'AATTAA',
        '>seq3', 'CCTT--AA',
    ]

    actual = split_fasta(infile, 2, filename_prefix)

    actual_seqs = []
    try:
        for fp in actual:
            # close each split file promptly instead of leaking handles
            with open(fp) as split_file:
                actual_seqs.extend(split_file)
    finally:
        # clean up even when reading one of the files fails
        remove_files(actual)

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
    # list of file paths is as expected
    self.assertEqual(actual, expected)
    # building seq collections from infile and the split files result in
    # equivalent seq collections
    self.assertEqual(LoadSeqs(data=infile, aligned=False),
                     LoadSeqs(data=actual_seqs, aligned=False))
def pair_hmm_align_unaligned_seqs(seqs, moltype=DNA, params={}):
    """Locally align exactly two unaligned sequences.

    `params` may provide 'gap_open' (fallback 5), 'gap_extend' (fallback 2)
    and 'score_matrix' (fallback: DNA scoring dict with match=1,
    transition=-1, transversion=-1). Raises ValueError unless `seqs` holds
    exactly two sequences. Returns the result of local_pairwise.
    Code from Greg Caporaso.
    """
    collection = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        first, second = collection.values()
    except ValueError:
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    def option(key, make_fallback):
        # the fallback is built lazily, only when the key is missing
        try:
            return params[key]
        except KeyError:
            return make_fallback()

    gap_open = option('gap_open', lambda: 5)
    gap_extend = option('gap_extend', lambda: 2)
    score_matrix = option(
        'score_matrix',
        lambda: make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1))

    return local_pairwise(first, second, score_matrix, gap_open, gap_extend)
def rooted(doc, rooted_edges=None, gc=None, **kw):
    """Fit a GNC likelihood function with rules applied to `rooted_edges`.

    An MG94GTR likelihood function is optimised first; its deflated form
    is handed to _populate_parameters together with the GNC likelihood
    function (presumably to seed the GNC fit -- confirm against
    _populate_parameters).

    doc: mapping whose 'aln' and 'tree' entries are text; both are
        utf-8 encoded before being loaded.
    rooted_edges: passed as `edges=` to the setParamRule calls below.
    gc: passed to get_genetic_code.
    kw: accepted and ignored.

    Returns a dict with the deflated likelihood function ('lf', which
    also carries a 'hard_up' entry), the genetic code name ('gc') and
    `rooted_edges`.
    """
    aln = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    tree = LoadTree(treestring=doc['tree'].encode('utf-8'))
    code = get_genetic_code(gc)
    aln = aln.withoutTerminalStopCodons(code)
    # keep only codon columns made up entirely of DNA characters
    aln = aln.filtered(lambda x: set(''.join(x)) <= set(DNA), motif_length=3)

    # bounds / independence settings shared by both fitting stages
    sp_kw = dict(upper=20., lower=0.05, is_independent=False)

    # stage 1: MG94GTR
    sm = MG94GTR(optimise_motif_probs=True)
    init_lf = sm.makeLikelihoodFunction(tree)
    init_lf.setAlignment(aln)
    # NOTE(review): block nesting reconstructed from a flattened source;
    # the 'length' rule is assumed to sit inside the updatesPostponed block.
    with init_lf.updatesPostponed():
        for param in init_lf.getParamNames():
            if '/' in param:
                init_lf.setParamRule(param, **sp_kw)
        init_lf.setParamRule('length', edges=rooted_edges, is_independent=False)
    init_lf.optimise(local=True, show_progress=False, limit_action='raise')
    init_lf = nest.deflate_likelihood_function(init_lf, save_jsd=False)

    # stage 2: GNC, populated from the stage-1 result
    sm = GNC(optimise_motif_probs=True)
    lf = sm.makeLikelihoodFunction(tree)
    lf.setAlignment(aln)
    _populate_parameters(lf, init_lf, **sp_kw)
    for param in lf.getParamNames():
        if '>' in param or param == 'omega':
            lf.setParamRule(param, edges=rooted_edges, is_independent=False)
    lf.optimise(local=True, show_progress=False, limit_action='raise')

    flat_lf = nest.deflate_likelihood_function(lf)
    flat_lf['hard_up'] = _is_hard_up(lf)
    return {'lf': flat_lf, 'gc': code.Name, 'rooted_edges': rooted_edges}
def ml(doc, model='NG', gc=None, omega_indep=True, model_gaps=False,
       indel_indep=True, **kw):
    """Fit `model` to the alignment and tree stored in `doc`.

    `doc['aln']` and `doc['tree']` are utf-8 encoded and loaded; terminal
    stop codons are trimmed unless the model is 'NG'. Codon columns are
    kept only when made of DNA characters (plus '-' when `model_gaps`).
    Returns a dict with the fitted 'lf', the 'time' reported by _fit and
    the settings used. Extra keyword arguments are ignored.
    """
    alignment = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    phylogeny = LoadTree(treestring=doc['tree'].encode('utf-8'))
    code = get_genetic_code(gc)

    if model != 'NG':
        # Trim terminal stop codons
        alignment = alignment.withoutTerminalStopCodons(code)

    if model_gaps:
        def acceptable(column):
            return set(''.join(column)) <= set(DNA).union({'-'})
    else:
        def acceptable(column):
            return set(''.join(column)) <= set(DNA)
    alignment = alignment.filtered(acceptable, motif_length=3)

    flat_lf, elapsed = _fit(alignment, phylogeny, model, code, omega_indep,
                            model_gaps, indel_indep)
    return {
        'lf': flat_lf,
        'time': elapsed,
        'model': model,
        'gc': code.Name,
        'omega_indep': omega_indep,
        'model_gaps': model_gaps,
        'indel_indep': indel_indep,
    }
def main():
    """Write the best protein and DNA version of every input sequence.

    Reads the FASTA file named by the module-level `infile`, runs
    findBestSeq on each record, then writes the protein sequences to
    `outfile` and the DNA sequences to '<stem>_dna.fasta', both in FASTA
    format.
    """
    new_dna_dict = dict()
    new_aa_dict = dict()
    dna_dict = SeqIO.to_dict(SeqIO.parse(infile, 'fasta'))

    # .items() works on both Python 2 and 3 (iteritems is Python 2 only)
    for seqname, sequence in dna_dict.items():
        new_aa_seq, new_dna_seq = findBestSeq(sequence)
        new_aa_dict[seqname] = new_aa_seq
        new_dna_dict[seqname] = new_dna_seq

    unaligned_aa = LoadSeqs(data=new_aa_dict, moltype=PROTEIN, aligned=False)
    new_unaligned_dna = LoadSeqs(data=new_dna_dict, moltype=DNA,
                                 aligned=False)

    aa_outstring = unaligned_aa.toFasta()
    dna_outstring = new_unaligned_dna.toFasta()

    # NOTE(review): the stem is everything before the FIRST '.', so a path
    # such as './out.fasta' yields an empty stem -- confirm this is intended.
    dna_outfile = outfile.split('.')[0] + '_dna.fasta'

    # context managers guarantee the files are closed even if a write fails
    with open(outfile, "w") as aa_outfile_handle:
        aa_outfile_handle.write(aa_outstring)
    with open(dna_outfile, 'w') as dna_outfile_handle:
        dna_outfile_handle.write(dna_outstring)
def test_split_fasta_diff_num_seqs_per_file_alt(self):
    """split_fasta never loses sequences, whatever seqs_per_file is.

    A 59-sequence FASTA input is split with every seqs_per_file value in
    range(1, 1000); the sequences read back from the split files must form
    a collection equal to the one built from the input lines.
    """
    # start with 59 seqs (b/c it's prime, so should make more
    # confusing splits)
    in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
    infile = in_seqs.toFasta().split('\n')

    # test seqs_per_file from 1 to 999
    for i in range(1, 1000):
        # NOTE(review): the file created by mkstemp itself is not removed
        # anywhere in this test -- confirm whether split_fasta cleans it up.
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)

        actual = split_fasta(infile, i, filename_prefix)

        actual_seqs = []
        try:
            for fp in actual:
                # close each split file promptly instead of leaking handles
                with open(fp) as split_file:
                    actual_seqs.extend(split_file)
        finally:
            # remove the files now, so if the test fails they still get
            # cleaned up
            remove_files(actual)

        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(
            LoadSeqs(data=infile, aligned=False),
            LoadSeqs(data=actual_seqs, aligned=False))
def pair_hmm_align_unaligned_seqs(seqs, moltype, params={}):
    """Globally align exactly two unaligned sequences.

    This needs to be moved to cogent.align.align

    seqs: data loadable by LoadSeqs; must contain exactly two sequences.
    moltype: molecule type passed to LoadSeqs.
    params: optional settings (only read, never modified): 'gap_open'
        (fallback 5), 'gap_extend' (fallback 2) and 'score_matrix'
        (fallback: DNA scoring dict with match=1, transition=-1,
        transversion=-1).

    Raises ValueError when `seqs` does not hold exactly two sequences.
    Returns the result of global_pairwise.
    """
    seqs = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        # call form of raise is valid on both Python 2 and Python 3
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    try:
        gap_open = params['gap_open']
    except KeyError:
        gap_open = 5

    try:
        gap_extend = params['gap_extend']
    except KeyError:
        gap_extend = 2

    try:
        score_matrix = params['score_matrix']
    except KeyError:
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return global_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
def test_paralinear_pair_aa(self):
    """paralinear distances can be produced for translated (aa) seqs"""
    protein_aln = LoadSeqs('data/brca1_5.paml', moltype=DNA).getTranslation()
    calculator = ParalinearPair(moltype=PROTEIN, alignment=protein_aln)
    calculator.run(show_progress=False)
    # smoke test: only checks that the call completes
    distances = calculator.getPairwiseDistances()
def test_replaceSeqs(self):
    """Gaps of a protein alignment are carried over to the codon seqs."""
    protein_seqs = {
        'FlyingFox': 'C-TNAH',
        'DogFaced': 'CGTNT-',
        'FreeTaile': '-GTDTH',
        'LittleBro': 'C-TD-H',
        'TombBat': 'C--STH',
    }
    codon_seqs = {
        'TombBat': 'TGTAGTACTCAT',
        'FreeTaile': 'GGCACAGATACTCAT',
        'FlyingFox': 'TGTACAAATGCTCAT',
        'LittleBro': 'TGTACAGATCAT',
        'DogFaced': 'TGTGGCACAAATACT',
    }
    expected_by_taxon = [
        ('FlyingFox', 'TGT---ACAAATGCTCAT'),
        ('DogFaced', 'TGTGGCACAAATACT---'),
        ('FreeTaile', '---GGCACAGATACTCAT'),
        ('LittleBro', 'TGT---ACAGAT---CAT'),
        ('TombBat', 'TGT------AGTACTCAT'),
    ]

    protein_aln = LoadSeqs(moltype=PROTEIN, data=protein_seqs)
    codons = LoadSeqs(moltype=DNA, data=codon_seqs, aligned=False)
    observed = protein_aln.replaceSeqs(codons).todict()

    for taxon, expected_sequence in expected_by_taxon:
        self.assertEqual(observed[taxon], expected_sequence)
def test_getBySequenceAnnotation(self):
    """A feature annotated on one sequence selects alignment columns."""
    alignment = LoadSeqs(data={"a": "ATCGAAATCGAT", "b": "ATCGA--TCGAT"})
    annotated = alignment.getSeq("b")
    annotated.addAnnotation(Feature, "test_type", "test_label", [(4, 6)])
    regions = alignment.getBySequenceAnnotation("b", "test_type")
    self.assertEqual(regions[0].todict(), {"b": "A--T", "a": "AAAT"})
def pair_hmm_align_unaligned_seqs(seqs, moltype, params={}):
    """Globally align exactly two unaligned sequences.

    This needs to be moved to cogent.align.align

    seqs: data loadable by LoadSeqs; must contain exactly two sequences.
    moltype: molecule type passed to LoadSeqs.
    params: optional settings (only read, never modified): 'gap_open'
        (fallback 5), 'gap_extend' (fallback 2) and 'score_matrix'
        (fallback: DNA scoring dict with match=1, transition=-1,
        transversion=-1).

    Raises ValueError when `seqs` does not hold exactly two sequences.
    Returns the result of global_pairwise.
    """
    seqs = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        # call form of raise is valid on both Python 2 and Python 3
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    try:
        gap_open = params['gap_open']
    except KeyError:
        gap_open = 5

    try:
        gap_extend = params['gap_extend']
    except KeyError:
        gap_extend = 2

    try:
        score_matrix = params['score_matrix']
    except KeyError:
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return global_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
def test_reversecomplement(self):
    """Reverse complement works for alignments and sequence collections."""
    gapped = {
        'seq1': '--ACGT--GT---',
        'seq2': 'TTACGTA-GT---',
        'seq3': '--ACGTA-GCC--',
    }
    gapped_rc = {
        'seq1': '---AC--ACGT--',
        'seq2': '---AC-TACGTAA',
        'seq3': '--GGC-TACGT--',
    }

    # alignment with gaps
    alignment_rc = LoadSeqs(data=gapped, moltype=DNA).rc()
    self.assertEqual(alignment_rc.todict(), gapped_rc)

    # check collection, with gaps
    collection = LoadSeqs(data=gapped, moltype=DNA, aligned=False)
    collection_rc = collection.rc()
    self.assertEqual(collection_rc.todict(), gapped_rc)
    self.assertEqual(collection_rc.todict(),
                     collection.reversecomplement().todict())

    # collection with no gaps
    ungapped = {'seq1': 'ACGTGT', 'seq2': 'TTACGTAGT', 'seq3': 'ACGTAGCC'}
    ungapped_rc = {'seq1': 'ACACGT', 'seq2': 'ACTACGTAA', 'seq3': 'GGCTACGT'}
    collection = LoadSeqs(data=ungapped, moltype=DNA, aligned=False)
    collection_rc = collection.rc()
    self.assertEqual(collection_rc.todict(), ungapped_rc)
def test_getBySequenceAnnotation(self):
    """A feature annotated on one sequence selects alignment columns."""
    alignment = LoadSeqs(data={'a': 'ATCGAAATCGAT', 'b': 'ATCGA--TCGAT'})
    annotated = alignment.getSeq('b')
    annotated.addAnnotation(Feature, 'test_type', 'test_label', [(4, 6)])
    regions = alignment.getBySequenceAnnotation('b', 'test_type')
    self.assertEqual(regions[0].todict(), {'b': 'A--T', 'a': 'AAAT'})
def test_replaceSeqs(self):
    """Gaps of a protein alignment are carried over to the codon seqs."""
    protein_seqs = {
        'FlyingFox': 'C-TNAH',
        'DogFaced': 'CGTNT-',
        'FreeTaile': '-GTDTH',
        'LittleBro': 'C-TD-H',
        'TombBat': 'C--STH',
    }
    codon_seqs = {
        'TombBat': 'TGTAGTACTCAT',
        'FreeTaile': 'GGCACAGATACTCAT',
        'FlyingFox': 'TGTACAAATGCTCAT',
        'LittleBro': 'TGTACAGATCAT',
        'DogFaced': 'TGTGGCACAAATACT',
    }
    expected_by_taxon = [
        ('FlyingFox', 'TGT---ACAAATGCTCAT'),
        ('DogFaced', 'TGTGGCACAAATACT---'),
        ('FreeTaile', '---GGCACAGATACTCAT'),
        ('LittleBro', 'TGT---ACAGAT---CAT'),
        ('TombBat', 'TGT------AGTACTCAT'),
    ]

    protein_aln = LoadSeqs(moltype=PROTEIN, data=protein_seqs)
    codons = LoadSeqs(moltype=DNA, data=codon_seqs, aligned=False)
    codon_aln = protein_aln.replaceSeqs(codons)
    observed = codon_aln.todict()

    for taxon, expected_sequence in expected_by_taxon:
        self.assertEqual(observed[taxon], expected_sequence)
def est_logdet_pair_aa(self):
    """logdet distances can be produced for translated (aa) seqs"""
    protein_aln = LoadSeqs('data/brca1_5.paml', moltype=DNA).getTranslation()
    calculator = LogDetPair(moltype=PROTEIN, alignment=protein_aln)
    calculator.run(use_tk_adjustment=True, show_progress=False)
    # smoke test: only checks that the call completes
    distances = calculator.getPairwiseDistances()
def BestLogLikelihood(aln, alphabet=None, exclude_chars = None,
                      allowed_chars='ACGT', motif_length=None,
                      return_length=False):
    """returns the best log-likelihood according to Goldman 1993.

    Arguments:
        - alphabet: a sequence alphabet object; when given, its MolType is
          used to reload the alignment and its motif length overrides
          motif_length.
        - motif_length: 1 for nucleotide, 2 for dinucleotide, etc ..
        - exclude_chars: a series of characters used to exclude motifs
        - allowed_chars: only motifs that contain a subset of these are
          allowed
        - return_length: whether to also return the number of alignment
          columns
    """
    assert alphabet or motif_length, "Must provide either an alphabet or a"\
        " motif_length"

    # need to use the alphabet, so we can enforce character compliance
    load_kwargs = {}
    if alphabet:
        load_kwargs['moltype'] = alphabet.MolType
        motif_length = alphabet.getMotifLen()

    reloaded = LoadSeqs(data=aln.todict(), **load_kwargs)
    columns = aligned_columns_to_rows(reloaded, motif_length, exclude_chars,
                                      allowed_chars)
    num_cols = len(columns)
    log_likelihood = get_G93_lnL_from_array(columns)
    if not return_length:
        return log_likelihood
    return log_likelihood, num_cols
def BestLogLikelihood(aln, alphabet=None, exclude_chars=None,
                      allowed_chars='ACGT', motif_length=None,
                      return_length=False):
    """returns the best log-likelihood according to Goldman 1993.

    Arguments:
        - alphabet: a sequence alphabet object; when given, its MolType is
          used to reload the alignment and its motif length overrides
          motif_length.
        - motif_length: 1 for nucleotide, 2 for dinucleotide, etc ..
        - exclude_chars: a series of characters used to exclude motifs
        - allowed_chars: only motifs that contain a subset of these are
          allowed
        - return_length: whether to also return the number of alignment
          columns
    """
    assert alphabet or motif_length, "Must provide either an alphabet or a"\
        " motif_length"

    # need to use the alphabet, so we can enforce character compliance
    load_kwargs = {}
    if alphabet:
        load_kwargs['moltype'] = alphabet.MolType
        motif_length = alphabet.getMotifLen()

    reloaded = LoadSeqs(data=aln.todict(), **load_kwargs)
    columns = aligned_columns_to_rows(reloaded, motif_length, exclude_chars,
                                      allowed_chars)
    num_cols = len(columns)
    log_likelihood = get_G93_lnL_from_array(columns)
    if not return_length:
        return log_likelihood
    return log_likelihood, num_cols
def test_logdet_pair_aa(self):
    """logdet distances can be produced for translated (aa) seqs"""
    protein_aln = LoadSeqs('data/brca1_5.paml', moltype=DNA).getTranslation()
    calculator = LogDetPair(moltype=PROTEIN, alignment=protein_aln)
    calculator.run(use_tk_adjustment=True, show_progress=False)
    # smoke test: only checks that the call completes
    distances = calculator.getPairwiseDistances()
def pair_hmm_align_unaligned_seqs(seqs, moltype=DNA, params={}):
    """Locally align exactly two unaligned sequences.

    `params` may provide 'gap_open' (fallback 5), 'gap_extend' (fallback 2)
    and 'score_matrix' (fallback: DNA scoring dict with match=1,
    transition=-1, transversion=-1). Raises ValueError unless `seqs` holds
    exactly two sequences. Returns the result of local_pairwise.
    Code from Greg Caporaso.
    """
    collection = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        first, second = collection.values()
    except ValueError:
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    def option(key, make_fallback):
        # the fallback is built lazily, only when the key is missing
        try:
            return params[key]
        except KeyError:
            return make_fallback()

    gap_open = option('gap_open', lambda: 5)
    gap_extend = option('gap_extend', lambda: 2)
    score_matrix = option(
        'score_matrix',
        lambda: make_dna_scoring_dict(match=1, transition=-1,
                                      transversion=-1))

    return local_pairwise(first, second, score_matrix, gap_open, gap_extend)
def setUp(self):
    """Create the temp input/template/output files and expected results.

    Every file created here is recorded in self._paths_to_clean_up.
    """
    def write_tmp_file(suffix, contents):
        # Create a temp file, write `contents`, and close it right away
        # (the bare open(...).write(...) form leaked the file handle).
        fp = get_tmp_filename(prefix='PyNastAlignerTests_', suffix=suffix)
        with open(fp, 'w') as tmp_file:
            tmp_file.write(contents)
        return fp

    self.pynast_test1_input_fp = write_tmp_file(
        '.fasta', pynast_test1_input_fasta)

    self.pynast_test1_template_fp = write_tmp_file(
        'template.fasta', pynast_test1_template_fasta)

    # template variants: '.' as gap char, U instead of T, lower case
    self.pynast_test_template_w_dots_fp = write_tmp_file(
        'template.fasta', pynast_test1_template_fasta.replace('-', '.'))

    self.pynast_test_template_w_u_fp = write_tmp_file(
        'template.fasta', pynast_test1_template_fasta.replace('T', 'U'))

    self.pynast_test_template_w_lower_fp = write_tmp_file(
        'template.fasta', pynast_test1_template_fasta.lower())

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = write_tmp_file('.fasta', '')
    self.failure_fp = write_tmp_file('.fasta', '')
    self.log_fp = write_tmp_file('.log', '')

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp,
    ]

    self.pynast_test1_aligner = PyNastAligner({
        'template_filepath': self.pynast_test1_template_fp,
        'min_len': 15,
    })

    self.pynast_test1_expected_aln = LoadSeqs(
        data=pynast_test1_expected_alignment, aligned=DenseAlignment)
    self.pynast_test1_expected_fail = LoadSeqs(
        data=pynast_test1_expected_failure, aligned=False)
class TestCigar(unittest.TestCase):
    """Tests for converting between cigar strings, maps and alignments."""

    def setUp(self):
        """Build two gapped DNA sequences plus their maps and cigars."""
        self.cigar_text = '3D2M3D6MDM2D3MD'
        self.aln_seq = DNA.makeSequence('---AA---GCTTAG-A--CCT-')
        self.aln_seq1 = DNA.makeSequence('CCAAAAAA---TAGT-GGC--G')
        self.map, self.seq = self.aln_seq.parseOutGaps()
        self.map1, self.seq1 = self.aln_seq1.parseOutGaps()
        self.slices = [(1, 4), (0, 8), (7, 12), (0, 1), (3, 5)]
        self.aln = LoadSeqs(data={"FAKE01": self.aln_seq,
                                  "FAKE02": self.aln_seq1})
        self.cigars = {"FAKE01": self.cigar_text,
                       "FAKE02": map_to_cigar(self.map1)}
        self.seqs = {"FAKE01": str(self.seq), "FAKE02": str(self.seq1)}

    # unittest assertions are used throughout instead of bare `assert`:
    # they are not stripped under `python -O` and report both operands.

    def test_map_to_cigar(self):
        """convert a Map to cigar string"""
        self.assertEqual(map_to_cigar(self.map), self.cigar_text)

    def test_cigar_to_map(self):
        """test generating a Map from cigar"""
        cigar_map = cigar_to_map(self.cigar_text)
        self.assertEqual(str(cigar_map), str(self.map))

    def test_aligned_from_cigar(self):
        """test generating aligned seq from cigar"""
        aligned_seq = aligned_from_cigar(self.cigar_text, self.seq)
        self.assertEqual(aligned_seq, self.aln_seq)

    def test_slice_cigar(self):
        """test slicing cigars"""
        for start, end in self.slices:
            # test by_align = True
            map1, loc1 = slice_cigar(self.cigar_text, start, end)
            ori1 = self.aln_seq[start:end]
            if loc1:
                slicealn1 = self.seq[loc1[0]:loc1[1]].gappedByMap(map1)
                self.assertEqual(ori1, slicealn1)
            else:
                self.assertEqual(map1.length, len(ori1))

            # test by_align = False
            map2, loc2 = slice_cigar(self.cigar_text, start, end,
                                     by_align=False)
            slicealn2 = self.seq[start:end].gappedByMap(map2)
            ori2 = self.aln_seq[loc2[0]:loc2[1]]
            self.assertEqual(slicealn2, ori2)

    def test_CigarParser(self):
        """test without slice"""
        aln = CigarParser(self.seqs, self.cigars)
        self.assertEqual(aln, self.aln)
        # test slice
        for i, (start, end) in enumerate(self.slices, 1):
            self.aln.getSeq("FAKE01").addFeature("annot%d" % i, "annot",
                                                 [(start, end)])
            annot = self.aln.getAnnotationsFromAnySequence("annot%d" % i)
            slice_aln = aln.getRegionCoveringAll(annot).asOneSpan().getSlice()
            cmp_aln = CigarParser(self.seqs, self.cigars, sliced=True,
                                  ref_seqname="FAKE01",
                                  start=start, end=end)
            self.assertEqual(cmp_aln, slice_aln)
def _loadfromfile(self, filename, test_write=True, **kw):
    """Load `filename` from data_path and optionally test writing it back.

    filename: path relative to data_path.
    test_write: when true, the loaded sequences are written to a temp file
        carrying the same extension, which is then deleted.
    kw: forwarded to LoadSeqs.
    """
    filename = os.path.join(data_path, filename)
    aln = LoadSeqs(filename=filename, **kw)
    if test_write:
        suffix = filename.split('.')[-1]
        # mkstemp creates the file atomically; tempfile.mktemp only returns
        # a name and is documented as race-prone.
        fd, fn = tempfile.mkstemp(suffix='.' + suffix)
        os.close(fd)
        try:
            aln.writeToFile(filename=fn)
        finally:
            # always delete the temp file, even when writing fails
            os.remove(fn)
def _loadfromfile(self, filename, test_write=True, **kw):
    """Load `filename` from data_path and optionally test writing it back.

    filename: path relative to data_path.
    test_write: when true, the loaded sequences are written to a temp file
        carrying the same extension, which is then deleted.
    kw: forwarded to LoadSeqs.
    """
    filename = os.path.join(data_path, filename)
    aln = LoadSeqs(filename=filename, **kw)
    if test_write:
        suffix = filename.split('.')[-1]
        # mkstemp creates the file atomically; tempfile.mktemp only returns
        # a name and is documented as race-prone.
        fd, fn = tempfile.mkstemp(suffix='.' + suffix)
        os.close(fd)
        try:
            aln.writeToFile(filename=fn)
        finally:
            # always delete the temp file, even when writing fails
            os.remove(fn)
def test_partimatrix(self):
    """Draw the partimatrix of five species, first 500 columns of brca1."""
    species5 = ['Human','HowlerMon','Mouse','NineBande','DogFaced']
    full_aln = LoadSeqs(filename='data/brca1.fasta', moltype=DNA)
    subset = full_aln.takeSeqs(species5)[:500]
    figure = partimatrix(subset, samples=0, display=True, print_stats=False,
                         s_limit=10, title="brca1")
    test_figure('compatibility', figure)
def setUp(self):
    """Load the same four sequences as an alignment and as a collection."""
    seqs = {
        'a': 'GTACGTACGATC',
        'b': 'GTACGTACGTAC',
        'c': 'GTACGTACGTTC',
        'e': 'GTACGTACTGGT',
    }
    # each loader gets its own dict, as in the two separate literals before
    self.al = LoadSeqs(data=dict(seqs))
    self.collection = LoadSeqs(data=dict(seqs), aligned=False)
def test_withoutAnyGaps(self):
    """Columns holding any gap are dropped when allowed_gap_frac is 0."""
    cases = [
        ({'seq1': '--ACGT--GT---',
          'seq2': '--ACGTA-GT---',
          'seq3': '--ACGTA-GT---'},
         {'seq1': 'ACGTGT', 'seq2': 'ACGTGT', 'seq3': 'ACGTGT'}),
        ({'seq1': 'ACGT', 'seq2': '----', 'seq3': '----'},
         {'seq1': '', 'seq2': '', 'seq3': ''}),
    ]
    for seqs, expected in cases:
        alignment = LoadSeqs(data=seqs)
        stripped = alignment.omitGapPositions(allowed_gap_frac=0)
        self.assertEqual(stripped.todict(), expected)
def test_getBySequenceAnnotation(self):
    """A feature annotated on one sequence selects alignment columns."""
    alignment = LoadSeqs(data={'a': 'ATCGAAATCGAT', 'b': 'ATCGA--TCGAT'})
    annotated = alignment.getSeq('b')
    annotated.addAnnotation(Feature, 'test_type', 'test_label', [(4,6)])
    regions = alignment.getBySequenceAnnotation('b', 'test_type')
    self.assertEqual(regions[0].todict(), {'b':'A--T', 'a':'AAAT'})
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the sequences in `seq_path` against the template alignment.

    seq_path: FASTA file of candidate sequences.
    result_path: if given, aligned sequences are written there as FASTA
        and None is returned.
    log_path: passed to NastLogger.
    failure_path: if given, sequences that failed to align are written
        there as FASTA.

    Returns None when `result_path` is given; otherwise a DenseAlignment
    built from the aligned sequences, or {} when LoadSeqs raises
    ValueError.

    Raises KeyError when LoadSeqs rejects a character in the template
    alignment.
    """
    # The candidate file stays open for the whole call because
    # parse_fasta may read it lazily; the with-block closes it at the end
    # (it was previously never closed).
    with open(seq_path, 'U') as seq_file:
        # load candidate sequences
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            # replace '.' characters with '-' characters
            template_alignment = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        try:
            template_alignment = LoadSeqs(data=template_alignment,
                                          moltype=DNA,
                                          aligned=DenseAlignment)
        except KeyError as e:
            raise KeyError('Only ACGT-. characters can be contained in template alignments.' +
                           ' The offending character was: %s' % e)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        if failure_path is not None:
            with open(failure_path, 'w') as fail_file:
                for seq in pynast_failed:
                    fail_file.write(seq.toFasta())
                    fail_file.write('\n')

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                for seq in pynast_aligned:
                    result_file.write(seq.toFasta())
                    result_file.write('\n')
            return None

        try:
            return LoadSeqs(data=pynast_aligned, aligned=DenseAlignment)
        except ValueError:
            return {}
def makeSampleAlignment():
    """Return a two-sequence sample alignment carrying four annotations."""
    seqs = {
        'FAKE01': makeSampleSequence(),
        'FAKE02': makeSampleSequence(mid_gaps=True),
    }
    aln = LoadSeqs(data = seqs)
    annotations = [
        ('misc_feature', 'misc', [(12,25)]),
        ('CDS', 'blue', [(15, 25)]),
        ("5'UTR", 'red', [(2, 4)]),
        ("LTR", "fake", [(2,15)]),
    ]
    for feature_type, label, spans in annotations:
        aln.addAnnotation(Feature, feature_type, label, spans)
    return aln
def setUp(self):
    """Create the nucleotide model and load the brca1_5 data and tree."""
    model = Nucleotide(do_scaling=True,
                       model_gaps=False,
                       equal_motif_probs=True,
                       predicates={'beta': 'transition'})
    self.submodel = model

    alignment_path = os.path.join(data_path, 'brca1_5.paml')
    self.data = LoadSeqs(filename=alignment_path, moltype=model.MolType)

    tree_path = os.path.join(data_path, 'brca1_5.tree')
    self.tree = LoadTree(filename=tree_path)
def setUp(self):
    """Prepare the sequences, reference alignment and db path for tests."""
    random_fasta = '>seq0\nACUGCGCGGAUCGAUCGAUCGAUCGAUGCAUUUUACGAUCGCCA\n'
    self.random_seq = LoadSeqs(data=random_fasta, aligned=False)
    self.rrna = LoadSeqs(data=RRNA, aligned=False)
    self.rrna_aln = LoadSeqs(data=REF_ALN)
    self.seq_db_path = os.path.join(ABSPATH, 'test_data',
                                    'Rfam10_part.fasta')
def setUp(self):
    """Build two gapped DNA sequences plus their maps and cigars."""
    self.cigar_text = '3D2M3D6MDM2D3MD'
    self.slices = [(1, 4), (0, 8), (7, 12), (0, 1), (3, 5)]

    gapped_first = DNA.makeSequence('---AA---GCTTAG-A--CCT-')
    gapped_second = DNA.makeSequence('CCAAAAAA---TAGT-GGC--G')
    self.aln_seq = gapped_first
    self.aln_seq1 = gapped_second

    # split each gapped sequence into its map and its ungapped sequence
    self.map, self.seq = gapped_first.parseOutGaps()
    self.map1, self.seq1 = gapped_second.parseOutGaps()

    self.aln = LoadSeqs(data={"FAKE01": gapped_first,
                              "FAKE02": gapped_second})
    self.cigars = {"FAKE01": self.cigar_text,
                   "FAKE02": map_to_cigar(self.map1)}
    self.seqs = {"FAKE01": str(self.seq), "FAKE02": str(self.seq1)}
def test_call_write_to_file(self):
    """ReferenceRepSetPicker.__call__ writes the expected seqs to file"""
    picker = ReferenceRepSetPicker(
        params={'Algorithm':'first', 'ChoiceF':first_id})
    picker(self.tmp_seq_filepath,
           self.tmp_otu_filepath,
           self.ref_seq_filepath,
           result_path=self.result_filepath)
    observed = LoadSeqs(self.result_filepath,aligned=False)
    expected = LoadSeqs(data=rep_seqs_reference_result_file_exp,
                        aligned=False)
    self.assertEqual(observed, expected)
def test_partimatrix(self):
    """Draw the partimatrix of five species, first 500 columns of brca1."""
    species5 = ['Human', 'HowlerMon', 'Mouse', 'NineBande', 'DogFaced']
    full_aln = LoadSeqs(filename='data/brca1.fasta', moltype=DNA)
    subset = full_aln.takeSeqs(species5)[:500]
    figure = partimatrix(subset, samples=0, display=True, print_stats=False,
                         s_limit=10, title="brca1")
    test_figure('compatibility', figure)
def test_translate(self):
    """DNA alignments translate; without a moltype a failure is tolerated.

    NOTE(review): the source was flattened; the moltype-less check is
    assumed to run inside the loop, once per test case.
    """
    cases = [
        {'seq1': 'GATTTT', 'seq2': 'GATC??'},
        {'seq1': 'GAT---', 'seq2': '?GATCT'},
    ]
    for seqs in cases:
        typed = LoadSeqs(data=seqs, moltype=DNA)
        self.assertEqual(len(typed.getTranslation()), 2)
        # check for a failure when no moltype specified
        untyped = LoadSeqs(data=seqs)
        try:
            peptides = untyped.getTranslation()
        except AttributeError:
            pass
def ml(doc, model='GNC', gc=None, outgroup=None, neutral=None, **kw):
    """Fit `model` to the alignment and tree stored in `doc`.

    `doc['aln']` and `doc['tree']` are utf-8 encoded and loaded, terminal
    stop codons are trimmed, and only codon columns made of DNA characters
    are kept. Returns a dict with the fitted 'lf', the 'time' reported by
    _fit, the model name and the genetic code name. Extra keyword
    arguments are ignored.
    """
    alignment = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    phylogeny = LoadTree(treestring=doc['tree'].encode('utf-8'))
    code = get_genetic_code(gc)

    # Trim terminal stop codons
    alignment = alignment.withoutTerminalStopCodons(code)

    def only_dna(column):
        return set(''.join(column)) <= set(DNA)

    alignment = alignment.filtered(only_dna, motif_length=3)

    flat_lf, elapsed = _fit(alignment, phylogeny, model, code, outgroup,
                            neutral)
    return {'lf': flat_lf, 'time': elapsed, 'model': model, 'gc': code.Name}
def optimization(result, aln, tree1, tree2):
    """Optimise both trees on the sites assigned to them.

    result[i] assigns site i: 1 -> tree1 only, 2 -> tree2 only, 0 -> both.
    For each tree a JC69 likelihood function is optimised on its sites.
    Returns ([[p1, q1, r1], [p2, q2, r2]], total log-likelihood), where
    p averages the 'a'/'c' lengths, q averages the 'b'/'d' lengths and r
    sums the 'edge.1' and 'edge.0' lengths.
    """
    # get the sites for each tree according to the assignments
    empty_seqs = [('a', ''), ('c', ''), ('b', ''), ('d', '')]
    sites1 = LoadSeqs(data=list(empty_seqs), moltype=DNA)
    sites2 = LoadSeqs(data=list(empty_seqs), moltype=DNA)
    for site in range(len(aln)):
        if (result[site] == 1):
            sites1 = sites1 + aln[site]
        if (result[site] == 2):
            sites2 = sites2 + aln[site]
        if (result[site] == 0):
            sites1 = sites1 + aln[site]
            sites2 = sites2 + aln[site]

    model = JC69()

    def fit(tree, sites):
        # calculate the likelihood and optimise; optimise generates new
        # tree parameters
        lf = model.makeLikelihoodFunction(tree)
        lf.setAlignment(sites)
        lf.optimise(local=True)
        log_likelihood = lf.getLogLikelihood()
        # as the tree is symmetric, get p, q, r from 6 branch lengths
        p = (lf.getParamValue('length', 'a') +
             lf.getParamValue('length', 'c')) / 2.0
        q = (lf.getParamValue('length', 'b') +
             lf.getParamValue('length', 'd')) / 2.0
        r = lf.getParamValue('length', 'edge.1') + \
            lf.getParamValue('length', 'edge.0')
        return [p, q, r], log_likelihood

    params1, likelihood1 = fit(tree1, sites1)
    params2, likelihood2 = fit(tree2, sites2)

    # likelihoods are in log space, so adding them gives the total
    # likelihood over all sites
    return [params1, params2], likelihood1 + likelihood2
def test_degap(self):
    """Gaps are stripped from both alignments and collections."""
    gapped = {
        'seq1': '--ACGT--GT---',
        'seq2': '--ACGTA-GT---',
        'seq3': '--ACGTA-GT---',
    }
    expect = {'seq1': 'ACGTGT', 'seq2': 'ACGTAGT', 'seq3': 'ACGTAGT'}

    from_alignment = LoadSeqs(data=dict(gapped)).degap()
    self.assertEqual(from_alignment.todict(), expect)

    collection = LoadSeqs(data=dict(gapped), aligned=False, moltype=DNA)
    from_collection = collection.degap()
    self.assertEqual(from_collection.todict(), expect)
    self.assertEqual(from_collection.MolType, DNA)
def makeSampleAlignment():
    """Return a two-sequence sample alignment carrying four annotations."""
    seqs = {
        'FAKE01': makeSampleSequence(),
        'FAKE02': makeSampleSequence(with_gaps=True),
    }
    aln = LoadSeqs(data = seqs)
    annotations = [
        ('misc_feature', 'misc', [(12,25)]),
        ('CDS', 'blue', [(15, 25)]),
        ("5'UTR", 'red', [(2, 4)]),
        ("LTR", "fake", [(2,15)]),
    ]
    for feature_type, label, spans in annotations:
        aln.addAnnotation(Feature, feature_type, label, spans)
    return aln
def test_sample_tuples(self):
    """Sampling with motif size 2 keeps columns aligned and dinucs unique."""
    ##### test with motif size != 1 #####
    alignment = LoadSeqs(data={'seq1': 'AABBCCDDEEFFGGHHIIJJKKLLMMNNOOPP',
                               'seq2': 'AABBCCDDEEFFGGHHIIJJKKLLMMNNOOPP'})
    # full-length sample; only checks that the call completes
    alignment.sample(motif_length=2)
    # ensure length correct
    sample = alignment.sample(10, motif_length=2)
    self.assertEqual(len(sample), 20)
    # test columns alignment preserved
    # list() keeps the values indexable on Python 3 as well as Python 2
    seqs = list(sample.todict().values())
    self.assertEqual(seqs[0], seqs[1])
    # ensure each char occurs twice as sampling dinucs without replacement
    for char in seqs[0]:
        self.assertEqual(seqs[0].count(char), 2)
def ml(doc, model='GNCClock', gc=None, outgroup=None, omega_indep=True, **kw):
    """Fit `model` to the alignment and tree stored in `doc`.

    `doc['aln']` and `doc['tree']` are utf-8 encoded and loaded; terminal
    stop codons are trimmed unless the model is 'NGClock'; only codon
    columns made of DNA characters are kept. Every sequence name other
    than `outgroup` forms the ingroup handed to _fit. Returns a dict with
    the fitted 'lf', the 'time' reported by _fit, the model name and the
    genetic code name. Extra keyword arguments are ignored.
    """
    alignment = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    phylogeny = LoadTree(treestring=doc['tree'].encode('utf-8'))
    code = get_genetic_code(gc)

    if model != 'NGClock':
        # Trim terminal stop codons
        alignment = alignment.withoutTerminalStopCodons(code)

    def only_dna(column):
        return set(''.join(column)) <= set(DNA)

    alignment = alignment.filtered(only_dna, motif_length=3)

    ingroup = [name for name in alignment.Names if name != outgroup]

    flat_lf, elapsed = _fit(alignment, phylogeny, model, code, ingroup,
                            omega_indep)
    return {'lf': flat_lf, 'time': elapsed, 'model': model, 'gc': code.Name}
def test_withoutRedundantGaps(self):
    """Columns in which every sequence has a gap are removed."""
    gapped = {
        'seq1': '--ACGT--GT---',
        'seq2': '--ACGTA-GT---',
        'seq3': '--ACGTA-GT---',
    }
    expected = {'seq1': 'ACGT-GT', 'seq2': 'ACGTAGT', 'seq3': 'ACGTAGT'}
    observed = LoadSeqs(data=gapped).omitGapPositions().todict()
    self.assertEqual(observed, expected)
def test_sample(self):
    """Test sample generation"""
    alignment = LoadSeqs(data={'seq1': 'ABCDEFGHIJKLMNOP',
                               'seq2': 'ABCDEFGHIJKLMNOP'})
    # effectively permute columns, preserving length
    # (only checks that the call completes)
    alignment.sample()
    # ensure length correct
    sample = alignment.sample(10)
    self.assertEqual(len(sample), 10)
    # test columns alignment preserved
    # list() keeps the values indexable on Python 3 as well as Python 2
    seqs = list(sample.todict().values())
    self.assertEqual(seqs[0], seqs[1])
    # ensure each char occurs once as sampling without replacement
    for char in seqs[0]:
        self.assertEqual(seqs[0].count(char), 1)
def filter_samples(prefs, data, dir_path='', filename=None):
    """Filter the otus file and representative seq set, then write results.

    prefs: filtering preferences, passed to filter_otus and
        filter_aln_by_otus.
    data: mapping with the 'aln' and 'otus' entries to filter.
    dir_path, filename: the three outputs are written to
        '<dir_path>/<filename>_sfiltered_otus.txt',
        '<dir_path>/<filename>_sremoved.fasta' and
        '<dir_path>/<filename>_sfiltered.fasta'.

    Raises ValueError when the removed or the remaining sequences cannot
    be loaded by LoadSeqs.
    """
    aln = data['aln']
    otus = data['otus']

    # filter the otus file based on which samples to remove
    new_otus_list = filter_otus(otus, prefs)

    filtered_otus_output_filepath = '%s/%s_sfiltered_otus.txt' \
        % (dir_path, filename)

    # Write out a new otus file: one tab-separated line per entry
    with open(filtered_otus_output_filepath, 'w') as otus_file:
        for key in new_otus_list:
            otus_file.write(key[0])
            for j in key[1]:
                otus_file.write('\t' + str(j))
            otus_file.write('\n')

    # filter seq set
    filtered_seqs, removed_seqs = filter_aln_by_otus(aln, prefs)

    # write a fasta containing list of sequences removed from
    # representative set
    try:
        removed_seqs = LoadSeqs(data=removed_seqs, aligned=False)
    except Exception:
        # `except Exception` (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate
        raise ValueError(
            'No sequences were removed.  Did you specify the correct Sample ID?')
    output_filepath2 = '%s/%s_sremoved.fasta' % (dir_path, filename)
    with open(output_filepath2, 'w') as output_file2:
        output_file2.write(removed_seqs.toFasta())

    # write a fasta containing the filtered representative seqs
    try:
        filtered_seqs = LoadSeqs(data=filtered_seqs, aligned=False)
    except Exception:
        raise ValueError(
            'No sequences were remaining in the fasta file.  Did you remove all Sample ID\'s?')

    output_filepath = '%s/%s_sfiltered.fasta' % (dir_path, filename)
    with open(output_filepath, 'w') as output_file:
        output_file.write(filtered_seqs.toFasta())
def fileToFrequency(filePath):
    """Summarise per-position residue frequencies of a protein alignment.

    The alignment at ``filePath`` is loaded with ``moltype=PROTEIN``, its
    position frequencies are obtained via ``getPosFreqs()`` and
    ``normalizePositions()``, and the comma-separated ``prettyPrint`` table
    is scanned row by row.

    Returns a string with one line per table row.  Each line holds a
    ``value,label`` pair followed by a tab for every cell whose value is
    greater than 0.01 and whose column label is neither ``-`` nor ``X``.
    """
    aln = LoadSeqs(filePath, moltype=PROTEIN)
    pf = aln.getPosFreqs()
    pf.normalizePositions()

    # One list entry per row of the printed table.
    rows = pf.prettyPrint(include_header=True, col_sep=',').split('\n')
    # Column labels taken from the header row.
    labels = rows[0].split(',')

    pieces = []
    for row in rows[1:]:
        for idx, cell in enumerate(row.split(',')):
            # Condition 1: the value must exceed the mutation-rate threshold.
            if not float(cell.strip()) > 0.01:
                continue
            label = labels[idx].strip()
            # Conditions 2 and 3: skip the '-' column and the 'X' column.
            if label == '-' or label == 'X':
                continue
            pieces.append(cell + ',' + label + '\t')
        pieces.append('\n')
    return ''.join(pieces)
def remove_duplicates(seqsin):
    """Collapse duplicate sequences and count how often each one occurs.

    ``seqsin`` is anything ``LoadSeqs(data=..., aligned=False)`` accepts.
    Sequences are compared by their ``str()`` form; names are ignored.

    Returns a list of ``(sequence, count)`` tuples sorted from most abundant
    to least abundant.
    """
    parsable_seqs = LoadSeqs(data=seqsin, aligned=False)
    uniques = {}
    for _, seq in parsable_seqs.items():
        seq = str(seq)
        uniques[seq] = uniques.get(seq, 0) + 1
    # sorted() works whether items() yields a list or a view, unlike calling
    # .sort() on the result of items().
    return sorted(uniques.items(), key=lambda x: x[1], reverse=True)
def test_slidingWindows(self):
    """slidingWindows(5, step) yields the expected 5-column sub-alignments."""
    alignment = LoadSeqs(data={'seq1': 'ACGTACGT',
                               'seq2': 'ACGTACGT',
                               'seq3': 'ACGTACGT'})

    def same_for_all(text):
        # All three sequences are identical, so every window holds the same
        # text under each name.
        return dict.fromkeys(('seq1', 'seq2', 'seq3'), text)

    # Second argument 2: consecutive windows start two columns apart.
    windows = list(alignment.slidingWindows(5, 2))
    self.assertEqual(windows[0].todict(), same_for_all('ACGTA'))
    self.assertEqual(windows[1].todict(), same_for_all('GTACG'))

    # Second argument 1: consecutive windows start one column apart.
    windows = list(alignment.slidingWindows(5, 1))
    self.assertEqual(windows[0].todict(), same_for_all('ACGTA'))
    self.assertEqual(windows[1].todict(), same_for_all('CGTAC'))
    self.assertEqual(windows[2].todict(), same_for_all('GTACG'))
    self.assertEqual(windows[3].todict(), same_for_all('TACGT'))
def test_call_pynast_test1_file_output(self):
    """PyNastAligner writes correct output files for pynast_test1 seqs"""
    # Output paths are supplied, so the files on disk are checked rather
    # than the return value.
    returned = self.pynast_test1_aligner(
        self.pynast_test1_input_fp,
        result_path=self.result_fp,
        log_path=self.log_fp,
        failure_path=self.failure_fp)
    self.assertTrue(returned is None,
                    "Result should be None when result path provided.")

    # The alignment written to result_fp must equal the expected alignment.
    wanted_aln = self.pynast_test1_expected_aln
    written_aln = LoadSeqs(self.result_fp, aligned=DenseAlignment)
    self.assertEqual(written_aln, wanted_aln)

    # The failures written to failure_fp are compared as FASTA text.
    written_fail = LoadSeqs(self.failure_fp, aligned=False)
    self.assertEqual(written_fail.toFasta(),
                     self.pynast_test1_expected_fail.toFasta())
def test_hasTerminalStops(self):
    """test truth values for terminal stops"""
    # unittest assertions are used instead of bare ``assert`` so the checks
    # still run under ``python -O`` and report both values on failure.

    # seq collections: seq1 ends with TAA
    seq_coll = LoadSeqs(data={'seq1': 'ACGTAA',
                              'seq2': 'ACG',
                              'seq3': 'ACGCGT'},
                        moltype=DNA, aligned=False)
    self.assertEqual(seq_coll.hasTerminalStops(), True)

    seq_coll = LoadSeqs(data={'seq1': 'ACGTAC',
                              'seq2': 'ACGACG',
                              'seq3': 'ACGCGT'},
                        moltype=DNA, aligned=False)
    self.assertEqual(seq_coll.hasTerminalStops(), False)

    # alignments: seq1 ends with TAA
    aln = LoadSeqs(data={'seq1': 'ACGTAA',
                         'seq2': 'ACGCAA',
                         'seq3': 'ACGCGT'},
                   moltype=DNA)
    self.assertEqual(aln.hasTerminalStops(), True)

    # the sequences end with TAA, TAG and TGA respectively
    aln = LoadSeqs(data={'seq1': 'ACGTAA',
                         'seq2': 'ACGTAG',
                         'seq3': 'ACGTGA'},
                   moltype=DNA)
    self.assertEqual(aln.hasTerminalStops(), True)

    aln = LoadSeqs(data={'seq1': 'ACGCAA',
                         'seq2': 'ACGCAA',
                         'seq3': 'ACGCGT'},
                   moltype=DNA)
    self.assertEqual(aln.hasTerminalStops(), False)
def setUp(self):
    """Create the substitution model, alignment and tree used by the tests."""
    self.submodel = Nucleotide(do_scaling=True,
                               model_gaps=False,
                               equal_motif_probs=True,
                               predicates={'beta': 'transition'})

    # The alignment is loaded with the model's own MolType.
    alignment_path = os.path.join(data_path, 'brca1_5.paml')
    self.data = LoadSeqs(filename=alignment_path,
                         moltype=self.submodel.MolType)

    tree_path = os.path.join(data_path, 'brca1_5.tree')
    self.tree = LoadTree(filename=tree_path)
def test_call_pynast_test1_file_output_alt_params(self):
    """PyNastAligner writes correct output files when no seqs align"""
    # The test expects no sequence to align with min_len set to 1000.
    aligner_params = {'template_filepath': self.pynast_test1_template_fp,
                      'min_len': 1000}
    aligner = PyNastAligner(aligner_params)

    returned = aligner(self.pynast_test1_input_fp,
                       result_path=self.result_fp,
                       log_path=self.log_fp,
                       failure_path=self.failure_fp)
    self.assertTrue(returned is None,
                    "Result should be None when result path provided.")

    # Nothing aligned, so the result file must be empty ...
    self.assertEqual(getsize(self.result_fp), 0,
                     "No alignable seqs should result in an empty file.")

    # ... and all three input seqs must be reported as failures.
    failed = LoadSeqs(self.failure_fp, aligned=False)
    self.assertEqual(failed.getNumSeqs(), 3)
def cluster_seqs(seqspath, simm, folderout='/tmp', gapopen=None, gapext=None):
    """Cluster the sequences in ``seqspath`` with Uclust and group them.

    ``simm`` is passed to Uclust as ``--id`` (converted to float).  The
    ``clusters.uc`` and ``clusters.log`` files are placed in ``folderout``.
    ``gapopen`` / ``gapext`` are forwarded as ``--gapopen`` / ``--gapext``
    only when they are not None.

    Returns a dict mapping ``"cluster_<n>"`` to a list of
    ``(full header, sequence)`` tuples, where ``<n>`` is the position of the
    cluster in iteration order over the parsed clusters.
    """
    # Ensure the output folder ends with a slash so names can be appended.
    folderout = folderout if folderout[-1] == "/" else folderout + "/"

    params = {
        '--usersort': True,
        '--id': float(simm),
        '--maxaccepts': 20,
        '--maxrejects': 500,
        '--stepwords': 20,
        '--hsp': 0,
        '--match': 1,
        '--mismatch': -1
    }
    # Optional gap penalties are only set when the caller supplied them.
    for flag, value in (('--gapopen', gapopen), ('--gapext', gapext)):
        if value is not None:
            params[flag] = value

    uclust = Uclust(params, WorkingDir='/tmp')
    result = uclust({
        '--input': seqspath,
        '--uc': folderout + "clusters.uc",
        '--log': folderout + "clusters.log"
    })
    # Only the clusters are used; the other two returned values are ignored.
    clusters, _failures, _newseeds = clusters_from_uc_file(
        result['ClusterFile'])

    seqs = LoadSeqs(seqspath, aligned=False)

    # Map each shortened header (first whitespace-delimited token) back to
    # the full header.
    full_header = {name.split()[0]: name for name in seqs.getSeqNames()}

    # Match the headers in each cluster to their sequences.
    clusterseqs = {}
    for num, cluster in enumerate(clusters):
        clusterseqs["cluster_" + str(num)] = [
            (full_header[short], seqs.getSeq(full_header[short]))
            for short in clusters[cluster]
        ]
    return clusterseqs