def create_seqstructs(cfo, numclusts):
    """Build one SeqStructure per cluster read from an open cluster file.

    The first two lines of ``cfo`` give the first cluster name and its
    structure.  The rest is parsed as FASTA: a record whose header contains
    "cluster_" starts a new cluster (its sequence field is used as that
    cluster's structure); every other record is a member sequence of the
    current cluster.

    Raises AssertionError if the number of structures built does not equal
    ``numclusts``.
    """
    seqstructs = []

    def _close_cluster(structure, members, cluster_name):
        # collapse the member sequences into their majority consensus and
        # record it together with the structure and cluster name
        members_aln = LoadSeqs(data=members, moltype=RNA)
        consensus = ''.join(members_aln.majorityConsensus())
        seqstructs.append(SeqStructure(structure, consensus, cluster_name))

    # read in first cluster and struct
    currclust = cfo.readline().strip(">").strip()
    struct = cfo.readline().strip()
    seqs = []
    for header, seq in MinimalFastaParser(cfo):
        if "cluster_" not in header:
            seqs.append((header, seq))
            continue
        _close_cluster(struct, seqs, currclust)
        # move on to next structgroup
        struct, seqs, currclust = seq, [], header
    # the final cluster has no following header to trigger its flush
    _close_cluster(struct, seqs, currclust)

    found = len(seqstructs)
    if found != numclusts:
        raise AssertionError("%i structures, %i clusters. Not all clusters "
                             "folded!" % (found, numclusts))
    return seqstructs
def test_replaceSeqs(self):
    """synchronize gaps between protein seqs and codon seqs"""
    protein_aln = LoadSeqs(moltype=PROTEIN, data={
        'FlyingFox': 'C-TNAH',
        'DogFaced': 'CGTNT-',
        'FreeTaile': '-GTDTH',
        'LittleBro': 'C-TD-H',
        'TombBat': 'C--STH',
    })
    codon_seqs = LoadSeqs(moltype=DNA, aligned=False, data={
        'TombBat': 'TGTAGTACTCAT',
        'FreeTaile': 'GGCACAGATACTCAT',
        'FlyingFox': 'TGTACAAATGCTCAT',
        'LittleBro': 'TGTACAGATCAT',
        'DogFaced': 'TGTGGCACAAATACT',
    })
    result = protein_aln.replaceSeqs(codon_seqs).todict()
    # each protein gap is expected to become a three-character codon gap
    expected_by_taxon = (
        ('FlyingFox', 'TGT---ACAAATGCTCAT'),
        ('DogFaced', 'TGTGGCACAAATACT---'),
        ('FreeTaile', '---GGCACAGATACTCAT'),
        ('LittleBro', 'TGT---ACAGAT---CAT'),
        ('TombBat', 'TGT------AGTACTCAT'),
    )
    for taxon, expected_sequence in expected_by_taxon:
        self.assertEqual(result[taxon], expected_sequence)
def test_split_fasta_diff_num_seqs_per_file(self):
    """split_fasta funcs as expected when diff num seqs go to each file
    """
    # mkstemp is used only to obtain a unique prefix; the descriptor is not
    # needed, so close it straight away
    fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                  prefix='split_fasta_tests',
                                  suffix='')
    close(fd)
    infile = ['>seq1', 'AACCTTAA',
              '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    actual = split_fasta(infile, 2, filename_prefix)

    actual_seqs = []
    for fp in actual:
        # close every split file deterministically instead of leaking the
        # handle until garbage collection
        with open(fp) as split_file:
            actual_seqs.extend(split_file)
    # remove the split files and the empty placeholder that mkstemp created
    # at filename_prefix, so nothing is left behind even if an assertion
    # below fails
    remove_files(list(actual) + [filename_prefix])

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
    # list of file paths is as expected
    self.assertEqual(actual, expected)
    # building seq collections from infile and the split files result in
    # equivalent seq collections
    self.assertEqual(LoadSeqs(data=infile, aligned=False),
                     LoadSeqs(data=actual_seqs, aligned=False))
def test_alignadd(self):
    """testing adding one alignment to another."""
    left = LoadSeqs(data={'a': 'AAAA', 'b': 'TTTT', 'c': 'CCCC'})
    right = LoadSeqs(data={'a': 'GGGG', 'b': '----', 'c': 'NNNN'})
    # '+' is expected to concatenate the two alignments sequence by sequence
    joined = (left + right).todict()
    expected = {'a': 'AAAAGGGG', 'b': 'TTTT----', 'c': 'CCCCNNNN'}
    self.assertEqual(joined, expected)
def test_reversecomplement(self):
    """test reverse complementing of Alignments and SequenceCollection."""
    gapped = {'seq1': '--ACGT--GT---',
              'seq2': 'TTACGTA-GT---',
              'seq3': '--ACGTA-GCC--'}
    gapped_rc = {'seq1': '---AC--ACGT--',
                 'seq2': '---AC-TACGTAA',
                 'seq3': '--GGC-TACGT--'}

    # alignment with gaps
    aln_rc = LoadSeqs(data=gapped, moltype=DNA).rc()
    self.assertEqual(aln_rc.todict(), gapped_rc)

    # check collection, with gaps; rc() and reversecomplement() must agree
    coll = LoadSeqs(data=gapped, moltype=DNA, aligned=False)
    coll_rc = coll.rc()
    self.assertEqual(coll_rc.todict(), gapped_rc)
    self.assertEqual(coll_rc.todict(), coll.reversecomplement().todict())

    # collection with no gaps
    ungapped = {'seq1': 'ACGTGT', 'seq2': 'TTACGTAGT', 'seq3': 'ACGTAGCC'}
    ungapped_rc = {'seq1': 'ACACGT', 'seq2': 'ACTACGTAA',
                   'seq3': 'GGCTACGT'}
    coll_rc = LoadSeqs(data=ungapped, moltype=DNA, aligned=False).rc()
    self.assertEqual(coll_rc.todict(), ungapped_rc)
def test_split_fasta_diff_num_seqs_per_file_alt(self):
    """split_fasta funcs always catches all seqs
    """
    # start with 59 seqs (b/c it's prime, so should make more
    # confusing splits)
    in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
    infile = in_seqs.toFasta().split('\n')

    # test seqs_per_file from 1 through 999
    for i in range(1, 1000):
        # mkstemp is used only to obtain a unique prefix; the descriptor
        # is closed immediately
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)

        actual = split_fasta(infile, i, filename_prefix)

        actual_seqs = []
        for fp in actual:
            # close each split file explicitly rather than leaking handles
            # across the many iterations of this loop
            with open(fp) as split_file:
                actual_seqs.extend(split_file)
        # remove the files now, so if the test fails they still get
        # cleaned up; this includes the empty placeholder that mkstemp
        # created at filename_prefix
        remove_files(list(actual) + [filename_prefix])

        # building seq collections from infile and the split files result
        # in equivalent seq collections
        self.assertEqual(LoadSeqs(data=infile, aligned=False),
                         LoadSeqs(data=actual_seqs, aligned=False))
def setUp(self):
    """Write the PyNAST fixture files and build the aligner under test.

    Creates temp files for the input fasta, four variants of the template
    alignment (as-is, '-' replaced by '.', 'T' replaced by 'U', lower-cased)
    and empty result/failure/log files, records every path in
    ``self._paths_to_clean_up``, and loads the expected alignment and
    expected failures.
    """
    def _make_tmp_file(suffix, contents=''):
        # create a uniquely named temp file holding `contents`; the handle
        # is closed (and therefore flushed) before the path is returned
        path = get_tmp_filename(prefix='PyNastAlignerTests_', suffix=suffix)
        with open(path, 'w') as handle:
            handle.write(contents)
        return path

    self.pynast_test1_input_fp = _make_tmp_file(
        '.fasta', pynast_test1_input_fasta)

    self.pynast_test1_template_fp = _make_tmp_file(
        'template.fasta', pynast_test1_template_fasta)

    self.pynast_test_template_w_dots_fp = _make_tmp_file(
        'template.fasta', pynast_test1_template_fasta.replace('-', '.'))

    self.pynast_test_template_w_u_fp = _make_tmp_file(
        'template.fasta', pynast_test1_template_fasta.replace('T', 'U'))

    self.pynast_test_template_w_lower_fp = _make_tmp_file(
        'template.fasta', pynast_test1_template_fasta.lower())

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = _make_tmp_file('.fasta')
    self.failure_fp = _make_tmp_file('.fasta')
    self.log_fp = _make_tmp_file('.log')

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp,
    ]

    self.pynast_test1_aligner = PyNastAligner({
        'template_filepath': self.pynast_test1_template_fp,
        'min_len': 15,
    })

    self.pynast_test1_expected_aln = LoadSeqs(
        data=pynast_test1_expected_alignment, aligned=DenseAlignment)
    self.pynast_test1_expected_fail = LoadSeqs(
        data=pynast_test1_expected_failure, aligned=False)
def test_withoutAnyGaps(self):
    """test removal of all gaps (any entries in alignment column are gaps)"""
    cases = (
        # columns containing a gap in any sequence are dropped
        ({'seq1': '--ACGT--GT---',
          'seq2': '--ACGTA-GT---',
          'seq3': '--ACGTA-GT---'},
         {'seq1': 'ACGTGT', 'seq2': 'ACGTGT', 'seq3': 'ACGTGT'}),
        # every column has a gap somewhere, so nothing is left
        ({'seq1': 'ACGT', 'seq2': '----', 'seq3': '----'},
         {'seq1': '', 'seq2': '', 'seq3': ''}),
    )
    for seqs, expected in cases:
        alignment = LoadSeqs(data=seqs)
        stripped = alignment.omitGapPositions(allowed_gap_frac=0)
        self.assertEqual(stripped.todict(), expected)
def setUp(self):
    """Load the same four sequences as an alignment and as a collection."""
    seq_data = {
        'a': 'GTACGTACGATC',
        'b': 'GTACGTACGTAC',
        'c': 'GTACGTACGTTC',
        'e': 'GTACGTACTGGT',
    }
    # each loader gets its own copy of the dict
    self.al = LoadSeqs(data=dict(seq_data))
    self.collection = LoadSeqs(data=dict(seq_data), aligned=False)
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the sequences in seq_path against the template with PyNAST.

    seq_path: path to the fasta file of candidate sequences.
    result_path: if given, aligned sequences are written there as fasta and
        None is returned; otherwise the aligned sequences are returned as a
        DenseAlignment (or {} if LoadSeqs raises ValueError).
    log_path: passed to NastLogger.
    failure_path: if given, sequences that failed to align are written
        there as fasta.

    Raises KeyError if the template alignment cannot be loaded as DNA
    because of an unsupported character.
    """
    # load candidate sequences
    # NOTE(review): mode 'U' is kept as in the original; it is not accepted
    # by recent Python 3 releases -- confirm the supported interpreter.
    seq_file = open(seq_path, 'U')
    try:
        # parse_fasta may read lazily, so seq_file stays open until this
        # method has finished with its results
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            # replace '.' characters with '-' characters
            template_alignment = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        try:
            template_alignment = LoadSeqs(data=template_alignment,
                                          moltype=DNA,
                                          aligned=DenseAlignment)
        except KeyError as e:
            raise KeyError('Only ACGT-. characters can be contained in template alignments.' +
                           ' The offending character was: %s' % e)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        if failure_path is not None:
            with open(failure_path, 'w') as fail_file:
                for seq in pynast_failed:
                    fail_file.write(seq.toFasta())
                    fail_file.write('\n')

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                for seq in pynast_aligned:
                    result_file.write(seq.toFasta())
                    result_file.write('\n')
            return None

        try:
            return LoadSeqs(data=pynast_aligned, aligned=DenseAlignment)
        except ValueError:
            return {}
    finally:
        seq_file.close()
def setUp(self):
    """Sets up environment for tests
    """
    single_seq_fasta = \
        '>seq0\nACUGCGCGGAUCGAUCGAUCGAUCGAUGCAUUUUACGAUCGCCA\n'
    self.random_seq = LoadSeqs(data=single_seq_fasta, aligned=False)
    self.rrna = LoadSeqs(data=RRNA, aligned=False)
    self.rrna_aln = LoadSeqs(data=REF_ALN)
    # sequence database fixture located relative to ABSPATH
    self.seq_db_path = os.path.join(ABSPATH, 'test_data',
                                    'Rfam10_part.fasta')
def test_call_write_to_file(self):
    """ReferenceRepSetPicker.__call__ otu map correctly written to file"""
    picker_params = {'Algorithm': 'first', 'ChoiceF': first_id}
    app = ReferenceRepSetPicker(params=picker_params)
    app(self.tmp_seq_filepath,
        self.tmp_otu_filepath,
        self.ref_seq_filepath,
        result_path=self.result_filepath)
    # compare what was written to disk with the expected fasta content
    observed = LoadSeqs(self.result_filepath, aligned=False)
    expected = LoadSeqs(data=rep_seqs_reference_result_file_exp,
                        aligned=False)
    self.assertEqual(observed, expected)
def test_translate(self):
    """getTranslation returns one translated sequence per input sequence"""
    seq_sets = (
        {'seq1': 'GATTTT', 'seq2': 'GATC??'},
        {'seq1': 'GAT---', 'seq2': '?GATCT'},
    )
    for seqs in seq_sets:
        dna_aln = LoadSeqs(data=seqs, moltype=DNA)
        self.assertEqual(len(dna_aln.getTranslation()), 2)
        # check for a failure when no moltype specified
        # NOTE(review): an AttributeError is tolerated but not required, so
        # this passes whether or not the call raises -- confirm whether
        # assertRaises was intended.
        untyped_aln = LoadSeqs(data=seqs)
        try:
            untyped_aln.getTranslation()
        except AttributeError:
            pass
def optimization(result, aln, tree1, tree2):
    """Fit a JC69 model to each tree on its assigned sites.

    ``result[i]`` assigns column ``i`` of ``aln``: 1 sends it to tree1,
    2 to tree2, and 0 to both.  Each tree is then optimised on its own
    sub-alignment.

    Returns ``(tree_parameter, likelihood)`` where ``tree_parameter`` is
    ``[[p1, q1, r1], [p2, q2, r2]]`` derived from the optimised branch
    lengths and ``likelihood`` is the sum of the two log-likelihoods.
    """
    empty_seqs = [('a', ''), ('c', ''), ('b', ''), ('d', '')]
    # get the sites for each tree according to the assignments
    sites1 = LoadSeqs(data=list(empty_seqs), moltype=DNA)
    sites2 = LoadSeqs(data=list(empty_seqs), moltype=DNA)
    for pos in range(len(aln)):
        assignment = result[pos]
        if assignment == 1:
            sites1 = sites1 + aln[pos]
        elif assignment == 2:
            sites2 = sites2 + aln[pos]
        elif assignment == 0:
            sites1 = sites1 + aln[pos]
            sites2 = sites2 + aln[pos]

    sub_model = JC69()

    def _fit(tree, sites):
        # calculate the likelihood and do optimization. optimise generates
        # new tree parameters
        lf = sub_model.makeLikelihoodFunction(tree)
        lf.setAlignment(sites)
        lf.optimise(local=True)
        log_lik = lf.getLogLikelihood()
        # As the tree is symmetric, get p, q, r from the branch lengths
        p = (lf.getParamValue('length', 'a') +
             lf.getParamValue('length', 'c')) / 2.0
        q = (lf.getParamValue('length', 'b') +
             lf.getParamValue('length', 'd')) / 2.0
        r = lf.getParamValue('length', 'edge.1') + \
            lf.getParamValue('length', 'edge.0')
        return [p, q, r], log_lik

    params1, likelihood1 = _fit(tree1, sites1)
    params2, likelihood2 = _fit(tree2, sites2)

    # As likelihood is in log, plus together to get the total likelihood
    # for the whole sites
    tree_parameter = [params1, params2]
    likelihood = likelihood1 + likelihood2
    return tree_parameter, likelihood
def test_degap(self):
    """test stripping gaps from collections and alignments"""
    gapped = {'seq1': '--ACGT--GT---',
              'seq2': '--ACGTA-GT---',
              'seq3': '--ACGTA-GT---'}
    expect = {'seq1': 'ACGTGT', 'seq2': 'ACGTAGT', 'seq3': 'ACGTAGT'}

    # alignment
    from_aln = LoadSeqs(data=dict(gapped)).degap()
    self.assertEqual(from_aln.todict(), expect)

    # unaligned collection; the moltype must survive degapping
    from_coll = LoadSeqs(data=dict(gapped), aligned=False,
                         moltype=DNA).degap()
    self.assertEqual(from_coll.todict(), expect)
    self.assertEqual(from_coll.MolType, DNA)
def test_withoutTerminalStopCodons(self):
    """test without terminal stop handling"""
    # unaligned collection: only seq1 ends in a stop codon
    coll = LoadSeqs(
        data={'seq1': 'ACGTAA', 'seq2': 'ACGACG', 'seq3': 'ACGCGT'},
        moltype=DNA, aligned=False)
    trimmed = coll.withoutTerminalStopCodons().todict()
    self.assertEqual(trimmed['seq1'], 'ACG')  # note: not 'acg---'
    self.assertEqual(trimmed['seq2'], 'ACGACG')

    # alignment: every sequence ends in a stop codon
    aln = LoadSeqs(
        data={'seq1': 'ACGTAA', 'seq2': 'ACGTGA', 'seq3': 'ACGTAA'},
        moltype=DNA)
    trimmed = aln.withoutTerminalStopCodons().todict()
    self.assertEqual(trimmed['seq1'], 'ACG')  # note: not 'acg---'
    self.assertEqual(trimmed['seq2'], 'ACG')
    self.assertEqual(trimmed['seq3'], 'ACG')
def filter_samples(prefs, data, dir_path='', filename=None):
    """processes the filtering of the otus file and representative seq set,
    then writes filtered otus and filtered representative seq set files

    prefs: filtering preferences, passed to filter_otus and
        filter_aln_by_otus.
    data: mapping with keys 'aln' and 'otus'.
    dir_path, filename: outputs are written to
        '<dir_path>/<filename>_sfiltered_otus.txt',
        '<dir_path>/<filename>_sremoved.fasta' and
        '<dir_path>/<filename>_sfiltered.fasta'.

    Raises ValueError if either the removed or the remaining sequences
    cannot be loaded as a sequence collection.
    """
    aln = data['aln']
    otus = data['otus']

    # filter the otus file based on which samples to remove
    new_otus_list = filter_otus(otus, prefs)

    # NOTE(review): with the default dir_path='' these paths begin with '/'
    # -- confirm that callers always pass a directory.
    filtered_otus_output_filepath = '%s/%s_sfiltered_otus.txt' \
        % (dir_path, filename)

    # Write out a new otus file: one line per OTU, id then tab-separated
    # members
    with open(filtered_otus_output_filepath, 'w') as otus_file:
        for key in new_otus_list:
            otus_file.write(key[0])
            for j in key[1]:
                otus_file.write('\t' + str(j))
            otus_file.write('\n')

    # filter seq set
    filtered_seqs, removed_seqs = filter_aln_by_otus(aln, prefs)

    # write a fasta containing list of sequences removed from
    # representative set
    try:
        removed_seqs = LoadSeqs(data=removed_seqs, aligned=False)
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # are not converted into a misleading ValueError
        raise ValueError(
            'No sequences were removed. Did you specify the correct Sample ID?')
    output_filepath2 = '%s/%s_sremoved.fasta' % (dir_path, filename)
    with open(output_filepath2, 'w') as output_file2:
        output_file2.write(removed_seqs.toFasta())

    # write a fasta containing the filtered representative seqs
    try:
        filtered_seqs = LoadSeqs(data=filtered_seqs, aligned=False)
    except Exception:
        raise ValueError(
            'No sequences were remaining in the fasta file. Did you remove all Sample ID\'s?')
    output_filepath = '%s/%s_sfiltered.fasta' % (dir_path, filename)
    with open(output_filepath, 'w') as output_file:
        output_file.write(filtered_seqs.toFasta())
def pair_hmm_align_unaligned_seqs(seqs, moltype, params=None):
    """Globally align exactly two unaligned sequences.

    This needs to be moved to cogent.align.align

    seqs: sequence data accepted by LoadSeqs; must hold exactly two seqs.
    moltype: molecule type passed to LoadSeqs.
    params: optional mapping that may define 'gap_open' (default 5),
        'gap_extend' (default 2) and 'score_matrix' (default: a DNA scoring
        dict with match=1, transition=-1, transversion=-1).

    Returns the result of global_pairwise on the two sequences.
    Raises ValueError if seqs does not contain exactly two sequences.
    """
    # None instead of a shared mutable {} default
    if params is None:
        params = {}

    seqs = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        # call form of raise: valid on both Python 2 and Python 3
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    gap_open = params.get('gap_open', 5)
    gap_extend = params.get('gap_extend', 2)

    try:
        score_matrix = params['score_matrix']
    except KeyError:
        # build the default matrix only when the caller supplied none
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return global_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
def test_logdet_variance(self):
    """calculate logdet variance consistent with hand calculation"""
    data = [
        ('seq1',
         "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"),
        ('seq2',
         "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"),
    ]
    first, second = data[0][1], data[1][1]
    aln = LoadSeqs(data=data, moltype=DNA)
    logdet_calc = LogDetPair(moltype=DNA, alignment=aln)

    # with the tk adjustment the variance entry is expected to be None
    logdet_calc.run(use_tk_adjustment=True, show_progress=False)
    self.assertEqual(logdet_calc.Variances[1, 1], None)

    # hand calculation: joint frequency matrix of aligned base pairs
    base_index = dict((base, pos) for pos, base in enumerate('ACGT'))
    J = numpy.zeros((4, 4))
    for base1, base2 in zip(first, second):
        J[base_index[base1], base_index[base2]] += 1
    # empty diagonal cells get a 0.5 count
    for k in range(4):
        if J[k, k] == 0:
            J[k, k] += 0.5
    J /= J.sum()
    M = numpy.linalg.inv(J)

    var = 0.
    for row in range(4):
        for col in range(4):
            var += M[col, row]**2 * J[row, col] - 1
    var /= 16 * len(first)

    logdet_calc.run(use_tk_adjustment=False, show_progress=False)
    logdet_calc.getPairwiseDistances()
    self.assertFloatEqual(logdet_calc.Variances[1, 1], var, eps=1e-3)
def getResult(self, aln_path, *args, **kwargs):
    """Returns a tree built from the alignment at aln_path.

    The tree is built by ``self.Params['Module']`` via its
    ``build_tree_from_alignment`` function.  If the keyword argument
    ``root_method`` is 'midpoint' the tree is passed through root_midpt;
    for any other value, or when it is absent, the tree is returned as
    built.

    Currently does not allow parameter tuning of program and uses
    default parameters -- this is bad and should be fixed.

    #TODO: allow command-line access to important aln params.
    """
    module = self.Params['Module']
    # standard qiime says we just consider the first word as the unique ID
    # the rest of the defline of the fasta alignment often doesn't match
    # the otu names in the otu table
    # NOTE(review): the keyword is spelled 'Aligned' here, while other
    # LoadSeqs calls use 'aligned' -- kept unchanged, confirm the intent.
    seqs = LoadSeqs(aln_path, Aligned=True,
                    label_to_name=lambda x: x.split()[0])
    result = module.build_tree_from_alignment(seqs, moltype=DNA)

    # Look the option up without a try block so a KeyError raised inside
    # root_midpt itself is no longer silently swallowed.
    if kwargs.get('root_method') == 'midpoint':
        result = root_midpt(result)
    return result
def BestLogLikelihood(aln, alphabet=None, exclude_chars=None,
                      allowed_chars='ACGT', motif_length=None,
                      return_length=False):
    """returns the best log-likelihood according to Goldman 1993.

    Arguments:
        - aln: an alignment; it is re-loaded from its todict() form
        - alphabet: a sequence alphabet object. When given, its MolType is
          used to load the alignment and its motif length overrides
          motif_length.
        - motif_length: 1 for nucleotide, 2 for dinucleotide, etc ..
        - exclude_chars: a series of characters used to exclude motifs
        - allowed_chars: only motifs that contain a subset of these are
          allowed
        - return_length: whether to also return the number of alignment
          columns
    """
    assert alphabet or motif_length, "Must provide either an alphabet or a"\
        " motif_length"

    # need to use the alphabet, so we can enforce character compliance
    load_kwargs = {}
    if alphabet:
        load_kwargs['moltype'] = alphabet.MolType
        motif_length = alphabet.getMotifLen()

    aln = LoadSeqs(data=aln.todict(), **load_kwargs)
    columns = aligned_columns_to_rows(aln, motif_length, exclude_chars,
                                      allowed_chars)
    num_cols = len(columns)
    log_likelihood = get_G93_lnL_from_array(columns)
    if not return_length:
        return log_likelihood
    return log_likelihood, num_cols
def test_getBySequenceAnnotation(self):
    """alignment slice selected by an annotation on one sequence"""
    aln = LoadSeqs(data={'a': 'ATCGAAATCGAT',
                         'b': 'ATCGA--TCGAT'})
    # annotate the gapped sequence, then fetch the matching region
    seq_b = aln.getSeq('b')
    seq_b.addAnnotation(Feature, 'test_type', 'test_label', [(4, 6)])
    regions = aln.getBySequenceAnnotation('b', 'test_type')
    self.assertEqual(regions[0].todict(), {'b': 'A--T', 'a': 'AAAT'})
def test_paralinear_variance(self):
    """calculate paralinear variance consistent with hand calculation"""
    data = [
        ('seq1',
         "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"),
        ('seq2',
         "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"),
    ]
    first, second = data[0][1], data[1][1]
    aln = LoadSeqs(data=data, moltype=DNA)
    paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
    paralinear_calc.run(show_progress=False)

    # hand calculation: joint frequency matrix of aligned base pairs
    base_index = dict((base, pos) for pos, base in enumerate('ACGT'))
    J = numpy.zeros((4, 4))
    for base1, base2 in zip(first, second):
        J[base_index[base1], base_index[base2]] += 1
    # empty diagonal cells get a 0.5 count
    for k in range(4):
        if J[k, k] == 0:
            J[k, k] += 0.5
    J /= J.sum()
    M = numpy.linalg.inv(J)
    # row sums and column sums of J
    row_freqs, col_freqs = J.sum(1), J.sum(0)

    var = 0.
    for row in range(4):
        for col in range(4):
            var += M[col, row]**2 * J[row, col]
        var -= 1 / numpy.sqrt(row_freqs[row] * col_freqs[row])
    var /= 16 * len(first)

    self.assertFloatEqual(paralinear_calc.Variances[1, 1], var, eps=1e-3)
def test_setMotifProbs(self):
    """Mprobs supplied to the parameter controller"""
    nuc_model = cogent.evolve.substitution_model.Nucleotide(
        model_gaps=True, motif_probs=None)
    lf = nuc_model.makeLikelihoodFunction(self.tree,
                                          motif_probs_from_align=False)

    # explicitly supplied probabilities are returned unchanged
    mprobs = {'A': 0.1, 'C': 0.2, 'G': 0.2, 'T': 0.5, '-': 0.0}
    lf.setMotifProbs(mprobs)
    self.assertEqual(lf.getMotifProbs(), mprobs)

    # probabilities taken from data, without and with a pseudocount
    first_column = self.al[:1]
    lf.setMotifProbsFromData(first_column, is_const=True)
    self.assertEqual(lf.getMotifProbs()['G'], 0.6)
    lf.setMotifProbsFromData(self.al[:1], pseudocount=1)
    self.assertNotEqual(lf.getMotifProbs()['G'], 0.6)

    # test with consideration of ambiguous states
    ambig_aln = LoadSeqs(data={'seq1': 'ACGTAAGNA',
                               'seq2': 'ACGTANGTC',
                               'seq3': 'ACGTACGTG'})
    lf.setMotifProbsFromData(ambig_aln, include_ambiguity=True,
                             is_const=True)
    motif_probs = dict(lf.getMotifProbs())
    correct_probs = {'A': 8.5 / 27,
                     'C': 5.5 / 27,
                     '-': 0.0,
                     'T': 5.5 / 27,
                     'G': 7.5 / 27}
    self.assertEqual(motif_probs, correct_probs)
    self.assertEqual(sum(motif_probs.values()), 1.0)
def test_logdet_pair_aa(self):
    """logdet shouldn't fail to produce distances for aa seqs"""
    protein_aln = LoadSeqs('data/brca1_5.paml',
                           moltype=DNA).getTranslation()
    logdet_calc = LogDetPair(moltype=PROTEIN, alignment=protein_aln)
    logdet_calc.run(use_tk_adjustment=True, show_progress=False)
    # only checks that distances can be produced without raising
    logdet_calc.getPairwiseDistances()
def setUp(self):
    """Write the Infernal fixture files and build the aligner under test.

    Creates temp files for the input fasta and the Stockholm template plus
    empty result and log files, records every path in
    ``self._paths_to_clean_up``, and loads the expected alignment.
    """
    def _make_tmp_file(suffix, contents=''):
        # create a uniquely named temp file holding `contents`; the handle
        # is closed (and therefore flushed) before the path is returned
        path = get_tmp_filename(prefix='InfernalAlignerTests_',
                                suffix=suffix)
        with open(path, 'w') as handle:
            handle.write(contents)
        return path

    self.infernal_test1_input_fp = _make_tmp_file(
        '.fasta', infernal_test1_input_fasta)

    self.infernal_test1_template_fp = _make_tmp_file(
        'template.sto', infernal_test1_template_stockholm)

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = _make_tmp_file('.fasta')
    self.log_fp = _make_tmp_file('.log')

    self._paths_to_clean_up = [
        self.infernal_test1_input_fp,
        self.result_fp,
        self.log_fp,
        self.infernal_test1_template_fp,
    ]

    self.infernal_test1_aligner = InfernalAligner({
        'template_filepath': self.infernal_test1_template_fp,
    })
    self.infernal_test1_expected_aln = LoadSeqs(
        data=infernal_test1_expected_alignment,
        aligned=Alignment,
        moltype=DNA)
def test_paralinear_pair_aa(self):
    """paralinear shouldn't fail to produce distances for aa seqs"""
    protein_aln = LoadSeqs('data/brca1_5.paml',
                           moltype=DNA).getTranslation()
    paralinear_calc = ParalinearPair(moltype=PROTEIN,
                                     alignment=protein_aln)
    paralinear_calc.run(show_progress=False)
    # only checks that distances can be produced without raising
    paralinear_calc.getPairwiseDistances()
def test_paralinear_distance(self):
    """calculate paralinear distance consistent with hand calculation"""
    data = [
        ('seq1',
         "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"),
        ('seq2',
         "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"),
    ]
    aln = LoadSeqs(data=data, moltype=DNA)
    paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
    paralinear_calc.run(show_progress=False)

    # hand calculation: joint frequency matrix of aligned base pairs
    index = dict(zip('ACGT', range(4)))
    J = numpy.zeros((4, 4))
    for base1, base2 in zip(data[0][1], data[1][1]):
        J[index[base1], index[base2]] += 1
    # empty diagonal cells get a 0.5 count
    for i in range(4):
        if J[i, i] == 0:
            J[i, i] += 0.5
    J /= J.sum()
    # row sums and column sums of J; the inverse of J is not needed for
    # the distance, so it is not computed here
    f = J.sum(1), J.sum(0)
    dist = -0.25 * numpy.log(
        numpy.linalg.det(J) / numpy.sqrt(f[0].prod() * f[1].prod()))

    self.assertFloatEqual(paralinear_calc.Dists[1, 1], dist, eps=1e-3)
def test_logdet_pair_dna(self):
    """logdet should produce distances that match MEGA"""
    dna_aln = LoadSeqs('data/brca1_5.paml', moltype=DNA)
    logdet_calc = LogDetPair(moltype=DNA, alignment=dna_aln)
    logdet_calc.run(use_tk_adjustment=True, show_progress=False)
    dists = logdet_calc.getPairwiseDistances()
    # reference values, listed for both orderings of every pair
    all_expected = {
        ('Human', 'NineBande'): 0.075336929999999996,
        ('NineBande', 'DogFaced'): 0.0898575452,
        ('DogFaced', 'Human'): 0.1061747919,
        ('HowlerMon', 'DogFaced'): 0.0934480008,
        ('Mouse', 'HowlerMon'): 0.26422862920000001,
        ('NineBande', 'Human'): 0.075336929999999996,
        ('HowlerMon', 'NineBande'): 0.062202897899999998,
        ('DogFaced', 'NineBande'): 0.0898575452,
        ('DogFaced', 'HowlerMon'): 0.0934480008,
        ('Human', 'DogFaced'): 0.1061747919,
        ('Mouse', 'Human'): 0.26539976700000001,
        ('NineBande', 'HowlerMon'): 0.062202897899999998,
        ('HowlerMon', 'Human'): 0.036571181899999999,
        ('DogFaced', 'Mouse'): 0.2652555144,
        ('HowlerMon', 'Mouse'): 0.26422862920000001,
        ('Mouse', 'DogFaced'): 0.2652555144,
        ('NineBande', 'Mouse'): 0.22754789210000001,
        ('Mouse', 'NineBande'): 0.22754789210000001,
        ('Human', 'Mouse'): 0.26539976700000001,
        ('Human', 'HowlerMon'): 0.036571181899999999,
    }
    for pair in dists:
        self.assertFloatEqual(dists[pair], all_expected[pair])
def setUp(self):
    """Create the alignment, tree and nucleotide model shared by tests."""
    # length all edges 1 except c=2. b&d transitions, all others
    # transversions
    five_seqs = {
        'a': 'tata',
        'b': 'tgtc',
        'c': 'gcga',
        'd': 'gaac',
        'e': 'gagc',
    }
    self.al = LoadSeqs(data=five_seqs)
    self.tree = LoadTree(treestring='((a,b),(c,d),e);')
    self.model = cogent.evolve.substitution_model.Nucleotide(
        do_scaling=True,
        equal_motif_probs=True,
        model_gaps=True)