示例#1
0
def create_seqstructs(cfo, numclusts):
    """Build one SeqStructure per cluster from an open cluster file.

    The first two lines of ``cfo`` are read directly as the first cluster
    label (leading/trailing ">" and whitespace stripped) and its structure.
    The rest of the file is parsed with MinimalFastaParser: a record whose
    header contains "cluster_" starts a new cluster and its sequence field
    is used as that cluster's structure; any other record is a member
    sequence of the current cluster.

    Returns the list of SeqStructure objects, in file order.
    Raises AssertionError if their count differs from ``numclusts``.
    """
    def build_seqstruct(structure, members, label):
        # consensus of the cluster's member sequences, loaded as RNA
        alignment = LoadSeqs(data=members, moltype=RNA)
        consensus = ''.join(alignment.majorityConsensus())
        return SeqStructure(structure, consensus, label)

    cluster_label = cfo.readline().strip(">").strip()
    structure = cfo.readline().strip()
    members = []
    built = []
    for label, sequence in MinimalFastaParser(cfo):
        if "cluster_" not in label:
            members.append((label, sequence))
            continue
        # a new cluster header closes the previous cluster
        built.append(build_seqstruct(structure, members, cluster_label))
        cluster_label, structure, members = label, sequence, []
    # the last cluster has no following header to close it
    built.append(build_seqstruct(structure, members, cluster_label))

    found = len(built)
    if found != numclusts:
        raise AssertionError("%i structures, %i clusters. Not all clusters "
                             "folded!" % (found, numclusts))
    return built
    def test_replaceSeqs(self):
        """replaceSeqs maps protein alignment gaps onto the codon seqs"""
        protein_aln = LoadSeqs(moltype=PROTEIN, data={
            'FlyingFox': 'C-TNAH',
            'DogFaced': 'CGTNT-',
            'FreeTaile': '-GTDTH',
            'LittleBro': 'C-TD-H',
            'TombBat': 'C--STH'
        })
        codon_seqs = LoadSeqs(moltype=DNA, aligned=False, data={
            'TombBat': 'TGTAGTACTCAT',
            'FreeTaile': 'GGCACAGATACTCAT',
            'FlyingFox': 'TGTACAAATGCTCAT',
            'LittleBro': 'TGTACAGATCAT',
            'DogFaced': 'TGTGGCACAAATACT'
        })

        observed = protein_aln.replaceSeqs(codon_seqs).todict()

        # each protein gap is expected to appear as a three-column codon gap
        expected = (
            ('FlyingFox', 'TGT---ACAAATGCTCAT'),
            ('DogFaced', 'TGTGGCACAAATACT---'),
            ('FreeTaile', '---GGCACAGATACTCAT'),
            ('LittleBro', 'TGT---ACAGAT---CAT'),
            ('TombBat', 'TGT------AGTACTCAT'),
        )
        for name, gapped_codons in expected:
            self.assertEqual(observed[name], gapped_codons)
示例#3
0
    def test_split_fasta_diff_num_seqs_per_file(self):
        """split_fasta funcs as expected when diff num seqs go to each file
        """
        # mkstemp is only used to reserve a unique prefix; the descriptor is
        # closed straight away
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        infile = [
            '>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA', '>seq3',
            'CCTT--AA'
        ]

        actual = split_fasta(infile, 2, filename_prefix)

        # read the split files back, closing every handle, and clean up even
        # if reading fails
        actual_seqs = []
        try:
            for fp in actual:
                with open(fp) as split_file:
                    actual_seqs.extend(split_file)
        finally:
            # also remove the placeholder file mkstemp created for the prefix
            remove_files(list(actual) + [filename_prefix])

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(LoadSeqs(data=infile, aligned=False),
                         LoadSeqs(data=actual_seqs, aligned=False))
 def test_alignadd(self):
     """Adding two alignments joins the sequences that share a name."""
     left = LoadSeqs(data={'a': 'AAAA', 'b': 'TTTT', 'c': 'CCCC'})
     right = LoadSeqs(data={'a': 'GGGG', 'b': '----', 'c': 'NNNN'})
     expected = {
         'a': 'AAAAGGGG',
         'b': 'TTTT----',
         'c': 'CCCCNNNN',
     }
     combined = left + right
     self.assertEqual(combined.todict(), expected)
 def test_reversecomplement(self):
     """rc() reverse complements alignments and sequence collections."""
     gapped = {
         'seq1': '--ACGT--GT---',
         'seq2': 'TTACGTA-GT---',
         'seq3': '--ACGTA-GCC--'
     }
     gapped_rc = {
         'seq1': '---AC--ACGT--',
         'seq2': '---AC-TACGTAA',
         'seq3': '--GGC-TACGT--'
     }
     # aligned sequences that contain gaps
     alignment = LoadSeqs(data=gapped, moltype=DNA)
     self.assertEqual(alignment.rc().todict(), gapped_rc)

     # unaligned collection built from the same gapped data
     collection = LoadSeqs(data=gapped, moltype=DNA, aligned=False)
     collection_rc = collection.rc()
     self.assertEqual(collection_rc.todict(), gapped_rc)
     # reversecomplement() must agree with rc()
     self.assertEqual(collection_rc.todict(),
                      collection.reversecomplement().todict())

     # unaligned collection with no gaps and unequal lengths
     ungapped = {'seq1': 'ACGTGT', 'seq2': 'TTACGTAGT', 'seq3': 'ACGTAGCC'}
     ungapped_rc = {'seq1': 'ACACGT', 'seq2': 'ACTACGTAA',
                    'seq3': 'GGCTACGT'}
     collection = LoadSeqs(data=ungapped, moltype=DNA, aligned=False)
     self.assertEqual(collection.rc().todict(), ungapped_rc)
示例#6
0
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta funcs always catches all seqs
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
        infile = in_seqs.toFasta().split('\n')

        # test seqs_per_file from 1 to 1000
        for i in range(1, 1000):
            # mkstemp is only used to reserve a unique prefix; the descriptor
            # is closed straight away
            fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                          prefix='split_fasta_tests',
                                          suffix='')
            close(fd)

            actual = split_fasta(infile, i, filename_prefix)

            # read the split files back with every handle closed; the files
            # are removed before asserting (and even if reading fails) so a
            # failing test never leaves them behind
            actual_seqs = []
            try:
                for fp in actual:
                    with open(fp) as split_file:
                        actual_seqs.extend(split_file)
            finally:
                # also remove the placeholder file mkstemp created for the
                # prefix, otherwise one temp file per iteration is left over
                remove_files(list(actual) + [filename_prefix])

            # building seq collections from infile and the split files result in
            # equivalent seq collections
            self.assertEqual(LoadSeqs(data=infile, aligned=False),
                             LoadSeqs(data=actual_seqs, aligned=False))
示例#7
0
    def setUp(self):
        """Create the temp files, aligner and expected results for the tests.

        Writes the input fasta and four template variants (as-is, gaps as
        dots, T replaced by U, lower-cased) to temp files, touches the
        result/failure/log files so they can always be cleaned up, and
        records every path in self._paths_to_clean_up.
        """
        def write_tmp_file(suffix, contents):
            # use a context manager so the data is flushed and the handle is
            # closed before any test reads the file back
            fp = get_tmp_filename(prefix='PyNastAlignerTests_', suffix=suffix)
            with open(fp, 'w') as out:
                out.write(contents)
            return fp

        self.pynast_test1_input_fp = write_tmp_file(
            '.fasta', pynast_test1_input_fasta)

        self.pynast_test1_template_fp = write_tmp_file(
            'template.fasta', pynast_test1_template_fasta)

        self.pynast_test_template_w_dots_fp = write_tmp_file(
            'template.fasta', pynast_test1_template_fasta.replace('-', '.'))

        self.pynast_test_template_w_u_fp = write_tmp_file(
            'template.fasta', pynast_test1_template_fasta.replace('T', 'U'))

        self.pynast_test_template_w_lower_fp = write_tmp_file(
            'template.fasta', pynast_test1_template_fasta.lower())

        # create temp file names (and touch them so we can reliably
        # clean them up)
        self.result_fp = write_tmp_file('.fasta', '')
        self.failure_fp = write_tmp_file('.fasta', '')
        self.log_fp = write_tmp_file('.log', '')

        self._paths_to_clean_up = [
            self.pynast_test1_input_fp,
            self.result_fp,
            self.failure_fp,
            self.log_fp,
            self.pynast_test1_template_fp,
            self.pynast_test_template_w_dots_fp,
            self.pynast_test_template_w_u_fp,
            self.pynast_test_template_w_lower_fp
        ]

        self.pynast_test1_aligner = PyNastAligner({
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 15,
        })

        self.pynast_test1_expected_aln = \
            LoadSeqs(
                data=pynast_test1_expected_alignment,
                aligned=DenseAlignment)
        self.pynast_test1_expected_fail = \
            LoadSeqs(data=pynast_test1_expected_failure, aligned=False)
 def test_withoutAnyGaps(self):
     """omitGapPositions(allowed_gap_frac=0) drops columns holding any gap"""
     cases = [
         # gaps shared by all seqs plus one column gapped in seq1 only
         ({'seq1': '--ACGT--GT---',
           'seq2': '--ACGTA-GT---',
           'seq3': '--ACGTA-GT---'},
          {'seq1': 'ACGTGT', 'seq2': 'ACGTGT', 'seq3': 'ACGTGT'}),
         # every column has a gap in some sequence, so nothing is left
         ({'seq1': 'ACGT', 'seq2': '----', 'seq3': '----'},
          {'seq1': '', 'seq2': '', 'seq3': ''}),
     ]
     for seqs, expected in cases:
         gapless = LoadSeqs(data=seqs).omitGapPositions(allowed_gap_frac=0)
         self.assertEqual(gapless.todict(), expected)
示例#9
0
 def setUp(self):
     """Load the same four sequences as an alignment and as a collection."""
     def four_seqs():
         # a fresh dict per call so the two objects never share input data
         return {
             'a': 'GTACGTACGATC',
             'b': 'GTACGTACGTAC',
             'c': 'GTACGTACGTTC',
             'e': 'GTACGTACTGGT',
         }

     self.al = LoadSeqs(data=four_seqs())
     self.collection = LoadSeqs(data=four_seqs(), aligned=False)
示例#10
0
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None):
        """Align the sequences in seq_path against the template alignment.

        seq_path: path to the fasta file of candidate sequences
        result_path: if given, the aligned sequences are written there as
            fasta and None is returned
        log_path: passed to NastLogger
        failure_path: if given, the sequences that failed to align are
            written there as fasta

        Returns None when result_path is provided; otherwise the aligned
        sequences loaded as a DenseAlignment, or {} if LoadSeqs raises
        ValueError while loading them.

        Raises KeyError (with an explanatory message) if loading the
        template alignment raises KeyError.
        """
        def write_fasta(path, seqs):
            # one fasta record per sequence, each followed by a newline
            with open(path, 'w') as out:
                for seq in seqs:
                    out.write(seq.toFasta())
                    out.write('\n')

        # load candidate sequences; the handle is handed to parse_fasta and
        # must stay open until the sequences have been consumed, so it is
        # closed in the finally clause below
        seq_file = open(seq_path, 'U')
        try:
            candidate_sequences = parse_fasta(seq_file)

            # load template sequences
            template_alignment_fp = self.Params['template_filepath']
            with open(template_alignment_fp) as template_file:
                # replace '.' characters with '-' characters
                template_alignment = [
                    (seq_id, seq.replace('.', '-').upper())
                    for seq_id, seq in parse_fasta(template_file)]
            try:
                template_alignment = LoadSeqs(data=template_alignment,
                                              moltype=DNA,
                                              aligned=DenseAlignment)
            except KeyError as e:
                raise KeyError('Only ACGT-. characters can be contained in template alignments.' +
                               ' The offending character was: %s' % e)

            # initialize_logger
            logger = NastLogger(log_path)

            # get function for pairwise alignment method
            pairwise_alignment_f = pairwise_alignment_methods[
                self.Params['pairwise_alignment_method']]

            pynast_aligned, pynast_failed = pynast_seqs(
                candidate_sequences,
                template_alignment,
                min_pct=self.Params['min_pct'],
                min_len=self.Params['min_len'],
                align_unaligned_seqs_f=pairwise_alignment_f,
                logger=logger,
                temp_dir=get_qiime_temp_dir())

            logger.record(str(self))

            if failure_path is not None:
                write_fasta(failure_path, pynast_failed)

            if result_path is not None:
                write_fasta(result_path, pynast_aligned)
                return None

            try:
                return LoadSeqs(data=pynast_aligned, aligned=DenseAlignment)
            except ValueError:
                return {}
        finally:
            seq_file.close()
示例#11
0
    def setUp(self):
        """Load the test sequences and locate the sequence database file
        """
        single_seq_fasta = \
            '>seq0\nACUGCGCGGAUCGAUCGAUCGAUCGAUGCAUUUUACGAUCGCCA\n'
        self.random_seq = LoadSeqs(data=single_seq_fasta, aligned=False)

        self.rrna = LoadSeqs(data=RRNA, aligned=False)
        self.rrna_aln = LoadSeqs(data=REF_ALN)

        test_data_dir = os.path.join(ABSPATH, 'test_data')
        self.seq_db_path = os.path.join(test_data_dir, 'Rfam10_part.fasta')
示例#12
0
 def test_call_write_to_file(self):
     """ReferenceRepSetPicker.__call__ writes the expected seqs to file"""
     picker_params = {'Algorithm': 'first', 'ChoiceF': first_id}
     picker = ReferenceRepSetPicker(params=picker_params)
     picker(self.tmp_seq_filepath,
            self.tmp_otu_filepath,
            self.ref_seq_filepath,
            result_path=self.result_filepath)

     # compare what was written to result_path with the expected data
     observed = LoadSeqs(self.result_filepath, aligned=False)
     expected = LoadSeqs(data=rep_seqs_reference_result_file_exp,
                         aligned=False)
     self.assertEqual(observed, expected)
示例#13
0
 def test_translate(self):
     """getTranslation returns two sequences for the DNA alignments"""
     seq_sets = (
         {'seq1': 'GATTTT', 'seq2': 'GATC??'},
         {'seq1': 'GAT---', 'seq2': '?GATCT'},
     )
     for seqs in seq_sets:
         dna_aln = LoadSeqs(data=seqs, moltype=DNA)
         translated = dna_aln.getTranslation()
         self.assertEqual(len(translated), 2)

         # with no moltype given, an AttributeError from getTranslation is
         # tolerated; the test does not require that it is raised
         untyped_aln = LoadSeqs(data=seqs)
         try:
             untyped_aln.getTranslation()
         except AttributeError:
             pass
示例#14
0
def optimization(result, aln, tree1, tree2):
    """Fit a JC69 model to the sites assigned to each of two trees.

    result[i] decides where alignment position i goes: 1 -> tree1 only,
    2 -> tree2 only, 0 -> both trees.  Each tree is then optimised (local
    optimiser) on its own sites.

    Returns (tree_parameter, likelihood) where tree_parameter is
    [[p1, q1, r1], [p2, q2, r2]] and likelihood is the sum of the two
    log-likelihoods.  For each tree, p is the mean of the 'a' and 'c'
    branch lengths, q the mean of 'b' and 'd', and r the sum of the
    'edge.1' and 'edge.0' lengths.
    """
    def empty_alignment():
        # zero-length DNA alignment with the four taxa, used as the seed the
        # selected positions are appended to
        return LoadSeqs(data=[('a', ''), ('c', ''), ('b', ''), ('d', '')],
                        moltype=DNA)

    def fit(model, tree, sites):
        # optimise the tree on its sites; optimise generates the new branch
        # lengths that are read back below
        lf = model.makeLikelihoodFunction(tree)
        lf.setAlignment(sites)
        lf.optimise(local=True)
        log_lik = lf.getLogLikelihood()
        # the tree is treated as symmetric, so p, q, r are derived from the
        # six branch lengths
        p = (lf.getParamValue('length', 'a') +
             lf.getParamValue('length', 'c')) / 2.0
        q = (lf.getParamValue('length', 'b') +
             lf.getParamValue('length', 'd')) / 2.0
        r = lf.getParamValue('length', 'edge.1') + \
            lf.getParamValue('length', 'edge.0')
        return [p, q, r], log_lik

    # get the sites for each tree according to the assignments
    sites1 = empty_alignment()
    sites2 = empty_alignment()
    for pos in range(len(aln)):
        assignment = result[pos]
        if assignment == 1:
            sites1 = sites1 + aln[pos]
        if assignment == 2:
            sites2 = sites2 + aln[pos]
        if assignment == 0:
            sites1 = sites1 + aln[pos]
            sites2 = sites2 + aln[pos]

    model = JC69()
    params1, log_lik1 = fit(model, tree1, sites1)
    params2, log_lik2 = fit(model, tree2, sites2)

    # the likelihoods are logs, so their sum is the total over all sites
    return [params1, params2], log_lik1 + log_lik2
示例#15
0
 def test_degap(self):
     """degap strips gaps from both alignments and collections"""
     gapped = {
         'seq1': '--ACGT--GT---',
         'seq2': '--ACGTA-GT---',
         'seq3': '--ACGTA-GT---',
     }
     expect = {'seq1': 'ACGTGT', 'seq2': 'ACGTAGT', 'seq3': 'ACGTAGT'}

     # aligned input
     from_alignment = LoadSeqs(data=dict(gapped)).degap()
     self.assertEqual(from_alignment.todict(), expect)

     # unaligned DNA input; the moltype must survive degapping
     collection = LoadSeqs(data=dict(gapped), aligned=False, moltype=DNA)
     from_collection = collection.degap()
     self.assertEqual(from_collection.todict(), expect)
     self.assertEqual(from_collection.MolType, DNA)
示例#16
0
 def test_withoutTerminalStopCodons(self):
     """withoutTerminalStopCodons trims trailing stops, without padding"""
     # unaligned collection: only seq1 ends in a stop codon
     collection = LoadSeqs(
         data={'seq1': 'ACGTAA', 'seq2': 'ACGACG', 'seq3': 'ACGCGT'},
         moltype=DNA, aligned=False)
     trimmed = collection.withoutTerminalStopCodons().todict()
     self.assertEqual(trimmed['seq1'], 'ACG')   # note: not 'acg---'
     self.assertEqual(trimmed['seq2'], 'ACGACG')

     # alignment: every sequence ends in a stop codon
     alignment = LoadSeqs(
         data={'seq1': 'ACGTAA', 'seq2': 'ACGTGA', 'seq3': 'ACGTAA'},
         moltype=DNA)
     trimmed = alignment.withoutTerminalStopCodons().todict()
     self.assertEqual(trimmed['seq1'], 'ACG')   # note: not 'acg---'
     self.assertEqual(trimmed['seq2'], 'ACG')
     self.assertEqual(trimmed['seq3'], 'ACG')
示例#17
0
def filter_samples(prefs, data, dir_path='', filename=None):
    """Filter the otus and representative seq set, then write the results.

    prefs: passed through to filter_otus and filter_aln_by_otus
    data: mapping providing the 'aln' and 'otus' entries
    dir_path, filename: used to build the three output paths
        '<dir_path>/<filename>_sfiltered_otus.txt',
        '<dir_path>/<filename>_sremoved.fasta' and
        '<dir_path>/<filename>_sfiltered.fasta'
        NOTE(review): with the default dir_path='' these paths start with
        '/'; callers are presumably expected to pass a directory.

    Raises ValueError if the removed or the remaining sequences cannot be
    loaded with LoadSeqs (reported as "no sequences removed" / "no
    sequences remaining").  The filtered otus file has already been written
    by then.
    """
    aln = data['aln']
    otus = data['otus']

    # filter the otus file based on which samples to remove
    new_otus_list = filter_otus(otus, prefs)

    filtered_otus_output_filepath = '%s/%s_sfiltered_otus.txt' \
                                    % (dir_path, filename)

    # Write out a new otus file: first field, then tab-separated members
    with open(filtered_otus_output_filepath, 'w') as otus_file:
        for key in new_otus_list:
            otus_file.write(key[0])
            for j in key[1]:
                otus_file.write('\t' + str(j))
            otus_file.write('\n')

    # filter seq set
    filtered_seqs, removed_seqs = filter_aln_by_otus(aln, prefs)

    # write a fasta containing list of sequences removed from
    # representative set
    try:
        removed_seqs = LoadSeqs(data=removed_seqs, aligned=False)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer converted into a ValueError
        raise ValueError(
            'No sequences were removed.  Did you specify the correct Sample ID?')
    output_filepath2 = '%s/%s_sremoved.fasta' % (dir_path, filename)
    with open(output_filepath2, 'w') as removed_file:
        removed_file.write(removed_seqs.toFasta())

    # write a fasta containing the filtered representative seqs
    try:
        filtered_seqs = LoadSeqs(data=filtered_seqs, aligned=False)
    except Exception:
        raise ValueError(
            'No sequences were remaining in the fasta file.  Did you remove all Sample ID\'s?')

    output_filepath = '%s/%s_sfiltered.fasta' % (dir_path, filename)
    with open(output_filepath, 'w') as filtered_file:
        filtered_file.write(filtered_seqs.toFasta())
示例#18
0
def pair_hmm_align_unaligned_seqs(seqs, moltype, params=None):
    """Globally align exactly two unaligned sequences.

    seqs: sequence data accepted by LoadSeqs; must hold exactly two seqs
    moltype: molecule type passed to LoadSeqs
    params: optional mapping with any of 'gap_open' (default 5),
        'gap_extend' (default 2) and 'score_matrix' (default: a DNA scoring
        dict with match=1, transition=-1, transversion=-1)

    Returns the result of global_pairwise for the two sequences.
    Raises ValueError if seqs does not contain exactly two sequences.

    NOTE: This needs to be moved to cogent.align.align
    """
    # None instead of a shared mutable {} default
    if params is None:
        params = {}

    seqs = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    gap_open = params.get('gap_open', 5)
    gap_extend = params.get('gap_extend', 2)
    try:
        score_matrix = params['score_matrix']
    except KeyError:
        # only build the default matrix when none was supplied
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return global_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
示例#19
0
    def test_logdet_variance(self):
        """logdet variance agrees with a hand calculation"""
        seq1 = ("GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGG"
                "TTTTTTTTTTTTTTTTTT")
        seq2 = ("TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTT"
                "CCCCCCCCCCCCCCCCC")
        data = [('seq1', seq1), ('seq2', seq2)]
        aln = LoadSeqs(data=data, moltype=DNA)
        calc = LogDetPair(moltype=DNA, alignment=aln)

        # with the tk adjustment the variance entry is expected to be None
        calc.run(use_tk_adjustment=True, show_progress=False)
        self.assertEqual(calc.Variances[1, 1], None)

        # hand calculation: joint base-pair counts, 0.5 added to empty
        # diagonal cells, then normalised to frequencies
        base_index = {base: k for k, base in enumerate('ACGT')}
        joint = numpy.zeros((4, 4))
        for base1, base2 in zip(seq1, seq2):
            joint[base_index[base1], base_index[base2]] += 1
        for k in range(4):
            if joint[k, k] == 0:
                joint[k, k] += 0.5
        joint /= joint.sum()
        inverse = numpy.linalg.inv(joint)
        expected_var = 0.
        for row in range(4):
            for col in range(4):
                expected_var += inverse[col, row]**2 * joint[row, col] - 1
        expected_var /= 16 * len(seq1)

        calc.run(use_tk_adjustment=False, show_progress=False)
        calc.getPairwiseDistances()
        self.assertFloatEqual(calc.Variances[1, 1], expected_var, eps=1e-3)
示例#20
0
    def getResult(self, aln_path, *args, **kwargs):
        """Returns alignment from sequences.

        aln_path: path of the alignment file, loaded with LoadSeqs
        kwargs: an optional 'root_method' entry; 'midpoint' re-roots the
            tree with root_midpt, any other value (including
            'tree_method_default') or a missing entry leaves the tree as
            built

        Returns the tree built by the configured module's
        build_tree_from_alignment.

        Currently does not allow parameter tuning of program and uses
        default parameters -- this is bad and should be fixed.

        #TODO: allow command-line access to important aln params.
        """
        module = self.Params['Module']
        # standard qiime says we just consider the first word as the unique ID
        # the rest of the defline of the fasta alignment often doesn't match
        # the otu names in the otu table
        seqs = LoadSeqs(aln_path,
                        Aligned=True,
                        label_to_name=lambda x: x.split()[0])
        result = module.build_tree_from_alignment(seqs, moltype=DNA)

        # look the option up without a try/except, so that a KeyError raised
        # inside root_midpt is not mistaken for a missing option and
        # silently discarded
        if kwargs.get('root_method') == 'midpoint':
            result = root_midpt(result)
        return result
示例#21
0
def BestLogLikelihood(aln,
                      alphabet=None,
                      exclude_chars=None,
                      allowed_chars='ACGT',
                      motif_length=None,
                      return_length=False):
    """returns the best log-likelihood according to Goldman 1993.

    Arguments:
        - aln: an alignment; it is reloaded from its todict() form
        - alphabet: a sequence alphabet object. When given, its MolType is
          used to reload the alignment and its motif length overrides
          motif_length.
        - motif_length: 1 for nucleotide, 2 for dinucleotide, etc ..
        - exclude_chars: a series of characters used to exclude motifs
        - allowed_chars: only motifs that contain a subset of these are
          allowed
        - return_length: whether to also return the number of alignment
          columns

    Returns the log-likelihood, or (log-likelihood, number of columns) when
    return_length is true.
    """
    assert alphabet or motif_length, \
        "Must provide either an alphabet or a motif_length"

    # need to use the alphabet, so we can enforce character compliance
    load_kwargs = {}
    if alphabet:
        load_kwargs['moltype'] = alphabet.MolType
        motif_length = alphabet.getMotifLen()

    reloaded = LoadSeqs(data=aln.todict(), **load_kwargs)
    columns = aligned_columns_to_rows(reloaded, motif_length, exclude_chars,
                                      allowed_chars)
    num_cols = len(columns)
    log_likelihood = get_G93_lnL_from_array(columns)

    if not return_length:
        return log_likelihood
    return log_likelihood, num_cols
示例#22
0
    def test_getBySequenceAnnotation(self):
        """a feature annotated on one seq selects the matching columns"""
        seqs = {'a': 'ATCGAAATCGAT', 'b': 'ATCGA--TCGAT'}
        aln = LoadSeqs(data=seqs)

        # annotate span (4, 6) on sequence 'b'
        annotated_seq = aln.getSeq('b')
        annotated_seq.addAnnotation(Feature, 'test_type', 'test_label',
                                    [(4, 6)])

        regions = aln.getBySequenceAnnotation('b', 'test_type')
        expected = {'b': 'A--T', 'a': 'AAAT'}
        self.assertEqual(regions[0].todict(), expected)
示例#23
0
    def test_paralinear_variance(self):
        """paralinear variance agrees with a hand calculation"""
        seq1 = ("GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGG"
                "TTTTTTTTTTTTTTTTTT")
        seq2 = ("TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTT"
                "CCCCCCCCCCCCCCCCC")
        data = [('seq1', seq1), ('seq2', seq2)]
        aln = LoadSeqs(data=data, moltype=DNA)
        calc = ParalinearPair(moltype=DNA, alignment=aln)
        calc.run(show_progress=False)

        # hand calculation: joint base-pair counts, 0.5 added to empty
        # diagonal cells, then normalised to frequencies
        base_index = {base: k for k, base in enumerate('ACGT')}
        joint = numpy.zeros((4, 4))
        for base1, base2 in zip(seq1, seq2):
            joint[base_index[base1], base_index[base2]] += 1
        for k in range(4):
            if joint[k, k] == 0:
                joint[k, k] += 0.5
        joint /= joint.sum()
        inverse = numpy.linalg.inv(joint)
        row_freqs, col_freqs = joint.sum(1), joint.sum(0)
        expected_var = 0.
        for row in range(4):
            for col in range(4):
                expected_var += inverse[col, row]**2 * joint[row, col]
            expected_var -= 1 / numpy.sqrt(row_freqs[row] * col_freqs[row])
        expected_var /= 16 * len(seq1)

        self.assertFloatEqual(calc.Variances[1, 1], expected_var, eps=1e-3)
    def test_setMotifProbs(self):
        """Motif probs can be supplied directly or taken from data"""
        nuc_model = cogent.evolve.substitution_model.Nucleotide(
            model_gaps=True, motif_probs=None)
        lf = nuc_model.makeLikelihoodFunction(self.tree,
                                              motif_probs_from_align=False)

        # values supplied directly are returned unchanged
        supplied = {'A': 0.1, 'C': 0.2, 'G': 0.2, 'T': 0.5, '-': 0.0}
        lf.setMotifProbs(supplied)
        self.assertEqual(lf.getMotifProbs(), supplied)

        # estimated from the first alignment column
        lf.setMotifProbsFromData(self.al[:1], is_const=True)
        self.assertEqual(lf.getMotifProbs()['G'], 0.6)

        # a pseudocount must change the estimate
        lf.setMotifProbsFromData(self.al[:1], pseudocount=1)
        self.assertNotEqual(lf.getMotifProbs()['G'], 0.6)

        # test with consideration of ambiguous states
        ambiguous_aln = LoadSeqs(data={
            'seq1': 'ACGTAAGNA',
            'seq2': 'ACGTANGTC',
            'seq3': 'ACGTACGTG'
        })
        lf.setMotifProbsFromData(ambiguous_aln, include_ambiguity=True,
                                 is_const=True)
        observed = dict(lf.getMotifProbs())
        expected = {
            'A': 8.5 / 27,
            'C': 5.5 / 27,
            '-': 0.0,
            'T': 5.5 / 27,
            'G': 7.5 / 27
        }
        self.assertEqual(observed, expected)
        self.assertEqual(sum(observed.values()), 1.0)
示例#25
0
 def test_logdet_pair_aa(self):
     """logdet shouldn't fail to produce distances for aa seqs"""
     dna_aln = LoadSeqs('data/brca1_5.paml', moltype=DNA)
     protein_aln = dna_aln.getTranslation()
     calc = LogDetPair(moltype=PROTEIN, alignment=protein_aln)
     calc.run(use_tk_adjustment=True, show_progress=False)
     # no value is asserted; the call only has to complete without raising
     calc.getPairwiseDistances()
示例#26
0
    def setUp(self):
        """Create the temp files, aligner and expected alignment for tests.

        Writes the input fasta and the stockholm template to temp files,
        touches the result and log files so they can always be cleaned up,
        and records every path in self._paths_to_clean_up.
        """
        def write_tmp_file(suffix, contents):
            # use a context manager so the data is flushed and the handle is
            # closed before any test reads the file back
            fp = get_tmp_filename(prefix='InfernalAlignerTests_',
                                  suffix=suffix)
            with open(fp, 'w') as out:
                out.write(contents)
            return fp

        self.infernal_test1_input_fp = write_tmp_file(
            '.fasta', infernal_test1_input_fasta)

        self.infernal_test1_template_fp = write_tmp_file(
            'template.sto', infernal_test1_template_stockholm)

        # create temp file names (and touch them so we can reliably
        # clean them up)
        self.result_fp = write_tmp_file('.fasta', '')
        self.log_fp = write_tmp_file('.log', '')

        self._paths_to_clean_up = [
            self.infernal_test1_input_fp,
            self.result_fp,
            self.log_fp,
            self.infernal_test1_template_fp,
        ]

        self.infernal_test1_aligner = InfernalAligner({
            'template_filepath':
            self.infernal_test1_template_fp,
        })
        self.infernal_test1_expected_aln = LoadSeqs(
            data=infernal_test1_expected_alignment, aligned=Alignment,
            moltype=DNA)
示例#27
0
 def test_paralinear_pair_aa(self):
     """paralinear shouldn't fail to produce distances for aa seqs"""
     dna_aln = LoadSeqs('data/brca1_5.paml', moltype=DNA)
     protein_aln = dna_aln.getTranslation()
     calc = ParalinearPair(moltype=PROTEIN, alignment=protein_aln)
     calc.run(show_progress=False)
     # no value is asserted; the call only has to complete without raising
     calc.getPairwiseDistances()
示例#28
0
    def test_paralinear_distance(self):
        """calculate paralinear distance consistent with hand calculation"""
        data = [
            ('seq1',
             "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"
             ),
            ('seq2',
             "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"
             )
        ]
        aln = LoadSeqs(data=data, moltype=DNA)
        paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
        paralinear_calc.run(show_progress=False)

        # hand calculation: joint base-pair counts, 0.5 added to empty
        # diagonal cells, then normalised to frequencies
        index = dict(zip('ACGT', range(4)))
        J = numpy.zeros((4, 4))
        for base1, base2 in zip(data[0][1], data[1][1]):
            J[index[base1], index[base2]] += 1
        for i in range(4):
            if J[i, i] == 0:
                J[i, i] += 0.5
        J /= J.sum()
        # marginal base frequencies of seq1 (rows) and seq2 (columns)
        f = J.sum(1), J.sum(0)
        # the distance only needs det(J) and the marginals, so no matrix
        # inverse is computed here
        dist = -0.25 * numpy.log(
            numpy.linalg.det(J) / numpy.sqrt(f[0].prod() * f[1].prod()))

        self.assertFloatEqual(paralinear_calc.Dists[1, 1], dist, eps=1e-3)
示例#29
0
 def test_logdet_pair_dna(self):
     """logdet should produce distances that match MEGA"""
     aln = LoadSeqs('data/brca1_5.paml', moltype=DNA)
     calc = LogDetPair(moltype=DNA, alignment=aln)
     calc.run(use_tk_adjustment=True, show_progress=False)
     observed = calc.getPairwiseDistances()

     # one entry per unordered pair; the lookup table below holds each
     # value under both orderings of the pair
     unordered_expected = [
         ('Human', 'NineBande', 0.075336929999999996),
         ('NineBande', 'DogFaced', 0.0898575452),
         ('DogFaced', 'Human', 0.1061747919),
         ('HowlerMon', 'DogFaced', 0.0934480008),
         ('Mouse', 'HowlerMon', 0.26422862920000001),
         ('HowlerMon', 'NineBande', 0.062202897899999998),
         ('Mouse', 'Human', 0.26539976700000001),
         ('HowlerMon', 'Human', 0.036571181899999999),
         ('DogFaced', 'Mouse', 0.2652555144),
         ('NineBande', 'Mouse', 0.22754789210000001),
     ]
     all_expected = {}
     for first, second, distance in unordered_expected:
         all_expected[(first, second)] = distance
         all_expected[(second, first)] = distance

     for pair in observed:
         self.assertFloatEqual(observed[pair], all_expected[pair])
 def setUp(self):
     """Create the alignment, tree and nucleotide model used by the tests."""
     #length all edges 1 except c=2.  b&d transitions all other transverions
     seqs = {
         'a': 'tata',
         'b': 'tgtc',
         'c': 'gcga',
         'd': 'gaac',
         'e': 'gagc',
     }
     self.al = LoadSeqs(data=seqs)
     self.tree = LoadTree(treestring='((a,b),(c,d),e);')
     model_options = dict(do_scaling=True, equal_motif_probs=True,
                          model_gaps=True)
     self.model = cogent.evolve.substitution_model.Nucleotide(
         **model_options)