示例#1
0
def test_populate_parameters():
    """populate_parameters should set up a nested likelihood function.

    Part one inflates a GTR likelihood function from
    ``brca1_murphy_gtr.json``, builds a ``General`` model likelihood
    function on the same tree, copies parameters across with
    ``nest.populate_parameters`` and checks both report the same G
    statistic (to 6 decimals) on the ``brca1.fasta`` alignment.

    Part two inflates the module-level ``_GTR`` and ``_General`` fits,
    checks their tip-edge substitution matrices differ beforehand, and
    that they agree after ``populate_parameters``.
    """
    # Use a context manager so the fixture file handle is always closed.
    with open(os.path.join(get_data_dir(),
                           'brca1_murphy_gtr.json')) as lf_file:
        lf_json = json.load(lf_file)
    lf_GTR = nest.inflate_likelihood_function(lf_json)
    aln = LoadSeqs(os.path.join(get_data_dir(), 'brca1.fasta'))
    lf_GTR.setAlignment(aln)
    model = General(DNA.Alphabet,
                    optimise_motif_probs=True,
                    recode_gaps=True,
                    model_gaps=False)
    lf_General = model.makeLikelihoodFunction(lf_GTR.tree)
    nest.populate_parameters(lf_General, lf_GTR)
    lf_General.setAlignment(aln)
    assert_almost_equal(lf_GTR.getGStatistic(), lf_General.getGStatistic(), 6)

    lf_GTR = nest.inflate_likelihood_function(_GTR)
    lf_General = nest.inflate_likelihood_function(_General)
    # Sanity check: the comparison below is only meaningful if the two
    # models do not already share substitution probabilities.
    for edge in lf_GTR.tree.getTipNames():
        assert not allclose(
            lf_GTR.getPsubForEdge(edge),
            lf_General.getPsubForEdge(edge)), 'models started close'
    nest.populate_parameters(lf_General, lf_GTR)
    for edge in lf_GTR.tree.getTipNames():
        assert_array_almost_equal(lf_GTR.getPsubForEdge(edge),
                                  lf_General.getPsubForEdge(edge))
示例#2
0
def test_GNC():
    """Inflating then deflating a GNC fit should preserve its 'EN' entries.

    Loads the flattened likelihood function from ``GNC.json``, inflates it
    with ``_ml.GNC``, attaches the alignment, deflates it again and checks
    the ``'EN'`` mapping of the round-tripped form matches the original to
    9 decimals.
    """
    with open(os.path.join(get_data_dir(), 'GNC.json')) as infile:
        flat_lf = json.load(infile)

    lf = inflate_likelihood_function(flat_lf, _ml.GNC)
    aln = get_aln(os.path.join(get_data_dir(), 'ENSG00000100393.fasta.gz'),
                  codon_position=-1)
    lf.setAlignment(aln)

    flat_again = deflate_likelihood_function(lf)

    expected = flat_lf['EN']
    observed = flat_again['EN']
    # Compare key by key: relying on two separate dicts iterating their
    # values in the same order is fragile, and dict views are not
    # sequences on Python 3.
    assert set(expected) == set(observed)
    keys = sorted(expected)
    assert_almost_equal([expected[key] for key in keys],
                        [observed[key] for key in keys], 9)
示例#3
0
def test_joint_reconstruction():
    """Joint reconstruction should beat a perturbed ancestral state.

    Inflates the CNFGTR fit stored in ``small_cnfgtr.json``, attaches the
    first 99 columns of ``small_aln.fasta`` and runs ``gapped.joint``.
    For every 3-column window, the probability of the reconstructed column
    must be strictly greater than the probability obtained after changing
    the ``twoBeesF2`` state.
    """
    fixtures = get_data_dir()
    with open(join(fixtures, 'small_cnfgtr.json')) as handle:
        flat_cnfgtr = json.load(handle)
    make_model = lambda: gapped.CNFGTR(optimise_motif_probs=True,
                                       model_gaps=True)
    cnfgtr = gapped.inflate_likelihood_function(flat_cnfgtr, make_model)
    full_aln = get_aln(join(fixtures, 'small_aln.fasta'),
                       filter_gaps=False,
                       codon_position=-1)
    cnfgtr.setAlignment(full_aln[:99])

    anc_aln = gapped.joint(cnfgtr)

    # (edge, parent node) pairs, in the order their terms are multiplied.
    edge_parents = (
        ('BterF3', 'root'),
        ('OsmaF4', 'root'),
        ('twoBeesF2', 'root'),
        ('AmelF2', 'twoBeesF2'),
        ('AdorF1', 'twoBeesF2'),
    )

    def prob_for_col(col):
        # Root motif probability times the substitution probability along
        # each edge for the states given in ``col``.
        prob = cnfgtr.getMotifProbs()[col['root']]
        for edge, parent in edge_parents:
            prob *= cnfgtr.getPsubForEdge(edge)[col[parent]][col[edge]]
        return prob

    next_base = {'A': 'C', 'C': 'G', 'G': 'T', 'T': 'A'}
    for start in range(0, 99, 3):
        col = anc_aln[start:start + 3].todict()
        p_reconstructed = prob_for_col(col)
        ancestral = col['twoBeesF2']
        if ancestral == '---':
            perturbed = 'CAC'
        else:
            # Swap only the third position for a different base.
            perturbed = ancestral[:2] + next_base[ancestral[2]]
        col['twoBeesF2'] = perturbed
        p_perturbed = prob_for_col(col)
        assert_array_less(p_perturbed, p_reconstructed)
    def test_command_line(self):
        """Run nonstationary_lengths.py then g_stats.py through ``main()``.

        Each script is driven by assigning ``sys.argv`` and calling its
        ``main``; the test requires a return value of 0 and a specific
        'Done ...' line in the log file written to ``self._output``.
        """
        datadir = get_data_dir()
        nsl_log = join(self._output, 'nsl.log')
        # Build argv as a list instead of splitting a command string, so
        # directories containing whitespace reach the script intact.
        sys.argv = ['nonstationary_lengths.py', '-i', datadir,
                    '-o', self._output, '-l', nsl_log,
                    '-L', 'DEBUG', '-c', '3', '-u', '20', '-F', 'seq_fit']
        from nonstationary_lengths import main
        assert_equal(main(), 0)
        if USING_MPI:
            MPI.COMM_WORLD.Barrier()

        # Distinct name for the handle so the log path is not shadowed.
        with open(nsl_log) as log_file:
            log = log_file.read()
        assert_in('Done Mouse/Opossum/Human in ENSG00000111145', log)

        gs_log = join(self._output, 'gs.log')
        sys.argv = ['g_stats.py', '-o', self._output, '-l', gs_log,
                    '-N', '1', '-u', '20', '-P', '1', '-F', 'seq_fit',
                    '-L', 'DEBUG']
        from g_stats import main
        assert_equal(main(), 0)
        if USING_MPI:
            MPI.COMM_WORLD.Barrier()

        with open(gs_log) as log_file:
            log = log_file.read()
        assert_in('Done General and Mouse/Opossum/Human in ENSG00000111145',
                  log)
示例#5
0
def test_gapped_CNFGTR():
    """Fit a gapped CNFGTR model and check properties of the result.

    Checks that the root motif probabilities are unchanged when multiplied
    by the 'Human' edge substitution matrix, that selected rate-matrix
    cells reproduce the 'A/C' and 'indel' parameter values, and that the
    rate matrix divided by the motif probabilities is symmetric.
    """
    aln_path = os.path.join(get_data_dir(), 'ENSG00000100393.fasta.gz')
    aln = get_aln(aln_path, codon_position=-1, filter_gaps=False)
    tree = LoadTree(treestring='(Human,Mouse,Opossum);')
    doc = {'aln': str(aln), 'tree': str(tree)}
    fit_options = dict(model='CNFGTR',
                       model_gaps=True,
                       omega_indep=False,
                       indel_indep=False)
    cnfgtr_result = gapped.ml(doc, **fit_options)
    make_model = lambda: gapped.CNFGTR(optimise_motif_probs=True,
                                       model_gaps=True)
    cnfgtr = gapped.inflate_likelihood_function(cnfgtr_result['lf'],
                                                make_model)

    root_probs = cnfgtr.getMotifProbsByNode()['root'].asarray()
    human_psub = cnfgtr.getPsubForEdge('Human')
    assert_almost_equal(root_probs.dot(human_psub), root_probs)

    # The value is not used below; the lookup itself is retained.
    omega = cnfgtr.getParamValue('omega')
    motif_probs = cnfgtr.getMotifProbs()
    rate_matrix = cnfgtr.getRateMatrixForEdge('Human')

    # Total probability of the four codons that start with 'CC'.
    cc_total = sum(motif_probs['CC' + base] for base in 'ACGT')
    ccg_given_cc = motif_probs['CCG'] / cc_total
    ref_cell = rate_matrix['CCT']['CCG'] / ccg_given_cc
    ccc_given_cc = motif_probs['CCC'] / cc_total
    assert_almost_equal(rate_matrix['CCA']['CCC'] / ccc_given_cc / ref_cell,
                        cnfgtr.getParamValue('A/C'))
    assert_almost_equal(
        rate_matrix['---']['CCC'] / motif_probs['CCC'] / ref_cell,
        cnfgtr.getParamValue('indel'))

    scaled_rates = rate_matrix.asarray() / motif_probs.asarray()
    assert_almost_equal(scaled_rates.T, scaled_rates)
示例#6
0
 def testClock(self):
     ''' clock should fit a clock type model

     Invokes the ``clock`` sub-command of ``cli.main`` with the MG94GTR
     model and compares the file it writes in ``self.tempdir`` against
     the stored ``MG94GTRClock.txt`` result.
     '''
     datadir = data.get_data_dir()
     aln = os.path.join(datadir, 'aln.fasta')
     tree = os.path.join(datadir, 'tree.nwk')
     correct_result = os.path.join(datadir, 'MG94GTRClock.txt')
     test_result = os.path.join(self.tempdir, 'MG94GTRClock.txt')
     args = ['clock', '--model', 'MG94GTR', aln, tree, 'Mouse', test_result]
     runner = click.testing.CliRunner()
     result = runner.invoke(cli.main, args)
     # CliRunner captures exceptions raised by the command; check the exit
     # code so a CLI failure is reported directly, with its output.
     assert result.exit_code == 0, result.output
     compare_files(test_result, correct_result)
示例#7
0
 def testFit(self):
     ''' bootstrap should produce the expected output from a stored fit

     Invokes the ``bootstrap`` sub-command of ``cli.main`` on
     ``MG94GTR.json`` with one bootstrap replicate and compares the file
     it writes in ``self.tempdir`` against the stored
     ``MG94GTR.bootstrap`` result.
     '''
     datadir = data.get_data_dir()
     fit_result = os.path.join(datadir, 'MG94GTR.json')
     correct_result = os.path.join(datadir, 'MG94GTR.bootstrap')
     test_result = os.path.join(self.tempdir, 'MG94GTR.bootstrap')
     args = ['bootstrap', '--num_bootstraps', '1', fit_result, test_result]
     runner = click.testing.CliRunner()
     result = runner.invoke(cli.main, args)
     # CliRunner captures exceptions raised by the command; check the exit
     # code so a CLI failure is reported directly, with its output.
     assert result.exit_code == 0, result.output
     compare_files(test_result, correct_result)
示例#8
0
def test_distribution():
    """distribution should match the motif probs estimated from the data.

    Computes ``jsd.distribution`` for the 'Mouse' sequence of
    ``General_1031.fasta.gz`` and compares it with the motif probabilities
    a GTR likelihood function sets from the same single-sequence
    alignment.
    """
    fasta_path = os.path.join(get_data_dir(), 'General_1031.fasta.gz')
    with GzipFile(fasta_path) as gz_handle:
        raw_fasta = gz_handle.read()
    mouse_aln = Alignment(data=raw_fasta).takeSeqs(('Mouse', ))
    observed = jsd.distribution(mouse_aln.getSeq('Mouse'))

    single_tip_tree = LoadTree(tip_names=('Mouse', ))
    submodel = GTR()
    lf = submodel.makeLikelihoodFunction(single_tip_tree)
    lf.setMotifProbsFromData(mouse_aln)
    expected = lf.getMotifProbs()

    assert_array_almost_equal(array(expected), array(observed))
示例#9
0
 def testFit(self):
     ''' omega should fit a model with omega constraints

     Invokes the ``omega`` sub-command of ``cli.main`` with the Y98 model
     and 'Mouse' as outgroup, then compares the file it writes in
     ``self.tempdir`` against the stored ``Y98.txt`` result.
     '''
     datadir = data.get_data_dir()
     aln = os.path.join(datadir, 'aln.fasta')
     tree = os.path.join(datadir, 'tree.nwk')
     correct_result = os.path.join(datadir, 'Y98.txt')
     test_result = os.path.join(self.tempdir, 'Y98.txt')
     args = [
         'omega', '--model', 'Y98', '--outgroup', 'Mouse', aln, tree,
         test_result
     ]
     runner = click.testing.CliRunner()
     result = runner.invoke(cli.main, args)
     # CliRunner captures exceptions raised by the command; check the exit
     # code so a CLI failure is reported directly, with its output.
     assert result.exit_code == 0, result.output
     compare_files(test_result, correct_result)
示例#10
0
def generate_alignments(alns=None):
    """Simulate alignments from stored fits and write them as gzipped FASTA.

    For each ``(model, aln_len)`` pair, the module-level object named
    ``'_' + model`` is inflated with ``nest.inflate_likelihood_function``,
    ``simulateAlignment(aln_len)`` is called on the result, and the FASTA
    output is written to ``<model>_<aln_len>.fasta.gz`` in the data
    directory.

    Args:
        alns: optional iterable of ``(model, aln_len)`` pairs. When omitted,
            ``[('GTRClockTest', 100000), ('GeneralBen', 100000)]`` is used,
            which is the list the function previously always used.

    Returns:
        0

    Raises:
        KeyError: if no module-level name ``'_' + model`` exists.
    """
    from gzip import GzipFile
    from data import get_data_dir
    from os.path import join
    if alns is None:
        # NOTE(review): a longer list (GTRplusGamma, General, GTR, ...) used
        # to be built here but was overwritten on the next statement; pass
        # it in explicitly if those files need regenerating.
        alns = [('GTRClockTest', 100000), ('GeneralBen', 100000)]
    # Look the fitted-model dicts up by name instead of eval()-ing a string.
    module_names = globals()
    for model, aln_len in alns:
        lf = nest.inflate_likelihood_function(module_names['_' + model])
        aln = lf.simulateAlignment(aln_len)
        filename = '_'.join((model, str(aln_len))) + '.fasta.gz'
        with GzipFile(join(get_data_dir(), filename), 'w') as aln_file:
            aln_file.write(aln.toFasta())
    return 0
示例#11
0
def get_aln(model, aln_len):
    """Load the gzipped FASTA file for ``model`` and ``aln_len``.

    The file read is ``<model>_<aln_len>.fasta.gz`` in the data directory;
    its contents are passed to ``Alignment`` as ``data``.
    """
    fasta_name = '%s.fasta.gz' % '_'.join((model, str(aln_len)))
    fasta_path = os.path.join(get_data_dir(), fasta_name)
    with GzipFile(fasta_path) as gz_handle:
        contents = gz_handle.read()
    return Alignment(data=contents)