Exemplo n.º 1
0
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta funcs always catches all seqs

        Splits a 59-sequence FASTA input with every seqs_per_file value in
        range(1, 1000) and checks that the lines read back from the split
        files load into a collection equal to the one loaded from the input.
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
        infile = in_seqs.toFasta().split('\n')

        # test seqs_per_file from 1 to 999 (range excludes its upper bound)
        for i in range(1, 1000):
            fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                          prefix='split_fasta_tests',
                                          suffix='')
            close(fd)
            # NOTE(review): the empty file mkstemp creates at filename_prefix
            # is not removed by this test -- confirm whether split_fasta or
            # the suite cleans it up.

            actual = split_fasta(infile, i, filename_prefix)

            actual_seqs = []
            try:
                for fp in actual:
                    # close each split file as soon as its lines are read
                    with open(fp) as split_file:
                        actual_seqs.extend(split_file)
            finally:
                # remove the files now, so if the test fails they still get
                # cleaned up
                remove_files(actual)

            # building seq collections from infile and the split files result in
            # equivalent seq collections
            self.assertEqual(LoadSeqs(data=infile, aligned=False),
                             LoadSeqs(data=actual_seqs, aligned=False))
Exemplo n.º 2
0
 def test_alignadd(self):
     """Adding two alignments joins, per name, the first then the second."""
     left = LoadSeqs(data={'a': 'AAAA', 'b': 'TTTT', 'c': 'CCCC'})
     right = LoadSeqs(data={'a': 'GGGG', 'b': '----', 'c': 'NNNN'})
     combined = left + right
     expected = {'a': 'AAAAGGGG', 'b': 'TTTT----', 'c': 'CCCCNNNN'}
     self.assertEqual(combined.todict(), expected)
Exemplo n.º 3
0
def get_paralinear_distances(gene, data_directory=None, third_position=False, **kw):
    """Return pairwise paralinear distances for the alignment of one gene.

    The alignment is read from the single file in data_directory matching
    '<gene>.fasta*' ('.fasta' or '.fasta.gz').  When third_position is true
    only every third column, starting at index 2, is kept.  Columns holding
    characters outside DNA are dropped before the distances are computed.
    Extra keyword arguments are accepted and ignored.

    Returns a dict mapping frozenset(key) to distance for every key in the
    calculator's pairwise distances.  Raises AssertionError when the number
    of matching files is not one and RuntimeError for any other extension.
    """
    pattern = os.path.join(data_directory, gene+'.fasta*')
    matches = glob.glob(pattern)
    assert len(matches) == 1, 'Wrong number of alignment files for ' + gene
    filename = matches[0]

    # pick the reader from the extension, then read in one place
    if filename.endswith('.fasta'):
        opener = open
    elif filename.endswith('.fasta.gz'):
        opener = GzipFile
    else:
        raise RuntimeError(gene + ' file could not be read')
    with opener(filename) as fastafile:
        fastadata = fastafile.read()

    sequences = LoadSeqs(data=fastadata)
    if third_position:
        spans = [(start, start+1) for start in range(2, len(sequences), 3)]
        feature = sequences.addFeature('pos3', 'pos3', spans)
        sequences = feature.getSlice()

    def _only_dna(column):
        return set(''.join(column)) <= set(DNA)
    sequences = sequences.filtered(_only_dna)

    calculator = ParalinearPair(moltype=DNA, alignment=sequences)
    calculator.run(show_progress=False)
    pairwise = calculator.getPairwiseDistances()

    return dict((frozenset(pair), dist) for pair, dist in pairwise.items())
Exemplo n.º 4
0
    def test_split_fasta_diff_num_seqs_per_file(self):
        """split_fasta funcs as expected when diff num seqs go to each file

        Three sequences are split two per file, so two output files are
        expected; their combined content must load into a collection equal
        to the one loaded from the input lines.
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        infile = [
            '>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA', '>seq3',
            'CCTT--AA'
        ]

        actual = split_fasta(infile, 2, filename_prefix)

        actual_seqs = []
        try:
            for fp in actual:
                # close each split file as soon as its lines are read
                with open(fp) as split_file:
                    actual_seqs.extend(split_file)
        finally:
            # always clean up the split files, even if reading one fails
            remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(LoadSeqs(data=infile, aligned=False),
                         LoadSeqs(data=actual_seqs, aligned=False))
Exemplo n.º 5
0
def pair_hmm_align_unaligned_seqs(seqs, moltype=DNA, params={}):
    """
        Locally align exactly two unaligned sequences.

        seqs is loaded with LoadSeqs as an unaligned collection of the
        given moltype.  params may override 'gap_open' (default 5),
        'gap_extend' (default 2) and 'score_matrix' (default built by
        make_dna_scoring_dict with match=1, transition=-1,
        transversion=-1).

        Returns the result of local_pairwise; raises ValueError when the
        collection does not unpack into exactly two sequences.

        Code from Greg Caporaso.
    """
    collection = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        first, second = collection.values()
    except ValueError:
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    gap_open = params.get('gap_open', 5)
    gap_extend = params.get('gap_extend', 2)
    # the default matrix is only built when the caller supplied none
    if 'score_matrix' in params:
        score_matrix = params['score_matrix']
    else:
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return local_pairwise(first, second, score_matrix, gap_open, gap_extend)
Exemplo n.º 6
0
Arquivo: ml.py Projeto: HuttleyLab/gnc
def rooted(doc, rooted_edges=None, gc=None, **kw):
    """Fit a GNC model with parameters tied across rooted_edges.

    doc supplies the alignment text under 'aln' and the tree string under
    'tree'.  gc is passed to get_genetic_code.  Extra keyword arguments are
    accepted and ignored.

    An MG94GTR model is optimised first; its deflated likelihood function is
    then handed to _populate_parameters together with the GNC likelihood
    function (presumably to seed the GNC fit -- confirm against
    _populate_parameters).

    Returns a dict with the deflated GNC likelihood function ('lf'), the
    genetic code name ('gc') and the rooted_edges argument.
    """
    aln = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    tree = LoadTree(treestring=doc['tree'].encode('utf-8'))

    code = get_genetic_code(gc)
    aln = aln.withoutTerminalStopCodons(code)
    # keep only codon columns (motif_length=3) made purely of DNA characters
    aln = aln.filtered(lambda x: set(''.join(x)) <= set(DNA), motif_length=3)

    # shared rule settings: bounds plus non-independent parameters
    sp_kw = dict(upper=20., lower=0.05, is_independent=False)
    # stage 1: fit MG94GTR
    sm = MG94GTR(optimise_motif_probs=True)
    init_lf = sm.makeLikelihoodFunction(tree)
    init_lf.setAlignment(aln)
    with init_lf.updatesPostponed():
        for param in init_lf.getParamNames():
            # only parameters whose name contains '/' get the bounded rule
            if '/' in param:
                init_lf.setParamRule(param, **sp_kw)
    init_lf.setParamRule('length', edges=rooted_edges, is_independent=False)
    init_lf.optimise(local=True, show_progress=False, limit_action='raise')
    init_lf = nest.deflate_likelihood_function(init_lf, save_jsd=False)
    # stage 2: fit GNC using the deflated MG94GTR result
    sm = GNC(optimise_motif_probs=True)
    lf = sm.makeLikelihoodFunction(tree)
    lf.setAlignment(aln)
    _populate_parameters(lf, init_lf, **sp_kw)
    for param in lf.getParamNames():
        # parameters named with '>' and 'omega' are tied across rooted_edges
        if '>' in param or param == 'omega':
            lf.setParamRule(param, edges=rooted_edges, is_independent=False)
    lf.optimise(local=True, show_progress=False, limit_action='raise')
    flat_lf = nest.deflate_likelihood_function(lf)
    flat_lf['hard_up'] = _is_hard_up(lf)

    return {'lf': flat_lf, 'gc': code.Name, 'rooted_edges': rooted_edges}
Exemplo n.º 7
0
def ml(doc,
       model='NG',
       gc=None,
       omega_indep=True,
       model_gaps=False,
       indel_indep=True,
       **kw):
    """Fit the named model to the alignment and tree stored in doc.

    doc supplies the alignment text under 'aln' and the tree string under
    'tree'.  For every model except 'NG' the terminal stop codons are
    removed and the alignment is filtered, three columns at a time, to
    motifs made of DNA characters (plus '-' when model_gaps is true).
    Extra keyword arguments are accepted and ignored.

    Returns a dict holding the two values returned by _fit ('lf' and
    'time') together with the settings used.
    """
    aln = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    tree = LoadTree(treestring=doc['tree'].encode('utf-8'))
    code = get_genetic_code(gc)

    if model != 'NG':
        # Trim terminal stop codons
        aln = aln.withoutTerminalStopCodons(code)

        def _allowed_only(column):
            permitted = set(DNA)
            if model_gaps:
                permitted = permitted.union({'-'})
            return set(''.join(column)) <= permitted

        aln = aln.filtered(_allowed_only, motif_length=3)

    fit_result = _fit(aln, tree, model, code, omega_indep, model_gaps,
                      indel_indep)
    flat_lf, elapsed = fit_result
    return dict(
        lf=flat_lf,
        time=elapsed,
        model=model,
        gc=code.Name,
        omega_indep=omega_indep,
        model_gaps=model_gaps,
        indel_indep=indel_indep,
    )
def main():
    """Write protein and DNA FASTA files holding the best sequence per record.

    Reads FASTA records from the global ``infile`` and passes each record to
    findBestSeq, which returns an (amino-acid, DNA) pair stored under the
    record's name.  The amino-acid collection is written to the global
    ``outfile``; the DNA collection goes to a second file named from the
    part of ``outfile`` before its first '.' plus '_dna.fasta'.
    """
    dna_dict = SeqIO.to_dict(SeqIO.parse(infile, 'fasta'))

    new_aa_dict = {}
    new_dna_dict = {}
    # .items() works on both Python 2 and 3, unlike the py2-only .iteritems()
    for seqname, sequence in dna_dict.items():
        new_aa_seq, new_dna_seq = findBestSeq(sequence)
        new_aa_dict[seqname] = new_aa_seq
        new_dna_dict[seqname] = new_dna_seq

    unaligned_aa = LoadSeqs(data=new_aa_dict, moltype=PROTEIN, aligned=False)
    new_unaligned_dna = LoadSeqs(data=new_dna_dict, moltype=DNA, aligned=False)

    # render both outputs before touching the filesystem, so a formatting
    # failure leaves no partial output behind
    aa_outstring = unaligned_aa.toFasta()
    dna_outstring = new_unaligned_dna.toFasta()

    # NOTE(review): split('.')[0] cuts at the FIRST dot, so a path such as
    # './out.fasta' yields an empty stem -- confirm callers never pass one.
    dna_outfile = outfile.split('.')[0] + '_dna.fasta'

    # context managers guarantee the handles are closed even if a write fails
    with open(outfile, "w") as aa_outfile_handle:
        aa_outfile_handle.write(aa_outstring)
    with open(dna_outfile, 'w') as dna_outfile_handle:
        dna_outfile_handle.write(dna_outstring)
Exemplo n.º 9
0
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta funcs always catches all seqs

        For each seqs_per_file value in range(1, 1000), a 59-sequence FASTA
        input is split and the lines of all resulting files are compared, as
        a sequence collection, with the original input.
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
        infile = in_seqs.toFasta().split('\n')

        # test seqs_per_file from 1 to 999 (range excludes its upper bound)
        for seqs_per_file in range(1, 1000):
            fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                          prefix='split_fasta_tests',
                                          suffix='')
            close(fd)
            # NOTE(review): the placeholder file created by mkstemp at
            # filename_prefix is not deleted here -- confirm it is cleaned
            # up elsewhere.

            actual = split_fasta(infile, seqs_per_file, filename_prefix)

            actual_seqs = []
            try:
                for fp in actual:
                    # read through a context manager so no handle is leaked
                    with open(fp) as handle:
                        actual_seqs.extend(handle)
            finally:
                # remove the files now, so if the test fails they still get
                # cleaned up
                remove_files(actual)

            # building seq collections from infile and the split files result in
            # equivalent seq collections
            self.assertEqual(
                LoadSeqs(data=infile, aligned=False),
                LoadSeqs(data=actual_seqs, aligned=False))
Exemplo n.º 10
0
def pair_hmm_align_unaligned_seqs(seqs, moltype, params={}):
    """
        Globally align exactly two unaligned sequences.

        seqs is loaded with LoadSeqs as an unaligned collection of the
        given moltype.  params may override 'gap_open' (default 5),
        'gap_extend' (default 2) and 'score_matrix' (default built by
        make_dna_scoring_dict with match=1, transition=-1,
        transversion=-1); it is only read, never modified.

        Returns the result of global_pairwise; raises ValueError when the
        collection does not unpack into exactly two sequences.

        This needs to be moved to cogent.align.align
    """

    seqs = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        # call form of raise: valid on Python 2 and Python 3 alike
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    try:
        gap_open = params['gap_open']
    except KeyError:
        gap_open = 5
    try:
        gap_extend = params['gap_extend']
    except KeyError:
        gap_extend = 2
    try:
        score_matrix = params['score_matrix']
    except KeyError:
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return global_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
Exemplo n.º 11
0
 def test_paralinear_pair_aa(self):
     """paralinear shouldn't fail to produce distances for aa seqs"""
     dna_aln = LoadSeqs('data/brca1_5.paml', moltype=DNA)
     protein_aln = dna_aln.getTranslation()
     calculator = ParalinearPair(moltype=PROTEIN, alignment=protein_aln)
     calculator.run(show_progress=False)
     # the test only requires that this call completes without raising
     calculator.getPairwiseDistances()
Exemplo n.º 12
0
    def test_replaceSeqs(self):
        """synchronize gaps between protein seqs and codon seqs"""
        protein_data = {
            'FlyingFox': 'C-TNAH',
            'DogFaced': 'CGTNT-',
            'FreeTaile': '-GTDTH',
            'LittleBro': 'C-TD-H',
            'TombBat': 'C--STH'
        }
        codon_data = {
            'TombBat': 'TGTAGTACTCAT',
            'FreeTaile': 'GGCACAGATACTCAT',
            'FlyingFox': 'TGTACAAATGCTCAT',
            'LittleBro': 'TGTACAGATCAT',
            'DogFaced': 'TGTGGCACAAATACT'
        }
        # each protein gap should become a three-character gap in the codons
        expected = [
            ('FlyingFox', 'TGT---ACAAATGCTCAT'),
            ('DogFaced', 'TGTGGCACAAATACT---'),
            ('FreeTaile', '---GGCACAGATACTCAT'),
            ('LittleBro', 'TGT---ACAGAT---CAT'),
            ('TombBat', 'TGT------AGTACTCAT'),
        ]

        protein_aln = LoadSeqs(moltype=PROTEIN, data=protein_data)
        codon_seqs = LoadSeqs(moltype=DNA, data=codon_data, aligned=False)
        gapped = protein_aln.replaceSeqs(codon_seqs).todict()

        for taxon, expected_sequence in expected:
            self.assertEqual(gapped[taxon], expected_sequence)
Exemplo n.º 13
0
    def test_getBySequenceAnnotation(self):
        """An annotation added to one sequence selects alignment columns."""
        aln = LoadSeqs(data={"a": "ATCGAAATCGAT", "b": "ATCGA--TCGAT"})
        annotated = aln.getSeq("b")
        annotated.addAnnotation(Feature, "test_type", "test_label", [(4, 6)])

        regions = aln.getBySequenceAnnotation("b", "test_type")
        expected = {"b": "A--T", "a": "AAAT"}
        self.assertEqual(regions[0].todict(), expected)
Exemplo n.º 14
0
def pair_hmm_align_unaligned_seqs(seqs,moltype,params={}):
    """
        Globally align exactly two unaligned sequences.

        seqs is loaded with LoadSeqs as an unaligned collection of the
        given moltype.  params is read (never modified) for the optional
        keys 'gap_open' (default 5), 'gap_extend' (default 2) and
        'score_matrix' (default from make_dna_scoring_dict with match=1,
        transition=-1, transversion=-1).

        Returns whatever global_pairwise returns.  Raises ValueError when
        the collection does not hold exactly two sequences.

        This needs to be moved to cogent.align.align
    """

    seqs = LoadSeqs(data=seqs,moltype=moltype,aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        # the comma form of raise is a syntax error on Python 3
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    try:
        gap_open = params['gap_open']
    except KeyError:
        gap_open = 5
    try:
        gap_extend = params['gap_extend']
    except KeyError:
        gap_extend = 2
    try:
        score_matrix = params['score_matrix']
    except KeyError:
        score_matrix = make_dna_scoring_dict(
            match=1,transition=-1,transversion=-1)

    return global_pairwise(s1,s2,score_matrix,gap_open,gap_extend)
Exemplo n.º 15
0
 def test_reversecomplement(self):
     """test reverse complementing of Alignments and SequenceCollection."""
     gapped = {
         'seq1': '--ACGT--GT---',
         'seq2': 'TTACGTA-GT---',
         'seq3': '--ACGTA-GCC--'
     }
     gapped_rc = {
         'seq1': '---AC--ACGT--',
         'seq2': '---AC-TACGTAA',
         'seq3': '--GGC-TACGT--'
     }
     # alignment with gaps
     alignment = LoadSeqs(data=gapped, moltype=DNA)
     self.assertEqual(alignment.rc().todict(), gapped_rc)

     # check collection, with gaps
     collection = LoadSeqs(data=gapped, moltype=DNA, aligned=False)
     collection_rc = collection.rc()
     self.assertEqual(collection_rc.todict(), gapped_rc)
     # rc() and reversecomplement() must agree
     self.assertEqual(collection_rc.todict(),
                      collection.reversecomplement().todict())

     # collection with no gaps
     ungapped = {'seq1': 'ACGTGT', 'seq2': 'TTACGTAGT', 'seq3': 'ACGTAGCC'}
     ungapped_rc = {'seq1': 'ACACGT', 'seq2': 'ACTACGTAA', 'seq3': 'GGCTACGT'}
     collection = LoadSeqs(data=ungapped, moltype=DNA, aligned=False)
     self.assertEqual(collection.rc().todict(), ungapped_rc)
Exemplo n.º 16
0
    def test_getBySequenceAnnotation(self):
        """Fetch the alignment slice covered by an annotation on seq 'b'."""
        seq_data = {'a': 'ATCGAAATCGAT', 'b': 'ATCGA--TCGAT'}
        aln = LoadSeqs(data=seq_data)
        aln.getSeq('b').addAnnotation(
            Feature, 'test_type', 'test_label', [(4, 6)])

        first_region = aln.getBySequenceAnnotation('b', 'test_type')[0]
        self.assertEqual(first_region.todict(), {'b': 'A--T', 'a': 'AAAT'})
Exemplo n.º 17
0
 def test_paralinear_pair_aa(self):
     """paralinear shouldn't fail to produce distances for aa seqs"""
     translated = LoadSeqs('data/brca1_5.paml', moltype=DNA).getTranslation()
     calc = ParalinearPair(moltype=PROTEIN, alignment=translated)
     calc.run(show_progress=False)
     # no assertion: completing without an exception is the pass criterion
     calc.getPairwiseDistances()
Exemplo n.º 18
0
    def test_replaceSeqs(self):
        """synchronize gaps between protein seqs and codon seqs"""
        proteins = {'FlyingFox': 'C-TNAH',
                    'DogFaced':  'CGTNT-',
                    'FreeTaile': '-GTDTH',
                    'LittleBro': 'C-TD-H',
                    'TombBat':   'C--STH'}
        codons = {'TombBat':   'TGTAGTACTCAT',
                  'FreeTaile': 'GGCACAGATACTCAT',
                  'FlyingFox': 'TGTACAAATGCTCAT',
                  'LittleBro': 'TGTACAGATCAT',
                  'DogFaced':  'TGTGGCACAAATACT'}

        aligned_proteins = LoadSeqs(moltype=PROTEIN, data=proteins)
        unaligned_codons = LoadSeqs(moltype=DNA, data=codons, aligned=False)
        codon_aln = aligned_proteins.replaceSeqs(unaligned_codons)
        observed = codon_aln.todict()

        # checked in a fixed order so the first failure is reproducible
        taxa = ('FlyingFox', 'DogFaced', 'FreeTaile', 'LittleBro', 'TombBat')
        expected = {'FlyingFox': 'TGT---ACAAATGCTCAT',
                    'DogFaced':  'TGTGGCACAAATACT---',
                    'FreeTaile': '---GGCACAGATACTCAT',
                    'LittleBro': 'TGT---ACAGAT---CAT',
                    'TombBat':   'TGT------AGTACTCAT'}
        for taxon in taxa:
            self.assertEqual(observed[taxon], expected[taxon])
Exemplo n.º 19
0
 def est_logdet_pair_aa(self):
     """logdet shouldn't fail to produce distances for aa seqs

     NOTE(review): the name lacks the 'test' prefix, so a default unittest
     loader would presumably skip it -- confirm this is intentional.
     """
     dna_aln = LoadSeqs('data/brca1_5.paml', moltype=DNA)
     protein_aln = dna_aln.getTranslation()
     calculator = LogDetPair(moltype=PROTEIN, alignment=protein_aln)
     calculator.run(use_tk_adjustment=True, show_progress=False)
     # only checks that the distances can be produced without raising
     calculator.getPairwiseDistances()
Exemplo n.º 20
0
def BestLogLikelihood(aln, alphabet=None, exclude_chars = None,
    allowed_chars='ACGT', motif_length=None, return_length=False):
    """returns the best log-likelihood according to Goldman 1993.

    Arguments:
        - aln: an alignment providing todict()
        - alphabet: a sequence alphabet object; when given, it supplies
          both the moltype and the motif length
        - motif_length: 1 for nucleotide, 2 for dinucleotide, etc ..
        - exclude_chars: a series of characters used to exclude motifs
        - allowed_chars: only motifs that contain a subset of these are
          allowed
        - return_length: whether to also return the number of alignment columns

    Either alphabet or motif_length must be provided.
    """
    assert alphabet or motif_length, "Must provide either an alphabet or a"\
                                     " motif_length"
    # need to use the alphabet, so we can enforce character compliance
    load_kwargs = {}
    if alphabet:
        load_kwargs['moltype'] = alphabet.MolType
        motif_length = alphabet.getMotifLen()

    reloaded = LoadSeqs(data=aln.todict(), **load_kwargs)
    columns = aligned_columns_to_rows(reloaded, motif_length, exclude_chars,
                                        allowed_chars)
    num_cols = len(columns)
    log_likelihood = get_G93_lnL_from_array(columns)

    if not return_length:
        return log_likelihood
    return log_likelihood, num_cols
Exemplo n.º 21
0
def BestLogLikelihood(aln,
                      alphabet=None,
                      exclude_chars=None,
                      allowed_chars='ACGT',
                      motif_length=None,
                      return_length=False):
    """returns the best log-likelihood according to Goldman 1993.

    Arguments:
        - aln: alignment whose todict() output is reloaded with LoadSeqs
        - alphabet: a sequence alphabet object (overrides motif_length
          with its own motif length and sets the moltype)
        - motif_length: 1 for nucleotide, 2 for dinucleotide, etc ..
        - exclude_chars: a series of characters used to exclude motifs
        - allowed_chars: only motifs that contain a subset of these are
          allowed
        - return_length: whether to also return the number of alignment columns

    Returns the log-likelihood, or (log-likelihood, number of columns)
    when return_length is true.
    """
    assert alphabet or motif_length, "Must provide either an alphabet or a"\
                                     " motif_length"
    # need to use the alphabet, so we can enforce character compliance
    if alphabet:
        moltype_kw = {'moltype': alphabet.MolType}
        motif_length = alphabet.getMotifLen()
    else:
        moltype_kw = {}

    typed_aln = LoadSeqs(data=aln.todict(), **moltype_kw)
    column_rows = aligned_columns_to_rows(
        typed_aln, motif_length, exclude_chars, allowed_chars)
    column_count = len(column_rows)
    lnL = get_G93_lnL_from_array(column_rows)

    result = (lnL, column_count) if return_length else lnL
    return result
Exemplo n.º 22
0
 def test_logdet_pair_aa(self):
     """logdet shouldn't fail to produce distances for aa seqs"""
     translated = LoadSeqs('data/brca1_5.paml', moltype=DNA).getTranslation()
     calc = LogDetPair(moltype=PROTEIN, alignment=translated)
     calc.run(use_tk_adjustment=True, show_progress=False)
     # passing means no exception was raised while fetching the distances
     calc.getPairwiseDistances()
Exemplo n.º 23
0
def pair_hmm_align_unaligned_seqs(seqs, moltype=DNA, params={}):
    """
        Run a local pairwise alignment on a pair of unaligned sequences.

        The input is loaded through LoadSeqs (aligned=False) and must
        contain exactly two sequences, otherwise ValueError is raised.
        Alignment settings come from params, falling back to
        gap_open=5, gap_extend=2 and a scoring dict made with match=1,
        transition=-1, transversion=-1.

        Code from Greg Caporaso.
    """
    loaded = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        seq_a, seq_b = loaded.values()
    except ValueError:
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    gap_open = params['gap_open'] if 'gap_open' in params else 5
    gap_extend = params['gap_extend'] if 'gap_extend' in params else 2
    if 'score_matrix' not in params:
        # build the default matrix only when none was supplied
        score_matrix = make_dna_scoring_dict(match=1,
                                             transition=-1,
                                             transversion=-1)
    else:
        score_matrix = params['score_matrix']

    return local_pairwise(seq_a, seq_b, score_matrix, gap_open, gap_extend)
Exemplo n.º 24
0
    def setUp(self):
        """Create the temporary fixture files and expected results.

        Writes the input FASTA and four variants of the template alignment
        (original, '-' replaced by '.', 'T' replaced by 'U', lower-cased),
        touches the result/failure/log files so they can always be removed,
        records every path in self._paths_to_clean_up, and builds the
        aligner plus the expected alignment and expected failures.
        """
        def write_temp(suffix, contents):
            # create a uniquely named temp file and close it right after
            # writing, so the data is flushed before any test reads it
            fp = get_tmp_filename(prefix='PyNastAlignerTests_', suffix=suffix)
            with open(fp, 'w') as handle:
                handle.write(contents)
            return fp

        self.pynast_test1_input_fp = write_temp(
            '.fasta', pynast_test1_input_fasta)

        self.pynast_test1_template_fp = write_temp(
            'template.fasta', pynast_test1_template_fasta)

        self.pynast_test_template_w_dots_fp = write_temp(
            'template.fasta', pynast_test1_template_fasta.replace('-', '.'))

        self.pynast_test_template_w_u_fp = write_temp(
            'template.fasta', pynast_test1_template_fasta.replace('T', 'U'))

        self.pynast_test_template_w_lower_fp = write_temp(
            'template.fasta', pynast_test1_template_fasta.lower())

        # create temp file names (and touch them so we can reliably
        # clean them up)
        self.result_fp = write_temp('.fasta', '')
        self.failure_fp = write_temp('.fasta', '')
        self.log_fp = write_temp('.log', '')

        self._paths_to_clean_up = [
            self.pynast_test1_input_fp,
            self.result_fp,
            self.failure_fp,
            self.log_fp,
            self.pynast_test1_template_fp,
            self.pynast_test_template_w_dots_fp,
            self.pynast_test_template_w_u_fp,
            self.pynast_test_template_w_lower_fp
        ]

        self.pynast_test1_aligner = PyNastAligner({
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 15,
        })

        self.pynast_test1_expected_aln = \
            LoadSeqs(
                data=pynast_test1_expected_alignment,
                aligned=DenseAlignment)
        self.pynast_test1_expected_fail = \
            LoadSeqs(data=pynast_test1_expected_failure, aligned=False)
Exemplo n.º 25
0
class TestCigar(unittest.TestCase):
    """Tests for the conversions between gapped sequences, Maps and cigars."""

    def setUp(self):
        """Build two gapped DNA sequences with their maps, cigars and alignment."""
        self.cigar_text = '3D2M3D6MDM2D3MD'
        self.aln_seq = DNA.makeSequence('---AA---GCTTAG-A--CCT-')
        self.aln_seq1 = DNA.makeSequence('CCAAAAAA---TAGT-GGC--G')
        self.map, self.seq = self.aln_seq.parseOutGaps()
        self.map1, self.seq1 = self.aln_seq1.parseOutGaps()
        self.slices = [(1, 4), (0, 8), (7, 12), (0, 1), (3, 5)]
        self.aln = LoadSeqs(data = {"FAKE01": self.aln_seq, "FAKE02": self.aln_seq1})
        self.cigars = {"FAKE01": self.cigar_text, "FAKE02": map_to_cigar(self.map1)}
        self.seqs = {"FAKE01": str(self.seq), "FAKE02": str(self.seq1)}

    # The checks below use unittest assertion methods rather than bare
    # ``assert`` so they still run under ``python -O`` and report both
    # operands on failure.

    def test_map_to_cigar(self):
        """convert a Map to cigar string"""
        self.assertEqual(map_to_cigar(self.map), self.cigar_text)

    def test_cigar_to_map(self):
        """test generating a Map from cigar"""
        # named cigar_map to avoid shadowing the builtin map
        cigar_map = cigar_to_map(self.cigar_text)
        self.assertEqual(str(cigar_map), str(self.map))

    def test_aligned_from_cigar(self):
        """test generating aligned seq from cigar"""
        aligned_seq = aligned_from_cigar(self.cigar_text, self.seq)
        self.assertEqual(aligned_seq, self.aln_seq)

    def test_slice_cigar(self):
        """test slicing cigars"""
        for start, end in self.slices:
            # test by_align = True
            map1, loc1 = slice_cigar(self.cigar_text, start, end)
            ori1 = self.aln_seq[start:end]
            if loc1:
                slicealn1 = self.seq[loc1[0]:loc1[1]].gappedByMap(map1)
                self.assertEqual(ori1, slicealn1)
            else:
                # no location returned: only the map length can be compared
                self.assertEqual(map1.length, len(ori1))

            # test by_align = False
            map2, loc2 = slice_cigar(self.cigar_text, start, end, by_align = False)
            slicealn2 = self.seq[start:end].gappedByMap(map2)
            ori2 = self.aln_seq[loc2[0]:loc2[1]]
            self.assertEqual(slicealn2, ori2)

    def test_CigarParser(self):
        """test without slice"""
        aln = CigarParser(self.seqs, self.cigars)
        self.assertEqual(aln, self.aln)
        # test slice
        for i, (start, end) in enumerate(self.slices, 1):
            self.aln.getSeq("FAKE01").addFeature("annot%d"%i, "annot", [(start, end)])
            annot = self.aln.getAnnotationsFromAnySequence("annot%d"%i)
            slice_aln = aln.getRegionCoveringAll(annot).asOneSpan().getSlice()

            cmp_aln = CigarParser(self.seqs, self.cigars, sliced = True,
                                  ref_seqname = "FAKE01", start = start, end = end)
            self.assertEqual(cmp_aln, slice_aln)
Exemplo n.º 26
0
class TestCigar(unittest.TestCase):
    """Round-trip tests between gapped sequences, Map objects and cigar text."""

    def setUp(self):
        """Prepare the gapped sequences, maps, cigar strings and alignment."""
        self.cigar_text = '3D2M3D6MDM2D3MD'
        self.aln_seq = DNA.makeSequence('---AA---GCTTAG-A--CCT-')
        self.aln_seq1 = DNA.makeSequence('CCAAAAAA---TAGT-GGC--G')
        self.map, self.seq = self.aln_seq.parseOutGaps()
        self.map1, self.seq1 = self.aln_seq1.parseOutGaps()
        self.slices = [(1, 4), (0, 8), (7, 12), (0, 1), (3, 5)]
        self.aln = LoadSeqs(data = {"FAKE01": self.aln_seq, "FAKE02": self.aln_seq1})
        self.cigars = {"FAKE01": self.cigar_text, "FAKE02": map_to_cigar(self.map1)}
        self.seqs = {"FAKE01": str(self.seq), "FAKE02": str(self.seq1)}

    # unittest assertion methods are used instead of bare ``assert``:
    # they are not stripped by ``python -O`` and they print both values
    # when a comparison fails.

    def test_map_to_cigar(self):
        """convert a Map to cigar string"""
        observed = map_to_cigar(self.map)
        self.assertEqual(observed, self.cigar_text)

    def test_cigar_to_map(self):
        """test generating a Map from cigar"""
        # local renamed from ``map`` so the builtin is not shadowed
        parsed_map = cigar_to_map(self.cigar_text)
        self.assertEqual(str(parsed_map), str(self.map))

    def test_aligned_from_cigar(self):
        """test generating aligned seq from cigar"""
        aligned_seq = aligned_from_cigar(self.cigar_text, self.seq)
        self.assertEqual(aligned_seq, self.aln_seq)

    def test_slice_cigar(self):
        """test slicing cigars"""
        for start, end in self.slices:
            # test by_align = True
            map1, loc1 = slice_cigar(self.cigar_text, start, end)
            ori1 = self.aln_seq[start:end]
            if loc1:
                slicealn1 = self.seq[loc1[0]:loc1[1]].gappedByMap(map1)
                self.assertEqual(ori1, slicealn1)
            else:
                # without a location only the lengths are comparable
                self.assertEqual(map1.length, len(ori1))

            # test by_align = False
            map2, loc2 = slice_cigar(self.cigar_text, start, end, by_align = False)
            slicealn2 = self.seq[start:end].gappedByMap(map2)
            ori2 = self.aln_seq[loc2[0]:loc2[1]]
            self.assertEqual(slicealn2, ori2)

    def test_CigarParser(self):
        """test without slice"""
        aln = CigarParser(self.seqs, self.cigars)
        self.assertEqual(aln, self.aln)
        # test slice
        for number, (start, end) in enumerate(self.slices, 1):
            label = "annot%d" % number
            self.aln.getSeq("FAKE01").addFeature(label, "annot", [(start, end)])
            annot = self.aln.getAnnotationsFromAnySequence(label)
            slice_aln = aln.getRegionCoveringAll(annot).asOneSpan().getSlice()

            cmp_aln = CigarParser(self.seqs, self.cigars, sliced = True,
                                  ref_seqname = "FAKE01", start = start, end = end)
            self.assertEqual(cmp_aln, slice_aln)
Exemplo n.º 27
0
 def _loadfromfile(self, filename, test_write=True, **kw):
     """Load a data file with LoadSeqs and optionally write it back out.

     filename is resolved relative to data_path; extra keyword arguments
     are forwarded to LoadSeqs.  When test_write is true the loaded
     sequences are written to a temporary file that carries the same
     extension as the input, and that file is then deleted.
     """
     filename = os.path.join(data_path, filename)
     aln = LoadSeqs(filename=filename, **kw)
     if test_write:
         suffix = filename.split('.')[-1]
         # mkstemp creates the file atomically; the deprecated mktemp only
         # returns a name and is open to a race with other processes
         fd, fn = tempfile.mkstemp(suffix='.'+suffix)
         os.close(fd)
         try:
             aln.writeToFile(filename=fn)
         finally:
             # remove the temp file even when writing fails
             os.remove(fn)
Exemplo n.º 28
0
 def _loadfromfile(self, filename, test_write=True, **kw):
     """Check that a data file loads and, optionally, can be re-written.

     The path is built from data_path and filename, and kw is passed on
     to LoadSeqs.  With test_write set, the result is written to a fresh
     temporary file with the input's extension, which is always removed
     afterwards.
     """
     filename = os.path.join(data_path, filename)
     aln = LoadSeqs(filename=filename, **kw)
     if not test_write:
         return
     suffix = filename.split('.')[-1]
     # use mkstemp instead of the race-prone, deprecated mktemp: the file
     # exists (and is ours) from the moment the name is known
     fd, fn = tempfile.mkstemp(suffix='.' + suffix)
     os.close(fd)
     try:
         aln.writeToFile(filename=fn)
     finally:
         # never leave the temporary file behind, even on a failed write
         os.remove(fn)
Exemplo n.º 29
0
 def test_partimatrix(self):
     """Draw a partimatrix for the first 500 columns of five brca1 seqs."""
     species5 = ['Human','HowlerMon','Mouse','NineBande','DogFaced']
     full_aln = LoadSeqs(filename='data/brca1.fasta', moltype=DNA)
     subset = full_aln.takeSeqs(species5)
     truncated = subset[:500]
     options = dict(samples=0, display=True, print_stats=False,
                    s_limit=10, title="brca1")
     fig = partimatrix(truncated, **options)
     test_figure('compatibility', fig)
Exemplo n.º 30
0
 def setUp(self):
     """Load the same four sequences as an alignment and as a collection."""
     seq_data = {'a':'GTACGTACGATC',
                 'b':'GTACGTACGTAC',
                 'c':'GTACGTACGTTC',
                 'e':'GTACGTACTGGT'}
     # each loader receives its own copy of the data
     self.al = LoadSeqs(data = dict(seq_data))
     self.collection = LoadSeqs(data = dict(seq_data), aligned=False)
Exemplo n.º 31
0
 def test_withoutAnyGaps(self):
     """test removal of all gaps (any entries in alignment column are gaps)"""
     cases = [
         ({'seq1': '--ACGT--GT---', 'seq2': '--ACGTA-GT---', 'seq3': '--ACGTA-GT---'},
          {'seq1':'ACGTGT', 'seq2':'ACGTGT', 'seq3':'ACGTGT'}),
         # every column has a gap somewhere, so nothing is left
         ({'seq1': 'ACGT', 'seq2': '----', 'seq3': '----'},
          {'seq1':'', 'seq2':'', 'seq3':''}),
     ]
     for seq_data, expected in cases:
         alignment = LoadSeqs(data=seq_data)
         ungapped = alignment.omitGapPositions(allowed_gap_frac=0)
         self.assertEqual(ungapped.todict(), expected)
Exemplo n.º 32
0
 def test_getBySequenceAnnotation(self):
     """Alignment region selected through an annotation on sequence 'b'."""
     aln = LoadSeqs(data={'a': 'ATCGAAATCGAT', 'b': 'ATCGA--TCGAT'})
     target = aln.getSeq('b')
     target.addAnnotation(Feature, 'test_type', 'test_label', [(4,6)])

     matches = aln.getBySequenceAnnotation('b', 'test_type')
     observed = matches[0].todict()
     self.assertEqual(observed, {'b':'A--T', 'a':'AAAT'})
Exemplo n.º 33
0
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None):
        """Align the sequences in seq_path against the template alignment.

        seq_path: FASTA file of candidate sequences.
        result_path: if given, aligned sequences are written there as FASTA
            and None is returned.
        log_path: passed to NastLogger.
        failure_path: if given, sequences that failed to align are written
            there as FASTA.

        Without result_path the aligned sequences are returned as a
        DenseAlignment, or {} when LoadSeqs raises ValueError.  Raises
        KeyError when loading the template alignment raises KeyError.
        """
        # Keep the candidate file open for the whole call: parse_fasta's
        # result is only consumed later, inside pynast_seqs.
        # NOTE(review): mode 'U' was removed in Python 3.11 -- revisit if
        # this code is ever run on Python 3.
        with open(seq_path, 'U') as seq_file:
            # load candidate sequences
            candidate_sequences = parse_fasta(seq_file)

            # load template sequences
            template_alignment_fp = self.Params['template_filepath']
            with open(template_alignment_fp) as template_file:
                # replace '.' characters with '-' characters
                template_alignment = [
                    (seq_id, seq.replace('.', '-').upper())
                    for seq_id, seq in parse_fasta(template_file)]
            try:
                template_alignment = LoadSeqs(data=template_alignment, moltype=DNA,
                                              aligned=DenseAlignment)
            except KeyError as e:
                raise KeyError('Only ACGT-. characters can be contained in template alignments.' +
                               ' The offending character was: %s' % e)

            # initialize_logger
            logger = NastLogger(log_path)

            # get function for pairwise alignment method
            pairwise_alignment_f = pairwise_alignment_methods[
                self.Params['pairwise_alignment_method']]

            pynast_aligned, pynast_failed = pynast_seqs(
                candidate_sequences,
                template_alignment,
                min_pct=self.Params['min_pct'],
                min_len=self.Params['min_len'],
                align_unaligned_seqs_f=pairwise_alignment_f,
                logger=logger,
                temp_dir=get_qiime_temp_dir())

            logger.record(str(self))

            if failure_path is not None:
                # context manager closes the file even if a write fails
                with open(failure_path, 'w') as fail_file:
                    for seq in pynast_failed:
                        fail_file.write(seq.toFasta())
                        fail_file.write('\n')

            if result_path is not None:
                with open(result_path, 'w') as result_file:
                    for seq in pynast_aligned:
                        result_file.write(seq.toFasta())
                        result_file.write('\n')
                return None

            try:
                return LoadSeqs(data=pynast_aligned, aligned=DenseAlignment)
            except ValueError:
                return {}
Exemplo n.º 34
0
def makeSampleAlignment():
    """Return a two-sequence sample alignment carrying four annotations.

    'FAKE01' is built by makeSampleSequence() and 'FAKE02' by
    makeSampleSequence(mid_gaps=True).
    """
    sequences = {
        'FAKE01': makeSampleSequence(),
        'FAKE02': makeSampleSequence(mid_gaps=True),
    }
    alignment = LoadSeqs(data = sequences)

    # positional arguments handed to addAnnotation after the Feature class
    annotation_args = (
        ('misc_feature', 'misc', [(12,25)]),
        ('CDS', 'blue', [(15, 25)]),
        ("5'UTR", 'red', [(2, 4)]),
        ("LTR", "fake", [(2,15)]),
    )
    for first, second, spans in annotation_args:
        alignment.addAnnotation(Feature, first, second, spans)
    return alignment
Exemplo n.º 35
0
    def setUp(self):
        """Create the substitution model, alignment and tree fixtures."""
        model_options = dict(do_scaling=True,
                             model_gaps=False,
                             equal_motif_probs=True,
                             predicates={'beta': 'transition'})
        self.submodel = Nucleotide(**model_options)

        # the alignment is loaded with the model's own moltype
        alignment_path = os.path.join(data_path, 'brca1_5.paml')
        self.data = LoadSeqs(filename=alignment_path,
                             moltype=self.submodel.MolType)

        tree_path = os.path.join(data_path, 'brca1_5.tree')
        self.tree = LoadTree(filename=tree_path)
Exemplo n.º 36
0
    def setUp(self):
        """Load the sequence fixtures and build the sequence database path."""
        single_seq_fasta = \
            '>seq0\nACUGCGCGGAUCGAUCGAUCGAUCGAUGCAUUUUACGAUCGCCA\n'
        self.random_seq = LoadSeqs(data=single_seq_fasta, aligned=False)

        # RRNA is loaded unaligned; REF_ALN uses LoadSeqs' default
        self.rrna = LoadSeqs(data=RRNA, aligned=False)
        self.rrna_aln = LoadSeqs(data=REF_ALN)

        db_path_parts = (ABSPATH, 'test_data', 'Rfam10_part.fasta')
        self.seq_db_path = os.path.join(*db_path_parts)
Exemplo n.º 37
0
 def setUp(self):
     """Build gapped sequences, their gap maps, slices and per-name lookups."""
     names = ("FAKE01", "FAKE02")
     self.cigar_text = '3D2M3D6MDM2D3MD'

     first_gapped = DNA.makeSequence('---AA---GCTTAG-A--CCT-')
     second_gapped = DNA.makeSequence('CCAAAAAA---TAGT-GGC--G')
     self.aln_seq = first_gapped
     self.aln_seq1 = second_gapped

     # parseOutGaps returns a pair that is stored as (map, seq)
     self.map, self.seq = first_gapped.parseOutGaps()
     self.map1, self.seq1 = second_gapped.parseOutGaps()

     self.slices = [(1, 4), (0, 8), (7, 12), (0, 1), (3, 5)]
     self.aln = LoadSeqs(data = dict(zip(names, (first_gapped, second_gapped))))
     # FAKE01 uses the literal cigar text, FAKE02 one derived from its map
     self.cigars = dict(zip(names, (self.cigar_text, map_to_cigar(self.map1))))
     self.seqs = dict(zip(names, (str(self.seq), str(self.seq1))))
Exemplo n.º 38
0
 def test_call_write_to_file(self):
     """ReferenceRepSetPicker.__call__ writes the expected seqs to result_path"""
     picker_params = {'Algorithm':'first', 'ChoiceF':first_id}
     picker = ReferenceRepSetPicker(params=picker_params)
     picker(self.tmp_seq_filepath,
            self.tmp_otu_filepath,
            self.ref_seq_filepath,
            result_path=self.result_filepath)

     # compare what was written to disk against the expected fasta data
     written = LoadSeqs(self.result_filepath,aligned=False)
     expected = LoadSeqs(data=rep_seqs_reference_result_file_exp,aligned=False)
     self.assertEqual(written, expected)
Exemplo n.º 39
0
 def test_partimatrix(self):
     """Render a partimatrix figure for a five-sequence [:500] slice of brca1."""
     species5 = ['Human', 'HowlerMon', 'Mouse', 'NineBande', 'DogFaced']
     full_aln = LoadSeqs(filename='data/brca1.fasta', moltype=DNA)
     subset = full_aln.takeSeqs(species5)[:500]

     plot_options = dict(samples=0,
                         display=True,
                         print_stats=False,
                         s_limit=10,
                         title="brca1")
     fig = partimatrix(subset, **plot_options)
     test_figure('compatibility', fig)
Exemplo n.º 40
0
 def test_translate(self):
     """getTranslation gives two seqs with a DNA moltype; without one an
     AttributeError is tolerated."""
     datasets = (
         {'seq1': 'GATTTT', 'seq2': 'GATC??'},
         {'seq1': 'GAT---', 'seq2': '?GATCT'},
     )
     for seqs in datasets:
         typed = LoadSeqs(data=seqs, moltype=DNA)
         self.assertEqual(len(typed.getTranslation()), 2)

         # no moltype given: an AttributeError here is swallowed, and a
         # successful call is accepted as well
         untyped = LoadSeqs(data=seqs)
         try:
             untyped.getTranslation()
         except AttributeError:
             pass
Exemplo n.º 41
0
def ml(doc, model='GNC', gc=None, outgroup=None, neutral=None, **kw):
    """Fit `model` to the alignment and tree held in `doc`.

    `doc['aln']` and `doc['tree']` are UTF-8 encoded before being loaded.
    Terminal stop codons are removed and the alignment is filtered with
    motif_length=3 so that only motifs made of DNA characters remain.
    Extra keyword arguments are accepted and ignored.

    Returns a dict with keys 'lf', 'time', 'model' and 'gc'.
    """
    alignment = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    tree = LoadTree(treestring=doc['tree'].encode('utf-8'))
    code = get_genetic_code(gc)

    def only_dna_chars(column):
        # keep a column only when every character belongs to DNA
        return set(''.join(column)) <= set(DNA)

    # Trim terminal stop codons, then drop columns with non-DNA characters
    trimmed = alignment.withoutTerminalStopCodons(code)
    cleaned = trimmed.filtered(only_dna_chars, motif_length=3)

    flat_lf, elapsed = _fit(cleaned, tree, model, code, outgroup, neutral)
    return {'lf': flat_lf, 'time': elapsed, 'model': model, 'gc': code.Name}
Exemplo n.º 42
0
def _fit_symmetric_tree(model, tree, aln):
    """Optimise `tree` on `aln` and return ([p, q, r], log-likelihood)."""
    lf = model.makeLikelihoodFunction(tree)
    lf.setAlignment(aln)
    lf.optimise(local=True)
    log_likelihood = lf.getLogLikelihood()

    def length(edge):
        return lf.getParamValue('length', edge)

    # the tree is treated as symmetric, so p, q, r are derived from the six
    # optimised branch lengths
    p = (length('a') + length('c')) / 2.0
    q = (length('b') + length('d')) / 2.0
    r = length('edge.1') + length('edge.0')
    return [p, q, r], log_likelihood


def optimization(result, aln, tree1, tree2):
    """Split `aln` between two trees by `result` and optimise each under JC69.

    `result[i]` decides where `aln[i]` goes: 1 -> tree1 only, 2 -> tree2
    only, 0 -> both trees. Any other value leaves the site out of both.

    Returns (tree_parameter, likelihood), where tree_parameter is
    [[p1, q1, r1], [p2, q2, r2]] and likelihood is the sum of the two
    log-likelihoods.
    """
    # start from empty alignments over the four taxa and append sites
    taxa_without_sites = [('a', ''), ('c', ''), ('b', ''), ('d', '')]
    aln1 = LoadSeqs(data=list(taxa_without_sites), moltype=DNA)
    aln2 = LoadSeqs(data=list(taxa_without_sites), moltype=DNA)
    for i in range(len(aln)):
        assignment = result[i]
        if assignment in (0, 1):
            aln1 = aln1 + aln[i]
        if assignment in (0, 2):
            aln2 = aln2 + aln[i]

    model = JC69()

    # tree1 is optimised before tree2, matching the original call order
    params1, likelihood1 = _fit_symmetric_tree(model, tree1, aln1)
    params2, likelihood2 = _fit_symmetric_tree(model, tree2, aln2)

    # likelihoods are logs, so adding them gives the total over all sites
    tree_parameter = [params1, params2]
    likelihood = likelihood1 + likelihood2

    return tree_parameter, likelihood
Exemplo n.º 43
0
 def test_degap(self):
     """degap removes gaps from both alignments and sequence collections"""
     gapped = {'seq1': '--ACGT--GT---', 'seq2': '--ACGTA-GT---',
               'seq3': '--ACGTA-GT---'}
     expect = {'seq1': 'ACGTGT', 'seq2': 'ACGTAGT', 'seq3': 'ACGTAGT'}

     # alignment (LoadSeqs default)
     aln = LoadSeqs(data=dict(gapped))
     self.assertEqual(aln.degap().todict(), expect)

     # unaligned collection: the moltype must carry over to the result
     collection = LoadSeqs(data=dict(gapped), aligned=False, moltype=DNA)
     degapped = collection.degap()
     self.assertEqual(degapped.todict(), expect)
     self.assertEqual(degapped.MolType, DNA)
Exemplo n.º 44
0
def makeSampleAlignment():
    """Return a two-sequence sample alignment carrying four annotations.

    'FAKE01' is built by makeSampleSequence() and 'FAKE02' by
    makeSampleSequence(with_gaps=True).
    """
    plain = makeSampleSequence()
    gapped = makeSampleSequence(with_gaps=True)
    alignment = LoadSeqs(data = {'FAKE01': plain, 'FAKE02': gapped})

    # positional arguments handed to addAnnotation after the Feature class
    for extra_args in (
            ('misc_feature', 'misc', [(12,25)]),
            ('CDS', 'blue', [(15, 25)]),
            ("5'UTR", 'red', [(2, 4)]),
            ("LTR", "fake", [(2,15)])):
        alignment.addAnnotation(Feature, *extra_args)
    return alignment
Exemplo n.º 45
0
 def test_sample_tuples(self):
     """sample() with motif_length=2 keeps columns aligned and never repeats a dinuc"""
     ##### test with motif size != 1 #####
     alignment = LoadSeqs(data={'seq1': 'AABBCCDDEEFFGGHHIIJJKKLLMMNNOOPP',
                                 'seq2': 'AABBCCDDEEFFGGHHIIJJKKLLMMNNOOPP'})
     # full-length sample: only checks that the call succeeds
     alignment.sample(motif_length=2)
     # ensure length correct
     sample = alignment.sample(10,motif_length=2)
     self.assertEqual(len(sample), 20)
     # test columns alignment preserved
     # values() is a non-indexable view on Python 3, so materialise it
     seqs = list(sample.todict().values())
     self.assertEqual(seqs[0], seqs[1])
     # ensure each char occurs twice as sampling dinucs without replacement
     for char in seqs[0]:
         self.assertEqual(seqs[0].count(char), 2)
Exemplo n.º 46
0
 def test_sample_tuples(self):
     """sample() with motif_length=2 keeps columns aligned and never repeats a dinuc"""
     ##### test with motif size != 1 #####
     alignment = LoadSeqs(data={'seq1': 'AABBCCDDEEFFGGHHIIJJKKLLMMNNOOPP',
                                 'seq2': 'AABBCCDDEEFFGGHHIIJJKKLLMMNNOOPP'})
     # full-length sample: only checks that the call succeeds
     alignment.sample(motif_length=2)
     # ensure length correct
     sample = alignment.sample(10,motif_length=2)
     self.assertEqual(len(sample), 20)
     # test columns alignment preserved
     # values() is a non-indexable view on Python 3, so materialise it
     seqs = list(sample.todict().values())
     self.assertEqual(seqs[0], seqs[1])
     # ensure each char occurs twice as sampling dinucs without replacement
     for char in seqs[0]:
         self.assertEqual(seqs[0].count(char), 2)
Exemplo n.º 47
0
def ml(doc, model='GNCClock', gc=None, outgroup=None, omega_indep=True, **kw):
    """Fit a clock `model` to the alignment and tree held in `doc`.

    `doc['aln']` and `doc['tree']` are UTF-8 encoded before being loaded.
    Unless `model` is 'NGClock', terminal stop codons are removed and the
    alignment is filtered with motif_length=3 so that only motifs made of
    DNA characters remain. Every alignment name other than `outgroup` is
    passed to `_fit` as the ingroup. Extra keyword arguments are ignored.

    Returns a dict with keys 'lf', 'time', 'model' and 'gc'.
    """
    alignment = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    tree = LoadTree(treestring=doc['tree'].encode('utf-8'))
    code = get_genetic_code(gc)

    if model != 'NGClock':
        def only_dna_chars(column):
            # keep a column only when every character belongs to DNA
            return set(''.join(column)) <= set(DNA)

        # Trim terminal stop codons, then drop columns with non-DNA chars
        trimmed = alignment.withoutTerminalStopCodons(code)
        alignment = trimmed.filtered(only_dna_chars, motif_length=3)

    ingroup = [name for name in alignment.Names if name != outgroup]
    flat_lf, elapsed = _fit(alignment, tree, model, code, ingroup,
                            omega_indep)
    return {'lf': flat_lf, 'time': elapsed, 'model': model, 'gc': code.Name}
Exemplo n.º 48
0
 def test_withoutRedundantGaps(self):
     """omitGapPositions drops the columns in which every sequence has a gap"""
     gapped = {
         'seq1': '--ACGT--GT---',
         'seq2': '--ACGTA-GT---',
         'seq3': '--ACGTA-GT---',
     }
     # seq1 keeps one gap because the other sequences have a base there
     expected = {
         'seq1': 'ACGT-GT',
         'seq2': 'ACGTAGT',
         'seq3': 'ACGTAGT',
     }
     stripped = LoadSeqs(data=gapped).omitGapPositions()
     self.assertEqual(stripped.todict(), expected)
Exemplo n.º 49
0
 def test_sample(self):
     """sample() keeps columns aligned and draws without replacement"""
     alignment = LoadSeqs(data={'seq1': 'ABCDEFGHIJKLMNOP',
                                 'seq2': 'ABCDEFGHIJKLMNOP'})
     # effectively permute columns, preserving length; only checks that
     # the call succeeds
     alignment.sample()
     # ensure length correct
     sample = alignment.sample(10)
     self.assertEqual(len(sample), 10)
     # test columns alignment preserved
     # values() is a non-indexable view on Python 3, so materialise it
     seqs = list(sample.todict().values())
     self.assertEqual(seqs[0], seqs[1])
     # ensure each char occurs once as sampling without replacement
     for char in seqs[0]:
         self.assertEqual(seqs[0].count(char), 1)
Exemplo n.º 50
0
def filter_samples(prefs, data, dir_path='', filename=None):
    """Filter the otus and the representative seq set, then write the results.

    `data` must provide 'aln' and 'otus'. Three files are written, each
    named '<dir_path>/<filename>' plus a suffix:

    * '_sfiltered_otus.txt' - one tab-separated line per filtered otu
    * '_sremoved.fasta'     - sequences removed from the representative set
    * '_sfiltered.fasta'    - sequences that remain

    Raises ValueError when either the removed or the remaining sequences
    cannot be loaded by LoadSeqs. The otus file has already been written
    by then.
    """
    aln = data['aln']
    otus = data['otus']

    # filter the otus file based on which samples to remove
    new_otus_list = filter_otus(otus, prefs)

    # NOTE: paths are built with a literal '/', so the default dir_path=''
    # yields a path starting at the filesystem root.
    filtered_otus_output_filepath = '%s/%s_sfiltered_otus.txt' \
                                    % (dir_path, filename)

    # Write out a new otus file; `with` closes it even if a write fails
    with open(filtered_otus_output_filepath, 'w') as otus_file:
        for key in new_otus_list:
            fields = [key[0]] + [str(j) for j in key[1]]
            otus_file.write('\t'.join(fields) + '\n')

    # filter seq set
    filtered_seqs, removed_seqs = filter_aln_by_otus(aln, prefs)

    # write a fasta containing list of sequences removed from
    # representative set
    try:
        removed_seqs = LoadSeqs(data=removed_seqs, aligned=False)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not turned into a ValueError
        raise ValueError(
            'No sequences were removed.  Did you specify the correct Sample ID?')
    output_filepath2 = '%s/%s_sremoved.fasta' % (dir_path, filename)
    with open(output_filepath2, 'w') as removed_file:
        removed_file.write(removed_seqs.toFasta())

    # write a fasta containing the filtered representative seqs
    try:
        filtered_seqs = LoadSeqs(data=filtered_seqs, aligned=False)
    except Exception:
        raise ValueError(
            'No sequences were remaining in the fasta file.  Did you remove all Sample ID\'s?')

    output_filepath = '%s/%s_sfiltered.fasta' % (dir_path, filename)
    with open(output_filepath, 'w') as filtered_file:
        filtered_file.write(filtered_seqs.toFasta())
Exemplo n.º 51
0
def fileToFrequency(filePath):
    """Return the normalised position frequencies of a protein alignment as text.

    The alignment at `filePath` is loaded with the PROTEIN moltype, its
    position frequencies are normalised and pretty-printed as
    comma-separated text. For every row after the header, each cell whose
    value is greater than 0.01 and whose column header is neither '-' nor
    'X' is emitted as '<cell>,<header>' followed by a tab. Each row ends
    with a newline.
    """
    aln = LoadSeqs(filePath, moltype=PROTEIN)
    pf = aln.getPosFreqs()
    pf.normalizePositions()
    # list of the text rows, one per line
    lines = pf.prettyPrint(include_header=True, col_sep=',').split('\n')
    # list of the header row's cells
    header_line = lines[0].split(',')

    rows = []
    for line in lines[1:]:
        kept = []
        for i, cell in enumerate(line.split(',')):
            # 1: mutation-rate threshold, 2: column is not '-',
            # 3: column is not 'X'. The value is tested first so the
            # header is only looked up for cells that pass the threshold.
            if not float(cell.strip()) > 0.01:
                continue
            symbol = header_line[i].strip()
            if symbol != '-' and symbol != 'X':
                kept.append(cell + ',' + symbol + '\t')
        rows.append(''.join(kept) + '\n')
    return ''.join(rows)
Exemplo n.º 52
0
def remove_duplicates(seqsin):
    '''Takes in LoadSeqs loadable sequences, removes duplicate sequences
    and returns a list of unique sequence tuples, formatted (sequence, count)
    sorted most abundant to least abundant'''

    parsable_seqs = LoadSeqs(data=seqsin, aligned=False)
    counts = {}
    # headers are irrelevant here; only the sequence text is counted
    for _header, seq in parsable_seqs.items():
        text = str(seq)
        counts[text] = counts.get(text, 0) + 1
    # sorted() works on Python 2 and 3, whereas dict.items() has no .sort()
    # on Python 3; the sort is stable, so ties keep their dict order
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
Exemplo n.º 53
0
 def test_slidingWindows(self):
     """slidingWindows(5, step) yields the expected slices for steps 2 and 1"""
     names = ('seq1', 'seq2', 'seq3')
     alignment = LoadSeqs(data = dict.fromkeys(names, 'ACGTACGT'))

     def all_seqs(window_text):
         # every sequence is identical, so each window maps all names to it
         return dict.fromkeys(names, window_text)

     # step of 2: only the first two windows are checked
     windows = list(alignment.slidingWindows(5,2))
     for index, window_text in enumerate(['ACGTA', 'GTACG']):
         self.assertEqual(windows[index].todict(), all_seqs(window_text))

     # step of 1: the first four windows are checked
     windows = list(alignment.slidingWindows(5,1))
     for index, window_text in enumerate(['ACGTA', 'CGTAC', 'GTACG', 'TACGT']):
         self.assertEqual(windows[index].todict(), all_seqs(window_text))
Exemplo n.º 54
0
    def test_call_pynast_test1_file_output(self):
        """PyNastAligner writes the expected result and failure files
        """
        # results go to files, so the call itself should return None
        output_paths = dict(result_path=self.result_fp,
                            log_path=self.log_fp,
                            failure_path=self.failure_fp)
        returned = self.pynast_test1_aligner(self.pynast_test1_input_fp,
                                             **output_paths)
        self.assertTrue(returned is None,
                        "Result should be None when result path provided.")

        # aligned output is compared as an alignment object
        written_aln = LoadSeqs(self.result_fp, aligned=DenseAlignment)
        self.assertEqual(written_aln, self.pynast_test1_expected_aln)

        # failures are compared through their fasta text
        written_fail = LoadSeqs(self.failure_fp, aligned=False)
        expected_fail_fasta = self.pynast_test1_expected_fail.toFasta()
        self.assertEqual(written_fail.toFasta(), expected_fail_fasta)
Exemplo n.º 55
0
 def test_hasTerminalStops(self):
     """hasTerminalStops gives the expected truth value for each data set"""
     # (sequence data, extra LoadSeqs keyword arguments, expected result)
     cases = [
         # seq collections
         ({'seq1': 'ACGTAA', 'seq2': 'ACG', 'seq3': 'ACGCGT'},
          {'aligned': False}, True),
         ({'seq1': 'ACGTAC', 'seq2': 'ACGACG', 'seq3': 'ACGCGT'},
          {'aligned': False}, False),
         # alignments
         ({'seq1': 'ACGTAA', 'seq2': 'ACGCAA', 'seq3': 'ACGCGT'},
          {}, True),
         ({'seq1': 'ACGTAA', 'seq2': 'ACGTAG', 'seq3': 'ACGTGA'},
          {}, True),
         ({'seq1': 'ACGCAA', 'seq2': 'ACGCAA', 'seq3': 'ACGCGT'},
          {}, False),
     ]
     for seq_data, load_options, expected in cases:
         loaded = LoadSeqs(data = seq_data, moltype = DNA, **load_options)
         assert loaded.hasTerminalStops() == expected
Exemplo n.º 56
0
 def setUp(self):
     """Build gapped sequences, their gap maps, slices and per-name lookups."""
     self.cigar_text = '3D2M3D6MDM2D3MD'
     self.slices = [(1, 4), (0, 8), (7, 12), (0, 1), (3, 5)]

     gapped_a = DNA.makeSequence('---AA---GCTTAG-A--CCT-')
     gapped_b = DNA.makeSequence('CCAAAAAA---TAGT-GGC--G')
     self.aln_seq, self.aln_seq1 = gapped_a, gapped_b

     # parseOutGaps returns a pair that is stored as (map, seq)
     self.map, self.seq = gapped_a.parseOutGaps()
     self.map1, self.seq1 = gapped_b.parseOutGaps()

     by_name = {"FAKE01": gapped_a, "FAKE02": gapped_b}
     self.aln = LoadSeqs(data = by_name)
     # FAKE01 uses the literal cigar text, FAKE02 one derived from its map
     self.cigars = {"FAKE01": self.cigar_text,
                    "FAKE02": map_to_cigar(self.map1)}
     self.seqs = {"FAKE01": str(self.seq), "FAKE02": str(self.seq1)}
Exemplo n.º 57
0
 def setUp(self):
     """Create the substitution model, alignment and tree fixtures."""
     def data_file(name):
         # all fixtures live under data_path
         return os.path.join(data_path, name)

     self.submodel = Nucleotide(do_scaling=True,
                                model_gaps=False,
                                equal_motif_probs=True,
                                predicates = {'beta': 'transition'})

     # the alignment is loaded with the model's own moltype
     self.data = LoadSeqs(filename = data_file('brca1_5.paml'),
                          moltype = self.submodel.MolType)

     self.tree = LoadTree(filename = data_file('brca1_5.tree'))
Exemplo n.º 58
0
    def test_call_pynast_test1_file_output_alt_params(self):
        """PyNastAligner writes an empty result file when no seqs align
        """
        # min_len of 1000 is meant to make every input sequence fail
        aligner_params = {
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 1000}
        aligner = PyNastAligner(aligner_params)

        returned = aligner(self.pynast_test1_input_fp,
                           result_path=self.result_fp,
                           log_path=self.log_fp,
                           failure_path=self.failure_fp)
        self.assertTrue(returned is None,
                        "Result should be None when result path provided.")

        result_size = getsize(self.result_fp)
        self.assertEqual(result_size, 0,
                         "No alignable seqs should result in an empty file.")

        # all seqs reported to fail
        failed = LoadSeqs(self.failure_fp, aligned=False)
        self.assertEqual(failed.getNumSeqs(), 3)
Exemplo n.º 59
0
def cluster_seqs(seqspath, simm, folderout='/tmp', gapopen=None, gapext=None):
    """Cluster the sequences in `seqspath` with uclust and group them by cluster.

    `simm` is converted to float and passed as uclust's '--id'. `gapopen`
    and `gapext` are forwarded as '--gapopen' / '--gapext' only when given.
    'clusters.uc' and 'clusters.log' are placed in `folderout`. An empty
    `folderout` now resolves to the current directory instead of raising
    IndexError.

    Returns a dict mapping 'cluster_<n>' to a list of
    (full header, sequence) tuples.
    """
    # local import: the file's import block is not visible from this block
    import os

    params = {
        '--usersort': True,
        '--id': float(simm),
        '--maxaccepts': 20,
        '--maxrejects': 500,
        '--stepwords': 20,
        '--hsp': 0,
        '--match': 1,
        '--mismatch': -1
    }
    if gapopen is not None:
        params['--gapopen'] = gapopen
    if gapext is not None:
        params['--gapext'] = gapext
    uclust = Uclust(params, WorkingDir='/tmp')

    # os.path.join adds the separator only when folderout lacks one
    input_data = {
        '--input': seqspath,
        '--uc': os.path.join(folderout, "clusters.uc"),
        '--log': os.path.join(folderout, "clusters.log")
    }
    result = uclust(input_data)
    # failures and new seeds are returned by the parser but not used here
    clusters, _failures, _newseeds = clusters_from_uc_file(
        result['ClusterFile'])

    seqs = LoadSeqs(seqspath, aligned=False)

    # create dictionary to convert shortened headers (first
    # whitespace-delimited token) to full headers
    convheader = {}
    for header in seqs.getSeqNames():
        convheader[header.split()[0]] = header

    # match headers in each cluster to seqs to create cluster tuples list
    clusterseqs = {}
    for num, cluster in enumerate(clusters):
        members = []
        for header in clusters[cluster]:
            full_header = convheader[header]
            members.append((full_header, seqs.getSeq(full_header)))
        clusterseqs["cluster_" + str(num)] = members

    return clusterseqs