Пример #1
0
    def test_get_representatives(self):
        """Check the records produced by ``get_representatives``.

        The fixture mapping and sequences are run through
        ``get_representatives`` and the concatenated FASTA rendering of the
        returned records is compared with the expected text.  A second,
        inline example checks that only labels used as mapping keys come
        back.
        """
        expected_fasta = (
            ">1: 5\nABABABA\n"
            ">3: 1\nBABA\n"
            ">4: 1\nABABAA\n"
            ">8: 2\nBABBA\n"
        )
        representatives = list(
            get_representatives(self.mapping, self.data.iteritems()))
        observed_fasta = "".join(
            record.to_fasta() for record in representatives)

        self.assertEqual(observed_fasta, expected_fasta)

        # another example: 'a' only appears as a cluster member, never as a
        # mapping key, so it must not be returned
        cluster_map = {'1': ('a', 'b', 'c'), '2': ('d', 'e', 'f')}
        labelled_seqs = [('1', "ACGT"), ('2', "TAGC"), ('a', "TTTTT")]

        observed = list(get_representatives(cluster_map, labelled_seqs))
        expected = [
            BiologicalSequence("ACGT", id="1"),
            BiologicalSequence("TAGC", id='2'),
        ]
        self.assertEqual(observed, expected)
Пример #2
0
def _compute_substitution_score(aln1_chars, aln2_chars, substitution_matrix,
                                gap_substitution_score):
    substitution_score = 0
    for aln1_char, aln2_char in product(aln1_chars, aln2_chars):
        if BiologicalSequence.is_gap(aln1_char) or\
           BiologicalSequence.is_gap(aln2_char):
                substitution_score += gap_substitution_score
        else:
            try:
                substitution_score += \
                    substitution_matrix[aln1_char][aln2_char]
            except KeyError:
                offending_chars = \
                    [c for c in (aln1_char, aln2_char)
                     if c not in substitution_matrix]
                raise ValueError(
                    "One of the sequences contains a character that is "
                    "not contained in the substitution matrix. Are you "
                    "using an appropriate substitution matrix for your "
                    "sequence type (e.g., a nucleotide substitution "
                    "matrix does not make sense for aligning protein "
                    "sequences)? Does your sequence contain invalid "
                    "characters? The offending character(s) is: "
                    " %s." % ', '.join(offending_chars))
    substitution_score /= (len(aln1_chars) * len(aln2_chars))
    return substitution_score
Пример #3
0
def _compute_substitution_score(aln1_chars, aln2_chars, substitution_matrix,
                                gap_substitution_score):
    substitution_score = 0
    for aln1_char, aln2_char in product(aln1_chars, aln2_chars):
        if BiologicalSequence.is_gap(aln1_char) or\
           BiologicalSequence.is_gap(aln2_char):
                substitution_score += gap_substitution_score
        else:
            try:
                substitution_score += \
                    substitution_matrix[aln1_char][aln2_char]
            except KeyError:
                offending_chars = \
                    [c for c in (aln1_char, aln2_char)
                     if c not in substitution_matrix]
                raise ValueError(
                    "One of the sequences contains a character that is "
                    "not contained in the substitution matrix. Are you "
                    "using an appropriate substitution matrix for your "
                    "sequence type (e.g., a nucleotide substitution "
                    "matrix does not make sense for aligning protein "
                    "sequences)? Does your sequence contain invalid "
                    "characters? The offending character(s) is: "
                    " %s." % ', '.join(offending_chars))
    substitution_score /= (len(aln1_chars) * len(aln2_chars))
    return substitution_score
Пример #4
0
def MakeGeneraFastas(fin_taxonomy,fin_repset):
    """Sort representative sequences into one FASTA file per genus.

    Each representative sequence is looked up by its ID in the taxonomy file
    to find its genus and is written to ``g__<genus>_seqs.fasta`` in the
    current working directory.

    Parameters
    ----------
    fin_taxonomy : str
        Path to the taxonomy file.  Every line is split on a tab: the first
        field is the accession ID and the second the ``;``-separated
        lineage.  The second-to-last lineage field is used as the genus,
        with a leading ``g__`` removed.
    fin_repset : str
        Path to the representative sequence FASTA file, read with
        ``parse_fasta(..., ignore_comment=True)``.

    Returns
    -------
    tuple
        ``(repsetdic, taxdic, taxgendic, repsetIDlist, repgenlist,
        generaSeqIDdic)``: label -> sequence, accession ID -> lineage text,
        accession ID -> genus, the representative IDs, the unique genera of
        the representatives, and genus -> list of representative IDs.  The
        module-level globals of the same names are rebound to these objects.

    Raises
    ------
    KeyError
        If a representative sequence ID is missing from the taxonomy file.
    IndexError
        If a taxonomy line has no tab or fewer than two lineage fields.

    Notes
    -----
    After the genus files are written, every zero-byte regular file in the
    current working directory is deleted, not only files created here.
    """
    global repsetdic,taxdic,taxgendic,repsetIDlist,repgenlist,generaSeqIDdic
    repsetdic = {}
    taxdic = {}
    taxgendic = {}
    # Context managers close both inputs even if parsing raises.
    with open(fin_repset, "U") as repset_fh, \
            open(fin_taxonomy, "U") as taxonomy_fh:
        for label, seq in parse_fasta(repset_fh, ignore_comment=True):
            repsetdic[label] = seq
        for line in taxonomy_fh:
            fields = line.split("\t")
            accession_id = fields[0]
            taxonomyline = fields[1]
            genus = taxonomyline.split(";")[-2]
            if genus.startswith("g__"):
                genus = genus[3:]
            taxgendic[accession_id] = genus
            taxdic[accession_id] = taxonomyline
    repsetIDlist = repsetdic.keys()

    # Collect the unique genera (first-seen order) and give each an empty
    # ID list; the dict doubles as the "already seen" set.
    repgenlist = []
    generaSeqIDdic = {}
    for seq_id in repsetIDlist:
        genus = taxgendic[seq_id]
        if genus not in generaSeqIDdic:
            generaSeqIDdic[genus] = []
            repgenlist.append(genus)

    # Every ID that is in repsetdic had its genus registered above, so the
    # lookup below cannot fail and needs no exception handler.
    for seq_id in taxgendic:
        if seq_id in repsetdic:
            generaSeqIDdic[taxgendic[seq_id]].append(seq_id)

    from skbio.sequence import BiologicalSequence
    for genus, seq_ids in generaSeqIDdic.items():
        with open("g__" + genus + "_seqs.fasta", "w") as fout:
            for seq_id in seq_ids:
                record = BiologicalSequence(repsetdic[seq_id], id=seq_id)
                fout.write(record.to_fasta(terminal_character=""))
                fout.write("\n")

    # NOTE(review): this removes *every* empty file in the working
    # directory, including ones this function did not create.
    cwd = os.getcwd()
    for entry in os.listdir(cwd):
        # only regular files: os.remove() fails on a zero-size directory
        if os.path.isfile(entry) and os.path.getsize(entry) < 1:
            os.remove(entry)
    return repsetdic,taxdic,taxgendic,repsetIDlist,repgenlist,generaSeqIDdic
Пример #5
0
 def test_filter_gap_high_entropy_low(self):
     """Filter the gapped fixture and expect four identical 'A-' rows.

     ``filter_positions`` is called with the fixture alignment and the
     fixture thresholds ``maximum_gap_frequency_100`` and
     ``maximum_position_entropy_10``; the result must equal an alignment
     of seq1..seq4, each reading 'A-'.
     """
     observed = filter_positions(self.alignment_with_gaps,
                                 self.maximum_gap_frequency_100,
                                 self.maximum_position_entropy_10)
     expected = Alignment([
         BiologicalSequence('A-', id="seq%d" % number)
         for number in range(1, 5)
     ])
     self.assertEqual(observed, expected)
Пример #6
0
def get_representatives(mapping, seqs):
    """Yield an upper-cased sequence record for each label in ``mapping``.

    mapping: The prefix mapping dict; only its keys are treated as
             representatives, and ``len(mapping[label]) + 1`` is written
             into the record id as ``"<label>: <count>"``

    seqs: An iterable of (label, sequence) pairs; pairs whose label is not
          a key of ``mapping`` are skipped
    """
    for label, raw_seq in seqs:
        if label not in mapping:
            continue
        cluster_size = len(mapping[label]) + 1
        record = BiologicalSequence(
            raw_seq, id="%s: %d" % (label, cluster_size))
        yield record.upper()
Пример #7
0
def _clustal_to_alignment(fh, strict=True):
    r"""Build an ``Alignment`` from an open Clustal file.

    Parameters
    ----------

    fh : open file object
        An open Clustal file.
    strict : boolean
        Passed through to ``_label_line_parser``; whether or not to raise a
        ``ClustalFormatError`` when no labels are found.

    Returns
    -------
    skbio.Alignment
        Alignment object containing the aligned biological sequences, in
        the order of the labels returned by ``_label_line_parser``.

    Raises
    ------
        skbio.util.exception.ClustalFormatError
            If ``_check_length`` reports that the parsed sequences are not
            aligned properly (sequences of differing length, or ids that
            do not match up with their subsequences).
    Notes
    -----

    Only lines accepted by ``_is_clustal_seq_line`` are parsed (lines that
    start with a blank are skipped).

    ``_clustal_to_alignment`` preserves the order of the sequences from the
    original file.  However, it does use a dict as an intermediate, so
    two sequences can't have the same label. This is probably OK since
    Clustal will refuse to run on a FASTA file in which two sequences have
    the same label, but could potentially cause trouble with manually
    edited files (all the segments of the conflicting sequences would
    be interleaved, possibly in an unpredictable way).

    If the lines have trailing numbers (i.e. Clustal was run with
    `-LINENOS=ON`), silently deletes them. Does not check that the numbers
    actually correspond to the number of chars in the sequence printed so far.

    References
    ----------
    .. [1] Thompson JD, Higgins DG, Gibson TJ,  "CLUSTAL W: improving the
        sensitivity of progressive multiple sequence alignment through sequence
        weighting, position-specific gap penalties and weight matrix choice.
        Thompson", Nucleic Acids Res. 1994 Nov 11;22(22):4673-80.

    """

    seq_lines = filter(_is_clustal_seq_line, fh)
    records = map(_delete_trailing_number, seq_lines)
    data, labels = _label_line_parser(records, last_space, strict)

    if not _check_length(data, labels):
        raise ClustalFormatError("Sequences not aligned properly")

    # each label maps to a list of segments that are joined into one string
    return Alignment([
        BiologicalSequence(id=label, sequence=''.join(data[label]))
        for label in labels
    ])
Пример #8
0
def write_Fasta_from_name_seq_pairs(name_seqs, fh):
    """Write each (name, seq) pair to ``fh`` as a FASTA record.

    name_seqs: iterable of (name, seq) pairs such as from parse_fasta
    fh: an open filehandle; a ValueError is raised if it is None

    Every record is the ``to_fasta()`` text of the sequence followed by a
    newline.
    """
    if fh is None:
        raise ValueError("Need open file handle to write to.")

    for name, seq in name_seqs:
        record = BiologicalSequence(seq, id=name)
        fh.write("%s\n" % record.to_fasta())
Пример #9
0
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    Writes two files into ``out_dir``: ``denoised_otu_map.txt`` (the
    combined OTU map, with every id replaced by its sample id) and
    ``denoised_all.fasta`` (the denoised sequences relabelled as
    ``"<sample_id> <id>"``).  Both files are closed before returning.

    fasta_fh: a fasta file with labels as produced by Qiime's split_libraries.py
             used to replace flowgram id with the unique se_sample_id

    mapping_fh: The cluster mapping from the denoiser.py

    denoised_seqs_fh: the Fasta output files from denoiser.py

    otu_picker_otu_map_fh: cluster map from otu picker on denoised_seqs_fh

    out_dir: output directory

    If an id from the OTU map has no entry in the sample id mapping, a
    message is printed and the interpreter exits.
    """

    # read in mapping from split_library file
    labels = imap(lambda a_b: a_b[0], parse_fasta(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)
    get_sample_id = sample_id_mapping.get

    denoiser_mapping = read_denoiser_mapping(mapping_fh)
    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    with open(out_dir + "/denoised_otu_map.txt", "w") as otu_fh:
        for otu_line in otu_picker_otu_map_fh:
            otu_split = otu_line.split()

            otu = otu_split[0]
            ids = otu_split[1:]

            # concat lists
            # make sure the biggest one is first for pick_repr
            all_ids = sort_ids(ids, denoiser_mapping)
            for seq_id in ids:
                # extend in place; sum(lists, []) re-copies on every step
                all_ids.extend(denoiser_mapping[seq_id])
            try:
                otu_fh.write("%s\t" % otu +
                             "\t".join(map(get_sample_id, all_ids)) + "\n")
            except TypeError:
                # get returns None if denoiser_mapping id not present in
                # sample_id_mapping, which makes the join fail
                print("Found id in denoiser output, which was not found in split_libraries "
                      "output FASTA file. Wrong file?")
                exit()

    with open(out_dir + "/denoised_all.fasta", "w") as fasta_out_fh:
        for label, seq in parse_fasta(denoised_seqs_fh):
            seq_id = label.split()[0]
            newlabel = "%s %s" % (sample_id_mapping[seq_id], seq_id)
            fasta_out_fh.write(
                BiologicalSequence(seq, id=newlabel).to_fasta())
Пример #10
0
def _coerce_alignment_input_type(seq, disallow_alignment):
    """Return ``seq`` as an skbio.Alignment object.

    A string is wrapped in a ``BiologicalSequence`` and then in a
    one-sequence ``Alignment``; a ``BiologicalSequence`` is wrapped in a
    one-sequence ``Alignment``; an ``Alignment`` is returned unchanged
    unless ``disallow_alignment`` is true.

    Raises
    ------
    TypeError
        If ``seq`` is an ``Alignment`` while ``disallow_alignment`` is true,
        or if ``seq`` is of any other unsupported type.
    """
    if isinstance(seq, string_types):
        return Alignment([BiologicalSequence(seq)])

    if isinstance(seq, BiologicalSequence):
        return Alignment([seq])

    if not isinstance(seq, Alignment):
        raise TypeError(
            "Unsupported type provided to aligner: %r." % type(seq))

    if disallow_alignment:
        # This will disallow aligning either a pair of alignments, or an
        # alignment and a sequence. We don't currently support this for
        # local alignment as there is not a clear usecase, and it's also
        # not exactly clear how this would work.
        raise TypeError("Aligning alignments is not currently supported "
                        "with the aligner function that you're calling.")

    return seq
Пример #11
0
def pick_otus(file_path):
    """Prepare OTU picking for the FASTA file at ``file_path``.

    The lines shown here only derive ``outdir`` (a ``uclust`` directory
    next to ``file_path``).  The FASTA re-formatting branch below is
    switched off with ``if False`` and never executes.

    NOTE(review): the function appears to continue beyond this excerpt
    (``outdir`` is not used in the lines shown) -- confirm against the full
    file.
    """
    outdir = os.path.join(os.path.dirname(file_path), 'uclust')
    # Disabled branch: make the FASTA ids compatible with qiime (reported as
    # not working, so the user is assumed to provide a compatible file).
    if False:
        # scikit-bio is used for FASTA I/O (it comes with qiime)
        import skbio
        from skbio.sequence import BiologicalSequence
        print "Preprocessing FASTA " + file_path
        # the re-formatted copy is written to "<root>_1<ext>"
        file_path1 = '%s_1%s' % tuple(os.path.splitext(file_path))
        outfile = open(file_path1, "w")  # re-formatted fasta output
        fastafile = skbio.read(file_path, format='fasta')
        print "Reading " + file_path
        print "File handle: " + str(fastafile)
        for seqcount, rec in enumerate(fastafile):
            # NOTE(review): int + str; this would raise TypeError if the
            # branch were ever enabled
            print seqcount + rec.__repr__()
            try:
                # the id adheres to qiime's expected format
                # <sample_id>_<seq_counter> when the second '_'-separated
                # field parses as an integer
                int(
                    rec.id.split('_')[1]
                )
                # write down the record as is
                skbio.write(rec, 'fasta',
                            outfile)
            # else: enforce an id format compatible with qiime's otu picker.
            # NOTE(review): in Python 2 this form catches only ValueError
            # and binds it to the name IndexError; catching both needs
            # ``except (ValueError, IndexError):``
            except ValueError, IndexError:
                rec1 = BiologicalSequence(rec.sequence, "User_%05d" % seqcount)
                skbio.write(rec1, 'fasta', outfile)
        outfile.close()
        # continue with the re-formatted copy
        file_path = file_path1
Пример #12
0
def _traceback(traceback_matrix, score_matrix, aln1, aln2, start_row,
               start_col, gap_character='-'):
    """Follow the traceback matrix from a start cell and rebuild alignments.

    Parameters
    ----------
    traceback_matrix : 2D indexable
        Read as ``traceback_matrix[row, col]``; every visited cell must hold
        one of the ``_traceback_encoding`` values.
    score_matrix : 2D indexable
        Only ``score_matrix[start_row, start_col]`` is read, as the score
        returned to the caller.
    aln1, aln2 : Alignment
        Input alignments.  Columns of the matrices index positions of
        ``aln1`` and rows index positions of ``aln2``.
    start_row, start_col : int
        Cell at which the traceback begins.
    gap_character : str, optional
        Character inserted wherever a gap is introduced into either
        alignment.

    Returns
    -------
    tuple
        ``(aligned_seqs1, aligned_seqs2, best_score, current_col,
        current_row)``: two lists of ``BiologicalSequence`` objects, the
        score at the start cell, and the column and row at which the
        traceback stopped.

    Raises
    ------
    ValueError
        If a visited cell holds a value that is not a known traceback code.
    """
    # cache the traceback codes for simpler comparisons inside the loop
    aend = _traceback_encoding['alignment-end']
    match = _traceback_encoding['match']
    vgap = _traceback_encoding['vertical-gap']
    hgap = _traceback_encoding['horizontal-gap']

    # initialize the result alignments: one character list per input seq
    aln1_sequence_count = aln1.sequence_count()
    aligned_seqs1 = [[] for e in range(aln1_sequence_count)]

    aln2_sequence_count = aln2.sequence_count()
    aligned_seqs2 = [[] for e in range(aln2_sequence_count)]

    current_row = start_row
    current_col = start_col

    best_score = score_matrix[current_row, current_col]
    current_value = None

    while current_value != aend:
        current_value = traceback_matrix[current_row, current_col]

        if current_value == match:
            for aligned_seq, input_seq in zip(aligned_seqs1, aln1):
                aligned_seq.append(str(input_seq[current_col-1]))
            for aligned_seq, input_seq in zip(aligned_seqs2, aln2):
                aligned_seq.append(str(input_seq[current_row-1]))
            current_row -= 1
            current_col -= 1
        elif current_value == vgap:
            # consume a row: gap in aln1, character from aln2
            for aligned_seq in aligned_seqs1:
                aligned_seq.append(gap_character)
            for aligned_seq, input_seq in zip(aligned_seqs2, aln2):
                aligned_seq.append(str(input_seq[current_row-1]))
            current_row -= 1
        elif current_value == hgap:
            # consume a column: character from aln1, gap in aln2
            for aligned_seq, input_seq in zip(aligned_seqs1, aln1):
                aligned_seq.append(str(input_seq[current_col-1]))
            for aligned_seq in aligned_seqs2:
                aligned_seq.append(gap_character)
            current_col -= 1
        elif current_value == aend:
            continue
        else:
            raise ValueError(
                "Invalid value in traceback matrix: %s" % current_value)

    # characters were collected end-to-start, so reverse before joining
    for i in range(aln1_sequence_count):
        aligned_seq = ''.join(aligned_seqs1[i][::-1])
        seq_id = _get_seq_id(aln1[i], str(i))
        aligned_seqs1[i] = BiologicalSequence(aligned_seq, id=seq_id)

    for i in range(aln2_sequence_count):
        aligned_seq = ''.join(aligned_seqs2[i][::-1])
        # fallback ids for aln2 continue numbering after those of aln1
        seq_id = _get_seq_id(aln2[i], str(i + aln1_sequence_count))
        aligned_seqs2[i] = BiologicalSequence(aligned_seq, id=seq_id)

    return (aligned_seqs1, aligned_seqs2, best_score,
            current_col, current_row)
def make_genera_fastas(fin_taxonomy, fin_repset):
    """Sort ITS representative sequences into one FASTA file per genus.

    Each representative sequence is looked up by its accession ID in the
    taxonomy file to find its genus and is written to
    ``g__<genus>_seqs.fasta`` in the current working directory.  This allows
    OTUs to be compared to other OTUs from the same genus.

    Parameters
    ----------
    fin_taxonomy : str
        Path to the taxonomy file.  Every line is split on a tab: the first
        field is the accession ID and the second the ``;``-separated
        lineage.  The second-to-last lineage field is used as the genus,
        with a leading ``g__`` removed.
    fin_repset : str
        Path to the representative sequence FASTA file, read with
        ``parse_fasta(..., ignore_comment=True)``.

    Returns
    -------
    list
        ``repgenlist``: the unique genera of the representative sequences.
        The module-level global of the same name is rebound to this list.

    Raises
    ------
    KeyError
        If a representative sequence ID is missing from the taxonomy file.
    IndexError
        If a taxonomy line has no tab or fewer than two lineage fields.

    Notes
    -----
    After the genus files are written, every zero-byte regular file in the
    current working directory is deleted, not only files created here.

    Examples
    --------
    Input is a representative sequence fasta file where each sequence
    corresponds to one representative for all of the OTUs in each
    cluster.  Each sequence has an accession ID that corresponds to
    one sequence in the Unite database.

    Example of one representative fasta sequence from the input
    fasta file:


    >>AB015922 Some_comment_ie_sample_location
    CAGAGCCAAGAGATCCGTTGTTGAAAGTTTTTTCAATTCAAGAATAAAACTTAGACTGCAAAG
    ACAACATGAGTTTGGTTTGGGTCTTTGGCGGACACGCTCCAGCCGAAGCCGGTGGGCGGCCGA
    CGCCAGTCCTCACGAACAGCGCCGACGTAGCCCGGCCCGCCAAAGCAACAAGATATAAATCGA
    CACGGGTGGGAGGGTCGACCCAGCACGC


    Example of a taxonomy line:

    AY880934 k__Fungi;p__Basidiomycota;c__Agaricomycetes;
    o__Thelephorales;f__Thelephoraceae;g__Thelephora;
    s__Thelephora_terrestris

    """
    global repgenlist
    repsetdic = {}
    taxgendic = {}
    # Context managers close both inputs even if parsing raises.
    with open(fin_repset, "U") as repset_fh, \
            open(fin_taxonomy, "U") as taxonomy_fh:
        for label, seq in parse_fasta(repset_fh, ignore_comment=True):
            repsetdic[label] = seq
        for line in taxonomy_fh:
            fields = line.split("\t")
            accession_id = fields[0]
            genus = fields[1].split(";")[-2]
            if genus.startswith("g__"):
                genus = genus[3:]
            taxgendic[accession_id] = genus

    # Collect the unique genera (first-seen order) and give each an empty
    # ID list; the dict doubles as the "already seen" set.
    repgenlist = []
    generaSeqIDdic = {}
    for seq_id in repsetdic:
        genus = taxgendic[seq_id]
        if genus not in generaSeqIDdic:
            generaSeqIDdic[genus] = []
            repgenlist.append(genus)

    # Every ID that is in repsetdic had its genus registered above, so the
    # lookup below cannot fail and needs no exception handler.
    for seq_id in taxgendic:
        if seq_id in repsetdic:
            generaSeqIDdic[taxgendic[seq_id]].append(seq_id)

    from skbio.sequence import BiologicalSequence
    for genus, seq_ids in generaSeqIDdic.items():
        with open("g__" + genus + "_seqs.fasta", "w") as fout:
            for seq_id in seq_ids:
                record = BiologicalSequence(repsetdic[seq_id], id=seq_id)
                fout.write(record.to_fasta(terminal_character=""))
                fout.write("\n")

    # The genus files are written relative to the working directory, so the
    # cleanup must scan that same directory.
    # NOTE(review): this removes *every* empty file found there, including
    # ones this function did not create.
    for entry in os.listdir(os.getcwd()):
        # only regular files: os.remove() fails on a zero-size directory
        if os.path.isfile(entry) and os.path.getsize(entry) < 1:
            os.remove(entry)
    return repgenlist
Пример #14
0
def fasta_from_alignment(aln, make_seqlabel=None, line_wrap=None, sort=True):
    """Return a FASTA string for an alignment or an id -> sequence dict.

    .. note:: Deprecated in scikit-bio 0.2.0-dev
       ``fasta_from_alignment`` will be removed in scikit-bio 0.3.0. It is
       replaced by ``write``, which is a more general method for serializing
       FASTA-formatted files. ``write`` supports multiple file formats by
       taking advantage of scikit-bio's I/O registry system. See
       :mod:`skbio.io` for more details.

    Parameters
    ----------
    aln : Alignment, dict
        alignment or dictionary where the keys are the sequence ids and
        the values are the sequences themselves. ``str`` values are wrapped
        in a ``BiologicalSequence`` that uses the key as its id.
    make_seqlabel : function, optional
        callback function that takes the seq object and returns a label
        ``str``; handed on unchanged to ``fasta_from_sequences``. If
        ``None`` is passed, the following attributes will try to be
        retrieved in this order and the first to exist will be used:
        ``id``, ``Label`` or ``Name``. In any other case an integer
        with the position of the sequence object will be used.
    line_wrap : int, optional
        an integer for maximum line width; handed on unchanged to
        ``fasta_from_sequences``. If ``None`` is passed the full sequence
        will be used.
    sort : bool, optional
        Whether or not the sequences should be sorted by their sequence
        id, default value is ``True``.

    Returns
    -------
    str
        FASTA formatted string composed of the sequences in `aln`.

    See Also
    --------
    skbio.parse.sequences.parse_fasta
    skbio.alignment.Alignment

    Examples
    --------
    Formatting a sequence alignment object into a FASTA file.

    >>> from skbio.alignment import Alignment
    >>> from skbio.sequence import DNA
    >>> from skbio.format.sequences import fasta_from_alignment
    >>> seqs = [DNA("ACC--G-GGTA..", id="seq1"),
    ...         DNA("TCC--G-GGCA..", id="seqs2")]
    >>> a1 = Alignment(seqs)
    >>> print fasta_from_alignment(a1)
    >seq1
    ACC--G-GGTA..
    >seqs2
    TCC--G-GGCA..

    """
    warnings.warn(
        "`fasta_from_alignment` is deprecated and will be removed in "
        "scikit-bio 0.3.0. Please update your code to use `skbio.io.write` "
        "or `skbio.Alignment.write`.", DeprecationWarning)

    # an Alignment exposes its labels via ids(), a dictionary via keys()
    order = aln.ids() if isinstance(aln, Alignment) else aln.keys()
    if sort:
        order = sorted(order)

    def _as_sequence(label):
        # Plain strings are promoted to sequence objects labelled by key.
        seq = aln[label]
        return BiologicalSequence(seq, label) if isinstance(seq, str) else seq

    ordered_seqs = [_as_sequence(label) for label in order]
    return fasta_from_sequences(ordered_seqs,
                                make_seqlabel=make_seqlabel,
                                line_wrap=line_wrap)
def make_genera_fastas(fin_taxonomy,fin_repset):
    """Sort ITS representative sequences into one FASTA file per genus.

    Each representative sequence is looked up by its accession ID in the
    taxonomy file to find its genus and is written to
    ``g__<genus>_seqs.fasta`` in the current working directory.  This allows
    OTUs to be compared to other OTUs from the same genus.

    Parameters
    ----------
    fin_taxonomy : str
        Path to the taxonomy file.  Every line is split on a tab: the first
        field is the accession ID and the second the ``;``-separated
        lineage.  The second-to-last lineage field is used as the genus,
        with a leading ``g__`` removed.
    fin_repset : str
        Path to the representative sequence FASTA file, read with
        ``parse_fasta(..., ignore_comment=True)``.

    Returns
    -------
    list
        ``repgenlist``: the unique genera of the representative sequences.
        The module-level global of the same name is rebound to this list.

    Raises
    ------
    KeyError
        If a representative sequence ID is missing from the taxonomy file.
    IndexError
        If a taxonomy line has no tab or fewer than two lineage fields.

    Notes
    -----
    After the genus files are written, every zero-byte regular file in the
    current working directory is deleted, not only files created here.

    Examples
    --------
    Input is a representative sequence fasta file where each sequence
    corresponds to one representative for all of the OTUs in each
    cluster.  Each sequence has an accession ID that corresponds to
    one sequence in the Unite database.

    Example of one representative fasta sequence from the input
    fasta file:


    >>AB015922 Some_comment_ie_sample_location
    CAGAGCCAAGAGATCCGTTGTTGAAAGTTTTTTCAATTCAAGAATAAAACTTAGACTGCAAAG
    ACAACATGAGTTTGGTTTGGGTCTTTGGCGGACACGCTCCAGCCGAAGCCGGTGGGCGGCCGA
    CGCCAGTCCTCACGAACAGCGCCGACGTAGCCCGGCCCGCCAAAGCAACAAGATATAAATCGA
    CACGGGTGGGAGGGTCGACCCAGCACGC


    Example of a taxonomy line:

    AY880934 k__Fungi;p__Basidiomycota;c__Agaricomycetes;
    o__Thelephorales;f__Thelephoraceae;g__Thelephora;
    s__Thelephora_terrestris

    """
    global repgenlist
    repsetdic = {}
    taxgendic = {}
    # Context managers close both inputs even if parsing raises.
    with open(fin_repset, "U") as repset_fh, \
            open(fin_taxonomy, "U") as taxonomy_fh:
        for label, seq in parse_fasta(repset_fh, ignore_comment=True):
            repsetdic[label] = seq
        for line in taxonomy_fh:
            fields = line.split("\t")
            accession_id = fields[0]
            genus = fields[1].split(";")[-2]
            if genus.startswith("g__"):
                genus = genus[3:]
            taxgendic[accession_id] = genus

    # Collect the unique genera (first-seen order) and give each an empty
    # ID list; the dict doubles as the "already seen" set.
    repgenlist = []
    generaSeqIDdic = {}
    for seq_id in repsetdic:
        genus = taxgendic[seq_id]
        if genus not in generaSeqIDdic:
            generaSeqIDdic[genus] = []
            repgenlist.append(genus)

    # Every ID that is in repsetdic had its genus registered above, so the
    # lookup below cannot fail and needs no exception handler.
    for seq_id in taxgendic:
        if seq_id in repsetdic:
            generaSeqIDdic[taxgendic[seq_id]].append(seq_id)

    from skbio.sequence import BiologicalSequence
    for genus, seq_ids in generaSeqIDdic.items():
        with open("g__" + genus + "_seqs.fasta", "w") as fout:
            for seq_id in seq_ids:
                record = BiologicalSequence(repsetdic[seq_id], id=seq_id)
                fout.write(record.to_fasta(terminal_character=""))
                fout.write("\n")

    # The genus files are written relative to the working directory, so the
    # cleanup must scan that same directory.
    # NOTE(review): this removes *every* empty file found there, including
    # ones this function did not create.
    for entry in os.listdir(os.getcwd()):
        # only regular files: os.remove() fails on a zero-size directory
        if os.path.isfile(entry) and os.path.getsize(entry) < 1:
            os.remove(entry)
    return repgenlist
Пример #16
0
def fasta_from_alignment(aln, make_seqlabel=None, line_wrap=None, sort=True):
    """Return a FASTA string for an alignment or an id -> sequence dict.

    Parameters
    ----------
    aln : Alignment, dict
        alignment or dictionary where the keys are the sequence ids and
        the values are the sequences themselves. ``str`` values are wrapped
        in a ``BiologicalSequence`` that uses the key as its id.
    make_seqlabel : function, optional
        callback function that takes the seq object and returns a label
        ``str``; handed on unchanged to ``fasta_from_sequences``. If
        ``None`` is passed, the following attributes will try to be
        retrieved in this order and the first to exist will be used:
        ``id``, ``Label`` or ``Name``. In any other case an integer
        with the position of the sequence object will be used.
    line_wrap : int, optional
        an integer for maximum line width; handed on unchanged to
        ``fasta_from_sequences``. If ``None`` is passed the full sequence
        will be used.
    sort : bool, optional
        Whether or not the sequences should be sorted by their sequence
        id, default value is ``True``.

    Returns
    -------
    str
        FASTA formatted string composed of the sequences in `aln`.

    See Also
    --------
    skbio.parse.sequences.parse_fasta
    skbio.alignment.Alignment

    Examples
    --------
    Formatting a sequence alignment object into a FASTA file.

    >>> from skbio.alignment import Alignment
    >>> from skbio.sequence import DNA
    >>> from skbio.format.sequences import fasta_from_alignment
    >>> seqs = [DNA("ACC--G-GGTA..", id="seq1"),
    ...         DNA("TCC--G-GGCA..", id="seqs2")]
    >>> a1 = Alignment(seqs)
    >>> print fasta_from_alignment(a1)
    >seq1
    ACC--G-GGTA..
    >seqs2
    TCC--G-GGCA..

    """
    # an Alignment exposes its labels via ids(), a dictionary via keys()
    labels = aln.ids() if isinstance(aln, Alignment) else aln.keys()
    if sort:
        labels = sorted(labels)

    ordered_seqs = []
    for label in labels:
        entry = aln[label]
        # plain strings are promoted to sequence objects labelled by key
        ordered_seqs.append(
            BiologicalSequence(entry, label) if isinstance(entry, str)
            else entry)

    return fasta_from_sequences(ordered_seqs,
                                make_seqlabel=make_seqlabel,
                                line_wrap=line_wrap)
def hamming_distance(s1, s2):
    """Return ``BiologicalSequence(s1).distance(BiologicalSequence(s2))``.

    Both arguments are wrapped in ``BiologicalSequence`` first, so plain
    strings are accepted.
    NOTE(review): presumably ``distance`` defaults to the Hamming distance,
    as the function name suggests -- confirm against the skbio version used.
    """
    return BiologicalSequence(s1).distance(BiologicalSequence(s2))
def hamming_distance(s1, s2):
    """Compute the distance between two sequences via ``BiologicalSequence``.

    ``s1`` and ``s2`` are each wrapped in a ``BiologicalSequence`` and the
    result of the first one's ``distance`` method is returned.
    NOTE(review): presumably ``distance`` defaults to the Hamming distance,
    as the function name suggests -- confirm against the skbio version used.
    """
    first, second = BiologicalSequence(s1), BiologicalSequence(s2)
    return first.distance(second)