Exemplo n.º 1
0
    def test_append_proteins(self):
        """Append three protein sequences and check test_chars holds 7 items."""
        extra_proteins = (
            Seq.Seq("K", Alphabet.generic_protein),
            Seq.Seq("K-", Alphabet.Gapped(Alphabet.generic_protein, "-")),
            Seq.Seq("K@", Alphabet.Gapped(IUPAC.protein, "@")),
        )
        for protein_seq in extra_proteins:
            self.test_chars.append(protein_seq)

        self.assertEqual(7, len(self.test_chars))
Exemplo n.º 2
0
    def test_ungap(self):
        """Check ungap("-") on UnknownSeq objects that use a gapped DNA alphabet."""
        # Seven letters with the default character: expected text is all "N".
        unknown_dna = Seq.UnknownSeq(
            7, alphabet=Alphabet.Gapped(Alphabet.DNAAlphabet(), "-"))
        ungapped_text = str(unknown_dna.ungap("-"))
        self.assertEqual("NNNNNNN", ungapped_text)

        # Twenty letters whose character is the gap itself: expected to
        # compare equal to the empty string once ungapped.
        only_gaps = Seq.UnknownSeq(
            20,
            alphabet=Alphabet.Gapped(Alphabet.DNAAlphabet(), "-"),
            character='-')
        # The result is compared directly, without str().
        self.assertEqual("", only_gaps.ungap("-"))
Exemplo n.º 3
0
 def test_exception_when_added_protein_has_more_than_one_stop_codon_type(
         self):
     """Adding proteins whose alphabets use '@' and '*' stops raises ValueError."""
     at_stop_alphabet = Alphabet.HasStopCodon(
         Alphabet.Gapped(IUPAC.extended_protein, "-"), "@")
     star_stop_alphabet = Alphabet.Gapped(
         Alphabet.HasStopCodon(IUPAC.extended_protein, "*"), "-")
     at_stop_protein = Seq.Seq("MEDG-KRXR@", at_stop_alphabet)
     star_stop_protein = Seq.Seq("MEDG-KRXR*", star_stop_alphabet)
     self.assertRaises(ValueError,
                       lambda: at_stop_protein + star_stop_protein)
Exemplo n.º 4
0
    def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
        """Initialise the base Alignment and two empty text attributes.

        Arguments:
        o alphabet - Alphabet passed on to Alignment.__init__; defaults to
        a gapped ambiguous DNA alphabet.
        """
        Alignment.__init__(self, alphabet)

        # Both attributes start as empty strings.  _star_info is meant to
        # represent the row of stars found in the aln output format.
        self._version = ''
        self._star_info = ''
Exemplo n.º 5
0
def replace_stop_codons_with_gapps(aln_file, in_format="fasta", output=None):
    """
    replaces every in-frame stop codon (TAA, TAG, TGA) in an alignment by ---
    codons that contain a gap character are copied unchanged
    a stop codon that is the last codon of its sequence is replaced as well,
    but it is left out of the printed replacement count
    if output == None - rewrites on the input file
    the result is always written in fasta format
    :param aln_file: input alignment file path
    :param in_format: input format (default: fasta)
    :param output: output file path (default: None)
    """
    aln_file = check_filename(aln_file)
    output = (aln_file if output is None
              else check_filename(output, Truefile=False))
    gapped_dna = Alphabet.Gapped(IUPAC.unambiguous_dna)
    aln = AlignIO.read(aln_file, in_format, alphabet=gapped_dna)

    stop_codons = ["TAA", "TAG", "TGA"]
    replaced = 0
    for record in aln:
        seq_length = len(record.seq)
        rebuilt = ""
        for start in range(0, seq_length, 3):
            triplet = record.seq[start:start + 3]
            if "-" not in triplet and triplet in stop_codons:
                rebuilt += "---"
                # the final stop codon is replaced but not counted
                if seq_length - start != 3:
                    replaced += 1
            else:
                rebuilt += triplet
        record.seq = rebuilt
    SeqIO.write(aln, output, "fasta")
    print("%i replacments of stop codons to ---" % replaced)
Exemplo n.º 6
0
def remove_gapped_positions_codon(aln_file, output=None, in_format="fasta"):
    """
    removes codon positions (blocks of 3 columns) in which every column is
    made up only of gaps; a codon is kept as soon as one of its columns
    holds a non-gap character
    if output == None - rewrites on the input file
    the result is always written in fasta format
    :param aln_file: input alignment file path
    :param output: output file path (default: None)
    :param in_format: input format (default: fasta)
    :return: output file path
    """
    aln_file = check_filename(aln_file)
    if output is None:
        output = aln_file
    else:
        output = check_filename(output, Truefile=False)
    aln = AlignIO.read(aln_file,
                       in_format,
                       alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))
    new_aln = None
    for i in range(0, len(aln[0]), 3):
        codon_block = aln[:, i:i + 3]
        # codon_block[:, k] is column k as a string.  Indexing with a single
        # integer (codon_block[k]) returns ROW k instead, which is what the
        # previous check inspected by mistake (rows 0 and 2, never a column).
        # The width is read from the block because the last one may hold
        # fewer than 3 columns.
        width = codon_block.get_alignment_length()
        all_gapped = all(
            set(codon_block[:, k]) == {"-"} for k in range(width))
        if all_gapped:
            continue
        if new_aln is None:
            new_aln = codon_block
        else:
            new_aln = new_aln + codon_block

    if new_aln is None:
        # every codon was gaps only: keep the records, with empty sequences,
        # instead of handing None to the writer
        new_aln = aln[:, 0:0]
    AlignIO.write(new_aln, output, "fasta")
    return output
Exemplo n.º 7
0
    def test_read_fasta(self):
        path = os.path.join(os.curdir, "Quality", "example.fasta")
        alignment = AlignIO.read(path,
                                 "fasta",
                                 alphabet=Alphabet.Gapped(IUPAC.ambiguous_dna))
        self.assertEqual(len(alignment), 3)
        seq_record = alignment[0]
        self.assertEqual(seq_record.description, "EAS54_6_R1_2_1_413_324")
        self.assertEqual(seq_record.seq, "CCCTTCTTGTCTTCAGCGTTTCTCC")
        seq_record = alignment[1]
        self.assertEqual(seq_record.description, "EAS54_6_R1_2_1_540_792")
        self.assertEqual(seq_record.seq, "TTGGCAGGCCAAGGCCGATGGATCA")
        seq_record = alignment[2]
        self.assertEqual(seq_record.description, "EAS54_6_R1_2_1_443_348")
        self.assertEqual(seq_record.seq, "GTTGCTTCTGGCGTGGGTGGGGGGG")
        self.assertEqual(alignment.get_alignment_length(), 25)
        align_info = AlignInfo.SummaryInfo(alignment)
        consensus = align_info.dumb_consensus(ambiguous="N", threshold=0.6)
        self.assertIsInstance(consensus, Seq)
        self.assertEqual(consensus, "NTNGCNTNNNNNGNNGGNTGGNTCN")
        self.assertEqual(
            str(alignment), """\
Gapped(IUPACAmbiguousDNA(), '-') alignment with 3 rows and 25 columns
CCCTTCTTGTCTTCAGCGTTTCTCC EAS54_6_R1_2_1_413_324
TTGGCAGGCCAAGGCCGATGGATCA EAS54_6_R1_2_1_540_792
GTTGCTTCTGGCGTGGGTGGGGGGG EAS54_6_R1_2_1_443_348""")
Exemplo n.º 8
0
def parse_file(file_name, type = 'DNA'):
    """Parse the given file into a FastaAlignment object.

    Arguments:
    o file_name - The location of the file to parse.
    o type - The type of information contained in the file: 'DNA', 'RNA'
    or 'PROTEIN' (compared case-insensitively).

    Returns the FastaAlignment, with one sequence added per parsed record.

    Raises ValueError when type is not one of the supported values; this
    is checked before the file is opened.
    """
    seq_type = type.upper()
    if seq_type == 'DNA':
        alphabet = IUPAC.ambiguous_dna
    elif seq_type == 'RNA':
        alphabet = IUPAC.ambiguous_rna
    elif seq_type == 'PROTEIN':
        alphabet = IUPAC.protein
    else:
        raise ValueError("Invalid type %s passed. Need DNA, RNA or PROTEIN"
                         % type)

    # create a new alignment object
    fasta_align = FastaAlignment(Alphabet.Gapped(alphabet))

    # now parse the file and fill up the alignment object
    align_file = open(file_name, 'r')
    try:
        parser = Fasta.RecordParser()
        iterator = Fasta.Iterator(align_file, parser)

        # The loop stops as soon as next() returns a false value
        # (presumably None once the records run out - confirm against
        # Fasta.Iterator).
        cur_align = iterator.next()
        while cur_align:
            fasta_align.add_sequence(cur_align.title, cur_align.sequence)

            cur_align = iterator.next()
    finally:
        # the handle used to be left open; release it even if parsing fails
        align_file.close()

    return fasta_align
Exemplo n.º 9
0
def action(arguments):
    """
    Trim the alignment as specified

    The source file is parsed once to locate the primers, rewound, and
    parsed a second time so the selected prune action can be applied to
    the [start, end) slice; the result is written to the output file.
    A format that is not given explicitly is derived from the matching
    file handle.
    """
    def detect_format(explicit_format, handle):
        # Only fall back to the handle when no format was supplied.
        return explicit_format or fileformat.from_handle(handle)

    def read_sequences():
        # Fresh parser over the source file with a gapped single-letter
        # alphabet.
        gapped = Alphabet.Gapped(Alphabet.single_letter_alphabet)
        return SeqIO.parse(arguments.source_file, source_format,
                           alphabet=gapped)

    # Determine file format for input and output
    source_format = detect_format(arguments.source_format,
                                  arguments.source_file)
    output_format = detect_format(arguments.output_format,
                                  arguments.output_file)

    with arguments.source_file:
        # Locate primers
        primer_spans = locate_primers(read_sequences(),
                                      arguments.forward_primer,
                                      arguments.reverse_primer,
                                      arguments.reverse_complement,
                                      arguments.max_hamming_distance)
        (forward_start, forward_end), (reverse_start, reverse_end) = \
            primer_spans

        # Generate slice indexes
        if arguments.include_primers:
            start, end = forward_start, reverse_end + 1
        else:
            start, end = forward_end + 1, reverse_start

        # Rewind the input file and parse it again
        arguments.source_file.seek(0)
        sequences = read_sequences()

        # Apply the transformation
        prune_action = _ACTIONS[arguments.prune_action]
        transformed_sequences = prune_action(sequences, start, end)

        with arguments.output_file:
            SeqIO.write(transformed_sequences, arguments.output_file,
                        output_format)
Exemplo n.º 10
0
 def setUp(self):
     """Create the sequence fixtures shared by the tests.

     Sets self.s, the self.dna, self.rna, self.nuc and self.protein lists
     of Seq/MutableSeq objects, and self.test_chars.
     """
     new_seq, new_mutable = Seq.Seq, Seq.MutableSeq
     gapped, with_stop = Alphabet.Gapped, Alphabet.HasStopCodon
     ext_protein = IUPAC.extended_protein

     self.s = new_seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna)
     self.dna = [
         new_seq("ATCG", IUPAC.ambiguous_dna),
         new_seq("gtca", Alphabet.generic_dna),
         new_mutable("GGTCA", Alphabet.generic_dna),
         new_seq("CTG-CA", gapped(IUPAC.unambiguous_dna, "-")),
     ]
     self.rna = [
         new_seq("AUUUCG", IUPAC.ambiguous_rna),
         new_mutable("AUUCG", IUPAC.ambiguous_rna),
         new_seq("uCAg", Alphabet.generic_rna),
         new_mutable("UC-AG", gapped(Alphabet.generic_rna, "-")),
         new_seq("U.CAG", gapped(Alphabet.generic_rna, ".")),
     ]
     self.nuc = [new_seq("ATCG", Alphabet.generic_nucleotide)]
     # Protein fixtures: the gap and stop wrappers are nested in both
     # orders, with different gap ("-", ".") and stop ("*", "@") symbols.
     self.protein = [
         new_seq("ATCGPK", IUPAC.protein),
         new_seq("atcGPK", Alphabet.generic_protein),
         new_seq("T.CGPK", gapped(IUPAC.protein, ".")),
         new_seq("T-CGPK", gapped(IUPAC.protein, "-")),
         new_seq("MEDG-KRXR*", gapped(with_stop(ext_protein, "*"), "-")),
         new_mutable("ME-K-DRXR*XU",
                     gapped(with_stop(ext_protein, "*"), "-")),
         new_seq("MEDG-KRXR@", with_stop(gapped(ext_protein, "-"), "@")),
         new_seq("ME-KR@", with_stop(gapped(IUPAC.protein, "-"), "@")),
         new_seq("MEDG.KRXR@", gapped(with_stop(ext_protein, "@"), ".")),
     ]
     self.test_chars = ["-", new_seq("-"), new_seq("*"), "-X@"]
Exemplo n.º 11
0
def read_fasta(filename):
    """
    Read a .fasta alignment with a gapped protein alphabet
    Input: filename - name of the file
    Output: ndarray built with np.character from one list of letters
            per record
    """
    protein_alphabet = Alphabet.Gapped(Alphabet.IUPAC.protein)
    msa = AlignIO.read(filename, 'fasta', alphabet=protein_alphabet)
    rows = []
    for rec in msa:
        rows.append(list(rec))
    return np.array(rows, np.character)
Exemplo n.º 12
0
 def test_to_alignment(self):
     """to_alignment() is empty at first and has 3 rows of length 7 once tips carry sequences."""
     tree = self.phyloxml.phylogenies[0]
     empty_aln = tree.to_alignment()
     self.assertIsInstance(empty_aln, MultipleSeqAlignment)
     self.assertEqual(len(empty_aln), 0)

     # Add sequences to the terminals
     gapped_dna = Alphabet.Gapped(Alphabet.generic_dna)
     tip_letters = ('AA--TTA', 'AA--TTG', 'AACCTTC')
     for tip, letters in zip(tree.get_terminals(), tip_letters):
         record = SeqRecord(Seq(letters, gapped_dna), id=str(tip))
         tip.sequences.append(PX.Sequence.from_seqrecord(record))

     # Check the alignment
     filled_aln = tree.to_alignment()
     self.assertIsInstance(filled_aln, MultipleSeqAlignment)
     self.assertEqual(len(filled_aln), 3)
     self.assertEqual(filled_aln.get_alignment_length(), 7)
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment)

    One row of text is built per key of align_dict.abs(1).pos_align_dict by
    concatenating the .aa value found under that key at every position
    1..len(align_dict).  Rows are added in sorted key order, each identified
    by sum_dict[key].pdb2 + sum_dict[key].chain2.
    """
    # start every row as an empty string
    rows = dict.fromkeys(align_dict.abs(1).pos_align_dict, '')

    for position in range(1, len(align_dict) + 1):
        # entries aligned at this position
        column = align_dict.abs(position).pos_align_dict
        for key in column:
            rows[key] += column[key].aa

    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = MultipleSeqAlignment([], alphabet=alpha)
    for key in sorted(rows):
        summary = sum_dict[key]
        record = SeqRecord(Seq(rows[key], alpha),
                           summary.pdb2 + summary.chain2)
        fssp_align.append(record)
    return fssp_align
Exemplo n.º 14
0
def count_gaps_and_characters(aln_file, file_format = "fasta"):
    """
    counts the gap ("-") and non-gap characters over all sequences of an
    alignment
    :param aln_file: input alignment file
    :param file_format: input file format (default: fasta)
    :return: len() of the alignment (its number of sequences),
             number of gap chars, number of non-gap chars
    """
    aln_file = check_filename(aln_file)
    gapped_dna = Alphabet.Gapped(IUPAC.unambiguous_dna)
    aln = AlignIO.read(aln_file, file_format, alphabet=gapped_dna)

    gaps_per_record = [rec.seq.count("-") for rec in aln]
    length_per_record = [len(rec.seq) for rec in aln]
    gap_total = sum(gaps_per_record)
    letter_total = sum(length_per_record) - gap_total
    return len(aln), gap_total, letter_total
Exemplo n.º 15
0
def get_major_and_minor_consensus(aln_file, in_format="fasta"):
    """
    calculates major and minor consensus and each position's probability
    - major consensus - the most prominent base (including "-")
    - minor consensus - the most prominent base (not including "-")
    both frequencies are the winning count divided by the number of
    non-gap characters in the column, rounded to 2 digits
    a column made up only of gaps gets a frequency of 0.0 in both lists
    (it used to raise ZeroDivisionError) and an empty minor character
    :param aln_file: alignment file path
    :param in_format: input alignment format (default: fasta)
    :return: major_consensus, major_freqs, minor_consensus, minor_freqs
    """
    aln_file = check_filename(aln_file)
    aln = AlignIO.read(aln_file, in_format,
                       alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))
    len_aln = len(aln[0])
    num_of_seq = len(aln)
    major_consensus = ""
    major_freqs = []
    minor_consensus = ""
    minor_freqs = []
    for i in range(len_aln):
        counter = collections.Counter(aln[:, i])
        major_count = 0
        minor_count = 0
        major_char = ""
        minor_char = ""
        for j in counter:
            if counter[j] > major_count:
                major_count = counter[j]
                major_char = j
                if j != "-":
                    minor_count = counter[j]
                    minor_char = j
            # NOTE(review): a non-ACGT character becomes "N" only in this
            # branch; when it also wins the major count above it is kept
            # as-is for the minor consensus - confirm that is intended.
            if counter[j] > minor_count and j != "-":
                if j not in ["A", "C", "G", "T"]:
                    minor_count = counter[j]
                    minor_char = "N"
                else:
                    minor_count = counter[j]
                    minor_char = j

        # Counter returns 0 for a missing "-" key
        non_gap_count = num_of_seq - counter["-"]
        major_consensus += major_char
        minor_consensus += minor_char
        if non_gap_count:
            # NOTE(review): the major count may be the gap count, so the
            # major frequency can exceed 1 - confirm the denominator.
            major_freqs.append(round(major_count / non_gap_count, 2))
            minor_freqs.append(round(minor_count / non_gap_count, 2))
        else:
            # all-gap column: nothing to divide by
            major_freqs.append(0.0)
            minor_freqs.append(0.0)

    return major_consensus, major_freqs, minor_consensus, minor_freqs
Exemplo n.º 16
0
def format_changer(filename, out_format, outfile= None, in_format="fasta"):
    """
    reads an alignment file and writes it again in another format
    when no outfile is given, the input path is reused with its extension
    replaced by the output format name
    :param filename: input sequence filename
    :param out_format: output format
    :param outfile: output file (default: None)
    :param in_format: input format (default: fasta)
    :return: out file path in out format
    """
    filename = check_filename(filename)
    if outfile is None:
        root, _extension = path.splitext(filename)
        outfile = root + "." + out_format
    else:
        outfile = check_filename(outfile, Truefile=False)

    gapped_dna = Alphabet.Gapped(IUPAC.unambiguous_dna)
    alignment = AlignIO.read(filename, in_format, alphabet=gapped_dna)
    AlignIO.write(alignment, outfile, out_format)
    print("saved %s in format %s" % (outfile, out_format))
    return outfile
Exemplo n.º 17
0
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (Bio.Align.Generic)

    One row of text is built per key of align_dict.abs(1).pos_align_dict by
    concatenating the .aa value found under that key at every position
    1..len(align_dict).  Rows are added in sorted key order, each named
    sum_dict[key].pdb2 + sum_dict[key].chain2.
    """
    # start every row as an empty string
    rows = dict.fromkeys(align_dict.abs(1).pos_align_dict, '')

    position_count = len(align_dict)
    for position in range(1, position_count + 1):
        # entries aligned at this position
        column = align_dict.abs(position).pos_align_dict
        for key in column:
            rows[key] += column[key].aa

    protein_alphabet = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = Generic.Alignment(protein_alphabet)
    for key in sorted(rows):
        summary = sum_dict[key]
        fssp_align.add_sequence(summary.pdb2 + summary.chain2, rows[key])
    return fssp_align
Exemplo n.º 18
0
def get_longest_sequence_name_in_fasta(aln_file, in_format="fasta"):
    """
    returns the name of the sequence with the most non-gap characters
    the first such sequence wins on ties; an empty string is returned when
    no sequence holds any non-gap character
    :param aln_file: input alignment file path
    :param in_format: input format (default = fasta)
    :return: name of the longest sequence in the alignment
    """
    aln_file = check_filename(aln_file)
    gapped_dna = Alphabet.Gapped(IUPAC.unambiguous_dna)
    aln = AlignIO.read(aln_file, in_format, alphabet=gapped_dna)

    best_length = 0
    best_name = ""
    for record in aln:
        ungapped_length = len(str(record.seq).replace("-", ""))
        if ungapped_length > best_length:
            best_length, best_name = ungapped_length, record.name
    return best_name
Exemplo n.º 19
0
def cut_alignemnt_by_coordinates(aln_file, coor=[], perfix="cut", in_format="fasta"):
    """
    cuts an alignment down to the columns coor[0]:coor[1] and writes the
    result in fasta format
    attention - the coordinates must be normalized to the specific alignment
    the output name is the part of aln_file before ".aln" followed by
    "_<perfix>.aln"
    :param aln_file: input alignment file
    :param coor: input coordinates (default: [])
    :param perfix: prefix for output file (default: cut)
    :param in_format: input alignment format (default: fasta)
    :return: output filename of cut alignment
    :raises Exception: when coor is an empty list
    """
    if coor == []:
        raise Exception("no coordinates")
    aln_file = check_filename(aln_file)
    gapped_dna = Alphabet.Gapped(IUPAC.unambiguous_dna)
    full_aln = AlignIO.read(aln_file, in_format, alphabet=gapped_dna)

    first_column, end_column = coor[0], coor[1]
    cut_aln = full_aln[:, first_column:end_column]

    stem = aln_file.split(".aln")[0]
    output = stem + "_%s.aln" % perfix
    AlignIO.write(cut_aln, output, "fasta")
    print("wrote cut alignemnt in %s" % output)
    return output
Exemplo n.º 20
0
def parse_file(file_name, alphabet = IUPAC.unambiguous_dna, debug_level = 0):
    """Parse the given file into a clustal aligment object.

    Arguments:
    o file_name - The name of the file to parse.
    o alphabet - The type of alphabet to use for the alignment sequences.
    This should correspond to the type of information contained in the file.
    Defaults to be unambiguous_dna sequence.  An alphabet that is not
    already an Alphabet.Gapped instance is wrapped in one.

    There is a deprecated optional argument debug_level which has no effect.

    Since Biopython 1.46, this has called Bio.AlignIO internally.

    Returns a ClustalAlignment that shares the records read by AlignIO,
    with every record's sequence alphabet replaced by the gapped alphabet.
    """

    # Avoid code duplication by calling Bio.AlignIO to do this for us.
    from Bio import AlignIO
    handle = open(file_name, 'r')
    try:
        generic_alignment = AlignIO.read(handle, "clustal")
    finally:
        # close the file even when the parser raises
        handle.close()

    #Force this generic alignment into a ClustalAlignment... nasty hack
    if isinstance(alphabet, Alphabet.Gapped):
        alpha = alphabet
    else:
        alpha = Alphabet.Gapped(alphabet)
    clustal_alignment = ClustalAlignment(alpha)
    clustal_alignment._records = generic_alignment._records
    for record in clustal_alignment._records:
        record.seq.alphabet = alpha

    # Copy the optional attributes.  The version may be missing in a 3rd
    # party tool's output, and the consensus is not always present either,
    # so an AttributeError is simply skipped.
    for optional_attr in ("_version", "_star_info"):
        try:
            setattr(clustal_alignment, optional_attr,
                    getattr(generic_alignment, optional_attr))
        except AttributeError:
            pass

    return clustal_alignment
Exemplo n.º 21
0
def unalign(filename, in_format="fasta", gap = "-", outfile = None):
    """
    removes the gap character from every sequence of an alignment and
    writes the sequences in fasta format
    when no outfile is given, the input path is reused with its extension
    replaced by "-unaligned.fasta"
    :param filename: input alignment filename
    :param in_format: input format (default: fasta)
    :param gap: gap type (default: - )
    :param outfile: output file path (default: None)
    :return: out file path without gaps
    """
    filename = check_filename(filename)
    gapped_dna = Alphabet.Gapped(IUPAC.unambiguous_dna)
    alignment = AlignIO.read(filename, in_format, alphabet=gapped_dna)
    for record in alignment:
        record.seq = record.seq.ungap(gap)

    if outfile is not None:
        # NOTE(review): the other check_filename calls for output paths in
        # this code pass Truefile=False - confirm None is intended here.
        outfile = check_filename(outfile, Truefile=None)
    else:
        outfile = path.splitext(filename)[0] + "-unaligned.fasta"
    SeqIO.write(alignment, outfile, "fasta")
    print("saved unaligned %s" % outfile)
    return outfile
Exemplo n.º 22
0
def get_consensus_from_alignment(aln_file, in_format="fasta"):
    """
    builds a consensus from the most frequent character of every column
    columns whose most frequent character is a gap are left out, so the
    consensus can be shorter than the alignment
    on ties the character met first in the column's counter wins
    :param aln_file: alignment file
    :param in_format: file format (default: fasta)
    :return: consensus sequence
    """
    aln_file = check_filename(aln_file)
    gapped_dna = Alphabet.Gapped(IUPAC.unambiguous_dna)
    aln = AlignIO.read(aln_file, in_format, alphabet=gapped_dna)

    kept_chars = []
    for column in range(len(aln[0])):
        tally = collections.Counter(aln[:, column])
        # max() returns the first key that reaches the highest count
        top_char = max(tally, key=tally.get)
        if top_char != "-":
            kept_chars.append(top_char)
    return "".join(kept_chars)
Exemplo n.º 23
0
def get_consensus_percentage(aln_file, in_format="fasta"):
    """
    gets alignment file and returns the consensus and
    the percentage of each position in the alignment
    the percentage calculation ignores gaps
    for every column the most frequent non-gap character is added to the
    consensus and its share among the non-gap characters, rounded to one
    digit, is counted in a histogram; the histogram is finally divided by
    the alignment length
    columns made up only of gaps add nothing to the consensus and are not
    counted in the histogram (they used to raise ZeroDivisionError)
    a rounded share outside the preset 0.2 - 1 buckets gets its own key
    (it used to raise KeyError)
    :param aln_file: input alignment file path
    :param in_format: input file format (default: fasta)
    :return: consensus sequence and consensus percentage
    """
    aln_file = check_filename(aln_file)
    aln = AlignIO.read(aln_file, in_format,
                       alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))
    len_aln = len(aln[0])
    num_of_seq = len(aln)
    consensus_percentage = {1: 0, 0.9: 0, 0.8: 0, 0.7: 0, 0.6: 0,
                            0.5: 0, 0.4: 0, 0.3: 0, 0.2: 0}
    consensus = ""
    for i in range(len_aln):
        counter = collections.Counter(aln[:, i])
        count = 0
        max_char = ""
        for j in counter:
            if j != "-" and counter[j] > count:
                count = counter[j]
                max_char = j
        consensus += max_char

        # Counter returns 0 for a missing "-" key
        non_gap_count = num_of_seq - counter["-"]
        if non_gap_count == 0:
            # all-gap column: there is nothing to take a percentage of
            continue
        percentage = round(count / non_gap_count, 1)
        consensus_percentage[percentage] = \
            consensus_percentage.get(percentage, 0) + 1

    # a zero-length alignment leaves the histogram at zero
    if len_aln:
        for n in consensus_percentage:
            consensus_percentage[n] = round(
                consensus_percentage[n] / len_aln, 3)
    return consensus, consensus_percentage
Exemplo n.º 24
0
 def setUp(self):
     """Create the sequence fixtures shared by the tests.

     Sets the self.dna, self.rna, self.nuc and self.protein lists; each
     holds Seq/MutableSeq objects followed by one plain string.
     """
     new_seq, new_mutable = Seq.Seq, Seq.MutableSeq
     gapped, with_stop = Alphabet.Gapped, Alphabet.HasStopCodon
     ext_protein = IUPAC.extended_protein

     self.dna = [
         new_seq("ATCG", IUPAC.ambiguous_dna),
         new_seq("gtca", Alphabet.generic_dna),
         new_mutable("GGTCA", Alphabet.generic_dna),
         new_seq("CTG-CA", gapped(IUPAC.unambiguous_dna, "-")),
         "TGGTCA",
     ]
     self.rna = [
         new_seq("AUUUCG", IUPAC.ambiguous_rna),
         new_mutable("AUUCG", IUPAC.ambiguous_rna),
         new_seq("uCAg", Alphabet.generic_rna),
         new_mutable("UC-AG", gapped(Alphabet.generic_rna, "-")),
         new_seq("U.CAG", gapped(Alphabet.generic_rna, ".")),
         "UGCAU",
     ]
     self.nuc = [
         new_seq("ATCG", Alphabet.generic_nucleotide),
         "UUUTTTACG",
     ]
     self.protein = [
         new_seq("ATCGPK", IUPAC.protein),
         new_seq("atcGPK", Alphabet.generic_protein),
         new_seq("T.CGPK", gapped(IUPAC.protein, ".")),
         new_seq("T-CGPK", gapped(IUPAC.protein, "-")),
         new_seq("MEDG-KRXR*", gapped(with_stop(ext_protein, "*"), "-")),
         new_mutable("ME-K-DRXR*XU",
                     gapped(with_stop(ext_protein, "*"), "-")),
         "TEDDF",
     ]
Exemplo n.º 25
0
 def get_alphabet(self):
     """Return the alphabet registered for self.type, gapped when aligned.

     Falls back to Alphabet.generic_alphabet when self.type has no entry in
     self.alphabets.  The alphabet is wrapped in Alphabet.Gapped only when
     self.mol_seq is set and flagged as aligned.
     """
     base_alphabet = self.alphabets.get(self.type, Alphabet.generic_alphabet)
     is_aligned = self.mol_seq and self.mol_seq.is_aligned
     return Alphabet.Gapped(base_alphabet) if is_aligned else base_alphabet
Exemplo n.º 26
0
    print repr(test_seq[1::3])
    print repr(test_seq[2::3])

    print "Setting wobble codon to N (set slice with stride 3):"
    test_seq[2::3] = "N" * len(test_seq[2::3])
    print repr(test_seq)

###########################################################################
# Section header, then the fixture lists for the "Seq addition" section.
# Each list holds Seq/MutableSeq objects with different alphabets and ends
# with one plain string.
print
print "Testing Seq addition"
print "===================="
# DNA fixtures: ambiguous, generic (Seq and MutableSeq) and gapped alphabets.
dna = [
    Seq.Seq("ATCG", IUPAC.ambiguous_dna),
    Seq.Seq("gtca", Alphabet.generic_dna),
    Seq.MutableSeq("GGTCA", Alphabet.generic_dna),
    Seq.Seq("CTG-CA", Alphabet.Gapped(IUPAC.unambiguous_dna, "-")), "TGGTCA"
]
# RNA fixtures: the two gapped entries use different gap characters.
rna = [
    Seq.Seq("AUUUCG", IUPAC.ambiguous_rna),
    Seq.MutableSeq("AUUCG", IUPAC.ambiguous_rna),
    Seq.Seq("uCAg", Alphabet.generic_rna),
    Seq.MutableSeq("UC-AG", Alphabet.Gapped(Alphabet.generic_rna, "-")),
    Seq.Seq("U.CAG", Alphabet.Gapped(Alphabet.generic_rna, ".")), "UGCAU"
]
# Generic nucleotide fixture plus a plain string that mixes U and T.
nuc = [Seq.Seq("ATCG", Alphabet.generic_nucleotide), "UUUTTTACG"]
protein = [
    Seq.Seq("ATCGPK", IUPAC.protein),
    Seq.Seq("atcGPK", Alphabet.generic_protein),
    Seq.Seq("T.CGPK", Alphabet.Gapped(IUPAC.protein, ".")),
    Seq.Seq("T-CGPK", Alphabet.Gapped(IUPAC.protein, "-")),
    Seq.Seq(
Exemplo n.º 27
0
 def get_alphabet(self):
     """Look up the alphabet for this sequence's type.

     self.alphabets is consulted with self.type, using
     Alphabet.generic_alphabet when there is no entry.  When self.mol_seq
     is set and marked as aligned, the result is wrapped in Alphabet.Gapped.
     """
     alphabet = self.alphabets.get(self.type, Alphabet.generic_alphabet)
     if not (self.mol_seq and self.mol_seq.is_aligned):
         return alphabet
     return Alphabet.Gapped(alphabet)
Exemplo n.º 28
0
                            lineno,
                            file=None,
                            line=None):
    #TODO - Have Biopython DataLossWarning?
    if category in [UserWarning]:
        print "%s - %s" % (category.__name__, message)


#Replace the warnings module's display hook with send_warnings_to_stdout.
warnings.showwarning = send_warnings_to_stdout

#Alphabets grouped by molecule type; the nucleotide group holds both the
#plain and the gapped generic nucleotide alphabet.
protein_alphas = [Alphabet.generic_protein]
dna_alphas = [Alphabet.generic_dna]
rna_alphas = [Alphabet.generic_rna]
nucleotide_alphas = [
    Alphabet.generic_nucleotide,
    Alphabet.Gapped(Alphabet.generic_nucleotide)
]
#NOTE(review): presumably the formats that carry no alphabet information in
#the file itself - confirm against the code that uses this list.
no_alpha_formats = [
    "fasta", "clustal", "phylip", "phylip-relaxed", "phylip-sequential", "tab",
    "ig", "stockholm", "emboss", "fastq", "fastq-solexa", "fastq-illumina",
    "qual"
]
#NOTE(review): presumably the formats that may yield sequences of unknown
#content - confirm against the code that uses this list.
possible_unknown_seq_formats = ["qual", "genbank", "gb", "embl", "imgt"]

#List of formats including alignment only file formats we can read AND write.
#The list is initially hard coded to preserve the original order of the unit
#test output, with any new formats added since appended to the end.
test_write_read_alignment_formats = [
    "fasta", "clustal", "phylip", "stockholm", "phylip-relaxed"
]
for format in sorted(SeqIO._FormatToWriter):
Exemplo n.º 29
0
    print consensus
    consensus = summary.gap_consensus(ambiguous="N")
    print consensus
    print
    print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                            axis_seq=consensus)
    print
    #Have a generic alphabet, without a declared gap char, so must tell
    #provide the frequencies and chars to ignore explicitly.
    print summary.information_content(e_freq_table=expected,
                                      chars_to_ignore=['-'])
    print
    print "Trying a protein sequence with gaps and stops"

    alpha = Alphabet.HasStopCodon(
        Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
    a = Alignment(alpha)
    a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
    a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
    a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
    print a
    print "=" * a.get_alignment_length()

    s = SummaryInfo(a)
    c = s.dumb_consensus(ambiguous="X")
    print c
    c = s.gap_consensus(ambiguous="X")
    print c
    print
    print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
Exemplo n.º 30
0
#!/usr/bin/env python
#coding: utf-8

from Bio import AlignIO, SeqIO, Align, Alphabet
import pandas as pd
import os, re, sys
from copy import deepcopy

# Gapped ambiguous-DNA alphabet (not passed to AlignIO.read in the part of
# the script shown here).
aln_alphabet = Alphabet.Gapped(Alphabet.IUPAC.ambiguous_dna)

# Hard-coded input/output locations.
aln_folder    = '/work/abg_tree/concatenated_trees/3rd_try/alignments'
output_folder = '/work/abg_tree/concatenated_trees/3rd_try'
# Maps each alignment file name to the set of genome ids found in it.
genomes    = {}
for aln in os.listdir(aln_folder):
    alignment    = AlignIO.read('%s/%s' %(aln_folder, aln), 'fasta')
    genomes[aln] = set()
    for entry in alignment:
        # Names starting with GCA_/GCF_ are split on '|', all others on '_'.
        # Either way the name must split into exactly two parts, otherwise
        # the tuple unpacking raises ValueError.
        if re.match('GC[AF]_', entry.name):
            genome, gene = entry.name.split('|')
        else:
            genome, gene = entry.name.split('_')

        # A genome may appear only once per alignment; abort otherwise.
        if genome in genomes[aln]:
            sys.exit('\t**Error, duplicated genome in %s: %s' %(aln, genome))

        genomes[aln].add(genome)

# Every genome id seen in at least one alignment.
genome_union = set.union(*genomes.values())

missing_genes = {} # just to keep track of the number of missing marker genes in each genome
concatenation = {}