Пример #1
0
    def _concatenate(self, alignments):
        """Return single alignment from list of alignments for
multiple genes."""
        if len(alignments) == 1:
            return alignments[0]
        # sort IDs
        alignment_ids = []
        for gene in alignments:
            gene_ids = []
            for rec in gene:
                gene_ids.append(rec.id)
            alignment_ids.append(gene_ids)
        all_ids = []
        [all_ids.extend(e) for e in alignment_ids]
        all_ids = list(set(all_ids))
        # concatenate
        alignment = MultipleSeqAlignment([])
        for txid in all_ids:
            sequence = ""
            for i, gene in enumerate(alignments):
                if txid in alignment_ids[i]:
                    sequence += gene[alignment_ids[i].index(txid)].seq
                else:
                    sequence += "-" * gene.get_alignment_length()
            sequence = SeqRecord(sequence, id=txid,
                                 description="multigene sequence")
            alignment.append(sequence)
        return alignment
Пример #2
0
    def to_generic(self, alphabet):
        """Build a MultipleSeqAlignment from the parsed alignment tuples.

        self.alignment is read as (name, start, seq, end) tuples.  Each
        tuple named 'QUERY' opens a new block.  Rows of the first block
        create the records; rows of later blocks are appended, in order,
        to the sequence text collected at the same position.

        alphabet is the alphabet given to every Seq and to the alignment
        (for example IUPAC.IUPACProtein).

        Thanks to James Casbon for the code.
        """
        names = []
        chunks = []
        block_count = 0
        row = 0
        for name, _start, seq, _end in self.alignment:
            if name == 'QUERY':  # QUERY opens every alignment block
                block_count += 1
                row = 0

            if block_count != 1:
                # later blocks extend the text gathered for this row
                chunks[row] += seq
                row += 1
                continue
            # first block: register a new row
            chunks.append(seq)
            names.append(name)

        generic = MultipleSeqAlignment([], alphabet)
        for name, text in zip(names, chunks):
            generic.append(SeqRecord(Seq(text, alphabet), name))

        return generic
def stage_one_trimming(alignment, window_size, proportion, threshold, min_len):
    """
    ---------------------------------------------------------------------
    MODIFIED FUNCTION FROM PHYLUCE: generic_align.py
    ---------------------------------------------------------------------
    First stage alignment trimming to find and trim edges of a given
    alignment.  Calls running_average to determine the start and end
    trimming positions for the entire alignment block.

    Every record has its sequence alphabet reset to ambiguous DNA and is
    sliced to [start:end].  Returns the trimmed MultipleSeqAlignment, or
    None as soon as the trim positions are unusable or a trimmed row is
    all '-', all '?', or shorter than min_len.
    """
    start, end = running_average(alignment, window_size, proportion, threshold)
    s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    for sequence in alignment:
        sequence.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
        if not (start >= 0 and end):
            return None
        trim = sequence[start:end]
        residues = set(trim)
        # Compare against sets on both sides: the earlier set-vs-list test
        # for '?' could never match, so all-missing rows slipped through.
        if (residues == set(['-']) or residues == set(['?'])
                or len(trim) < min_len):
            return None
        s1_trimmed.append(trim)

    return s1_trimmed
Пример #4
0
 def test_basic_alignment(self):
     """Check size, rows, columns and row slicing of a three-row alignment."""
     letters = "AbcDefGhiJklMnoPqrStuVwxYz"
     expected_rows = [
         ("mixed", letters),
         ("lower", letters.lower()),
         ("upper", letters.upper()),
     ]
     alignment = MultipleSeqAlignment([])
     for row_id, text in expected_rows:
         alignment.append(SeqRecord(Seq(text), id=row_id))
     self.assertEqual(alignment.get_alignment_length(), 26)
     self.assertEqual(len(alignment), 3)
     # Row contents first, then row identifiers.
     for index, (_row_id, text) in enumerate(expected_rows):
         self.assertEqual(str(alignment[index].seq), text)
     for index, (row_id, _text) in enumerate(expected_rows):
         self.assertEqual(alignment[index].id, row_id)
     # Each column reads top to bottom: mixed, lower, upper.
     for col, letter in enumerate(letters):
         column = alignment[:, col]
         self.assertEqual(column, letter + letter.lower() + letter.upper())
     # Check row extractions:
     first_row, last_row = alignment[0], alignment[-1]
     self.assertEqual(first_row.id, "mixed")
     self.assertEqual(last_row.id, "upper")
     # Check sub-alignment extraction by row slicing:
     self.assertIsInstance(alignment[::-1], MultipleSeqAlignment)
     self.assertEqual(alignment[::-1][0].id, "upper")
     self.assertEqual(alignment[::-1][2].id, "mixed")
Пример #5
0
 def trim_seqs_to_ref(self):
     """
     Trim the requested sequences to the reference length in the alignment.

     For every record whose id is in self.trim_seqs, positions before
     self.boundary[0] and from self.boundary[1] onwards are overwritten
     with self.gap_char.  A record left with nothing but gap characters
     is reported on stderr and dropped.  All other records are kept as
     they are.  self.alignment is replaced by the resulting alignment.
     """
     temp_aln = MultipleSeqAlignment([])
     for seq in self.alignment:
         # Test truthiness first so an unset (None) or empty trim_seqs
         # means "trim nothing" instead of failing the membership test.
         if not self.trim_seqs or seq.id not in self.trim_seqs:
             temp_aln.append(seq)
             continue
         sequence = MutableSeq(str(seq.seq))
         left = self.boundary[0]
         if left > 0:
             sequence[0:left] = self.gap_char * left
         right = self.boundary[1]
         if right < len(sequence):
             sequence[right:] = self.gap_char * (len(sequence) - right)
         # The record is updated in place, even if it is dropped below.
         seq.seq = sequence
         if set(seq.seq) == {self.gap_char}:
             print(
                 f"{seq.id} contains only gaps after trimming. "
                 f"Removing {seq.id} from alignment.",
                 file=sys.stderr)
         else:
             temp_aln.append(seq)
     self.alignment = temp_aln
Пример #6
0
def removecolumnfrommask(seqfile, filetype, mask):
    """Write a copy of an alignment without the columns listed in a mask.

    seqfile is read with AlignIO in format filetype.  mask is a text file
    holding one column index per line; blank lines are ignored.  The kept
    columns are written as FASTA to "<seqfile up to its first dot>_masked.fas".
    Record ids containing 'SWARM' are written cut at their first
    underscore.  Progress counts are printed; nothing is returned.
    """
    # Opened first, as before, so the output file exists even if reading
    # the alignment fails; the with-block guarantees it is closed.
    with open(seqfile.split('.')[0] + '_masked.fas', 'w+') as outFile:
        alignment = AlignIO.read(seqfile, filetype)
        trimAlign = MultipleSeqAlignment([])
        numCol = alignment.get_alignment_length()

        with open(mask, 'r') as mask_file:
            # int() tolerates the trailing newline; skip empty lines.
            coltoremove = [int(line) for line in mask_file if line.strip()]
        print(len(coltoremove))

        # Set membership keeps the column scan linear.
        removed = set(coltoremove)
        colToKeep = [i for i in range(numCol) if i not in removed]
        print(len(colToKeep))
        print('if okay remove+keep (', int(len(coltoremove) + len(colToKeep)),
              ') match ', int(numCol))
        for record in alignment:
            newseq = "".join(record[j] for j in colToKeep)
            trimAlign.append(SeqRecord(Seq(newseq), id=record.id))
            if 'SWARM' in record.id:
                header = record.id.split('_')[0]
            else:
                header = record.id
            outFile.write('>' + header + '\n' + newseq + '\n')
    print("Total number of columns remaining: %i" %
          trimAlign.get_alignment_length())
Пример #7
0
    def getOptimalQuartets(self, quartets):
        """Pick the lowest-cost topology for each quartet.

        For every quartet three orderings are scored: the one given, and
        the two obtained by successively swapping element 0 and then
        element 1 with element 2.  Each ordering is turned into a tree
        with treeFromQuartet and scored with SmallParsimony on the rows
        of self._alignment whose id is in the quartet.

        Returns a dict mapping getQuartetID(quartet) to a dict with the
        winning ordering ("topology") and its getTopologyID value
        ("topology_id").  Note that each quartet list is permuted in
        place by the swaps.
        """
        optimal_quartets = dict()
        for quartet in quartets:
            quartet_id = self.getQuartetID(quartet)
            assert quartet_id not in optimal_quartets

            trees = {tuple(quartet): self.treeFromQuartet(quartet)}
            for i in range(0, 2):
                # In-place swap; the next pass builds on this ordering.
                quartet[i], quartet[2] = quartet[2], quartet[i]
                trees[tuple(quartet)] = self.treeFromQuartet(quartet)

            min_cost = float("inf")
            # items() exists on Python 2 and 3; iteritems() only on 2.
            for quartet_key, tree in trees.items():
                alignment = MultipleSeqAlignment([])
                for record in self._alignment:
                    if record.id in quartet:
                        alignment.append(record)
                small_parsimony = SmallParsimony(tree, alignment)
                # Strict comparison: on a tie the earlier topology stays.
                if small_parsimony.cost < min_cost:
                    min_cost = small_parsimony.cost
                    optimal_quartets[quartet_id] = {
                        "topology": quartet_key,
                        "topology_id": self.getTopologyID(quartet_key)
                    }
        return optimal_quartets
Пример #8
0
 def stage_one_trimming(self, alignment, window_size, threshold,
                        proportion):
     """
     First stage (of 3) alignment trimming to find and trim edges of a given
     alignment.  Calls self.running_average to determine the start and end
     trimming positions for the entire alignment block.

     Returns the trimmed MultipleSeqAlignment, or None as soon as the trim
     positions are unusable or a trimmed row is all '-' or all '?'.
     """
     # get the trim positions that we determine begin and end "good"
     # alignments
     start, end = self.running_average(alignment, window_size, threshold,
                                       proportion)
     # create a new alignment object to hold our alignment
     s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-"))
     for sequence in alignment:
         if not (start >= 0 and end):
             return None
         trim = sequence[start:end]
         residues = set(trim)
         # ensure we don't just add a taxon with only gaps/missing data.
         # Both sides must be sets: the earlier set-vs-list test for '?'
         # could never match.
         if residues == set(['-']) or residues == set(['?']):
             return None
         s1_trimmed.append(trim)
     return s1_trimmed
Пример #9
0
def load_weighted_msa(work_msa):
    """
    Load the Clustal alignment at work_msa.output_aln and split off the
    reference.

    The given multiple sequence alignment (MSA) should contain the reference
    sequence, identified by work_msa.shared.target_id.  The reference is
    removed from the alignment and attached to it as msa.ref, with its
    gap-free form in msa.ref_ungapped.  Every sequence is rewritten so that
    "." marks terminal deletions while "-" marks internal deletions.  The
    alignment is passed to weight_alignments before being returned.

    Raises ValueError when no record carries the reference id.
    """
    msa_with_ref = AlignIO.read(work_msa.output_aln, 'clustal')

    def to_dots(match):
        # Same-length run of dots, so alignment columns stay in register.
        return '.' * (match.end() - match.start())

    ref = None
    msa = MultipleSeqAlignment([])

    for record in msa_with_ref:
        # Use "." to indicate terminal mismatches, and "-" to indicate
        # internal mismatches.
        record.seq = Seq(
            re.sub('^-+|-+$', to_dots, str(record.seq)),
            record.seq.alphabet,
        )

        if record.id == work_msa.shared.target_id:
            ref = record
        else:
            msa.append(record)

    if ref is None:
        # Fail with a clear message instead of an attribute error on None.
        raise ValueError(
            "reference sequence %r not found in alignment %r"
            % (work_msa.shared.target_id, work_msa.output_aln))

    msa.ref = ref
    msa.ref_ungapped = remove_gaps(ref.seq)

    weight_alignments(msa)

    return msa
Пример #10
0
def maskResiduesNOMAP(refMSA_file, numseq, alnlen, scores, x, formatout, final_file, seqType):
	''' Masks poorly aligned residues whose score is <x. Will NOT mask gaps.

	Reads the FASTA alignment refMSA_file, replaces every non-gap residue
	whose scores[row][position] is below x with '?', and writes the result
	to final_file in format formatout.  Output records are numbered from 1.
	seqType must be 'protein' or 'dna'; anything else raises ValueError.
	'''

	new='?'
	cutoff=float(x)
	parsed = AlignIO.read(refMSA_file, 'fasta')
	newseqs=[]
	for row in range(numseq):
		residues=parsed[row].seq
		masked=[]
		for position in range(alnlen):
			thispos=str(residues[position])
			# gaps are always kept; residues are kept only at or above the cutoff
			if thispos!='-' and float(scores[row][position])<cutoff:
				masked.append(new)
			else:
				masked.append(thispos)
		newseqs.append(''.join(masked))

	maskedMSA=MultipleSeqAlignment([])
	for i in range(numseq):
		if str(seqType)=='protein':
			alphabet=generic_protein
		elif str(seqType)=='dna':
			alphabet=generic_dna
		else:
			# previously this fell through with no record defined
			raise ValueError("seqType must be 'protein' or 'dna', got %r" % (seqType,))
		maskedMSA.append(SeqRecord(Seq(newseqs[i],alphabet), id=str(i+1), description=''))

	with open(final_file, 'w') as outhandle:
		outhandle.write(maskedMSA.format(str(formatout)))
Пример #11
0
def pad_nucleotide_sequences(aln_aa, seq_nuc):
    '''
    Introduce gaps of 3 (---) into nucleotide sequences so that they follow
    the corresponding aligned amino acid sequences.

    Parameters:
    - aln_aa: amino acid alignment
    - seq_nuc: unaligned nucleotide sequences, looked up by record id.

    Returns:
    - aligned nucleotide sequences with all gaps length 3

    Records whose id is missing from seq_nuc are reported and skipped.
    '''
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    aln_nuc = MultipleSeqAlignment([])
    for aa_seq in aln_aa:
        try:
            tmp_nuc_seq = str(seq_nuc[aa_seq.id].seq)
        except KeyError:
            # Single-argument print() behaves the same on Python 2 and 3.
            print(aa_seq.id)
            print('Key not found, continue with next sequence')
            continue

        codons = []
        nuc_pos = 0
        for aa in aa_seq:
            if aa == '-':
                codons.append('---')
            else:
                # Consume the next three nucleotides for this residue.
                codons.append(tmp_nuc_seq[nuc_pos:(nuc_pos + 3)])
                nuc_pos += 3

        aln_nuc.append(SeqRecord(seq=Seq(''.join(codons)), id=aa_seq.id))

    return aln_nuc
def main():
    """Strip the file-name prefix from taxon names in a set of nexus files.

    For every input file, each record name has a leading
    "<file name without extension>" plus any following underscores
    removed; the result becomes the record's id and name.  The rewritten
    alignment is saved under the same file name in args.output.  After
    each file the number of distinct names seen so far must equal
    args.taxa.
    """
    args = get_args()
    files = get_files(args.input)
    all_taxa = set([])
    for count, f in enumerate(files):
        new_align = MultipleSeqAlignment([], generic_dna)
        fname = os.path.splitext(os.path.basename(f))[0]
        # Escape the file name so characters such as "." or "+" in it are
        # matched literally rather than treated as regex operators.
        prefix = re.compile("^{}_*".format(re.escape(fname)))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                new_seq_name = prefix.sub("", seq.name)
                all_taxa.add(new_seq_name)
                seq.id = new_seq_name
                seq.name = new_seq_name
                new_align.append(seq)
        assert len(all_taxa) == args.taxa, "Taxon names are not identical"
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            # The with-block closes (and flushes) the output file.
            with open(outf, 'w') as outhandle:
                AlignIO.write(new_align, outhandle, 'nexus')
        except ValueError:
            # NOTE(review): drops into the debugger on a write failure;
            # confirm this hook is still wanted outside interactive use.
            pdb.set_trace()
        print(count)
    print("Taxon names in alignments: {0}".format(','.join(list(all_taxa))))
Пример #13
0
    def testLimit(self, list_seqs, start):
        """
        Report, per amino acid column of the window, whether it is conserved.

        list_seqs is the list of sequence indices in the alignment (not the
            id associated with the Bio.Seq object).
        start is the nucleotide index of the start of the window.

        The translated alignment matching the reading frame of start is
        sliced to the window.  A column yields True when the share of its
        most common character among list_seqs reaches self.min_aa_ratio.
        Returns one boolean per column.
        """
        frame = start % 3
        aa_window_length = int(self.window_length / 3)
        begin = int((start - frame) / 3)
        end = int(begin + aa_window_length)

        # pick the translation made in the same frame as the window start
        if frame == 0:
            source = self.t_align0
        elif frame == 1:
            source = self.t_align1
        else:
            source = self.t_align2

        window = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
        for idx in list_seqs:
            window.append(source[idx][begin:end])

        verdicts = []
        for column in range(aa_window_length):
            # occurrences of the most frequent character in this column
            top_count = Counter(window[:, column]).most_common(1)[0][1]
            verdicts.append(
                bool(top_count / len(list_seqs) >= self.min_aa_ratio))
        return verdicts
Пример #14
0
def clean_seqs(gene):
    '''Keep the records whose "N" plus "-" count is under 10% of their length.'''
    kept = MultipleSeqAlignment([])
    for genome in gene:
        residues = genome.seq
        ambiguous = residues.count("N") + residues.count("-")
        if ambiguous >= 0.1 * len(residues):
            # too many unknown or gap characters: leave this record out
            continue
        kept.append(genome)
    return kept
Пример #15
0
 def stage_two_trimming(self, s1_trimmed, window_size=5):
     """
     Alignment row-by-row trimming.  After stage one trimming, iterate
     over rows of alignment to find differences between the alignment
     consensus and the row of data.  Trim those ends coming before
     (or after at 3' end) a block of 5 contiguous highly conserved
     positions.  Goes to third round of filtering to remove edges that
     end up with only '----' characters to start or end alignment block.

     Returns the trimmed MultipleSeqAlignment, or None as soon as one row
     ends up all '-' or all '?'.
     """
     # number of consecutive "good" positions that marks a trusted edge
     run_length = 5
     # create new alignment object to hold trimmed alignment
     s2_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna,
                                                  "-?"))
     # get consensus of alignment in array form
     consensus_array = numpy.array(
         list(self._alignment_consensus(s1_trimmed)))
     # iterate over each alignment sequence
     for sequence in s1_trimmed:
         start, end = self._get_ends(sequence)
         # convert sequence to array
         orig_seq_array = numpy.array(list(sequence))
         # trim down edge gaps so they do not exert undue influence
         # on the running average
         seq_array = orig_seq_array[start:end]
         compare = (seq_array == consensus_array[start:end])
         weight = numpy.repeat(1.0, window_size) / window_size
         # compute running average across window size
         running_average = numpy.convolve(compare, weight, 'same')
         # a position is "good" when its whole window matches the consensus
         gm = (running_average > 0.99)
         # Reset per row.  Without this a row with no qualifying run
         # either raised on the first row or silently reused the
         # previous row's positions.
         # NOTE(review): the fallback leaves such a row untrimmed inside
         # [start:end]; confirm that is the wanted policy.
         bad_start = 0
         bad_end = gm.size
         for i in range(gm.size):
             # first run of good positions from the 5' end
             if numpy.all(gm[i:i + run_length]):
                 bad_start = i
                 break
         reversed_gm = gm[::-1]
         for i in range(reversed_gm.size):
             # first run of good positions from the 3' end
             if numpy.all(reversed_gm[i:i + run_length]):
                 bad_end = reversed_gm.size - i
                 break
         orig_seq_array[:start + bad_start] = '-'
         orig_seq_array[start + bad_end:] = '-'
         trim = ''.join(orig_seq_array)
         residues = set(trim)
         # Both sides must be sets: the earlier set-vs-list test for '?'
         # could never match.
         if residues == set(['-']) or residues == set(['?']):
             return None
         s2_trimmed.append(self._record_formatter(trim, sequence.id))
     return s2_trimmed
Пример #16
0
def gap_span(reads, bases):
    '''
    Returns a MSA with rows=reads and columns=bases, composed of gaps only.

    reads supplies the record ids, one row per entry; bases is the number
    of '-' characters in every row.  The module-level alphabet is used
    for the alignment and for each sequence.
    '''
    # The first argument is the record list; the alphabet goes second.
    spal = MultipleSeqAlignment([], alphabet)
    span = '-' * bases
    for r in reads:
        spal.append(Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(span, alphabet), id=r))
    return spal
def remove_seed_duplicates(msa, seed_index):
    """Return a copy of msa without the rows that repeat the seed sequence.

    The row at seed_index is always kept; any other row whose sequence
    text equals the seed's is left out.
    """
    seed_text = str(msa[seed_index].seq)
    deduplicated = MultipleSeqAlignment([])
    for position, candidate in enumerate(msa):
        is_seed = position == seed_index
        if is_seed or str(candidate.seq) != seed_text:
            deduplicated.append(candidate)
    return deduplicated
Пример #18
0
def bam2Alignment(sam_name, chrom=None, start=None, stop=None, minlen=1):
    """
    Read alignment from samfile and return Alignment object.

    sam_name is the object whose fetch(chrom, start, stop) yields the
    reads.  A read is kept when both length tests against minlen pass; it
    is converted with getSeqRecord(read, start=start, stop=stop).

    NOTE(review): the length tests do arithmetic on start and stop, so the
    None defaults fail as soon as a read is fetched; confirm callers
    always pass both.
    """
    it = sam_name.fetch(chrom, start, stop)
    # The first argument is the record list; the alphabet goes second.
    aln = MultipleSeqAlignment([], alphabet)
    for read in it:
        if read.rlen - start + read.pos + 1 > minlen and stop - read.pos + 1 >= minlen:
            aln.append(getSeqRecord(read, start=start, stop=stop))

    return aln
def replace_gaps(aln):
    """Rebuild aln with each sequence run through replace_gaps_at_start_and_ends.

    The id, name and description of every record are carried over; the new
    alignment is created with generic_dna.
    """
    rebuilt = MultipleSeqAlignment([], generic_dna)
    for taxon in aln:
        fixed_seq = replace_gaps_at_start_and_ends(taxon.seq)
        record = SeqRecord(fixed_seq, id=taxon.id, name=taxon.name,
                           description=taxon.description)
        rebuilt.append(record)
    return rebuilt
Пример #20
0
def json_to_Bio_alignment(seq_json):
    """Turn a list of {'strain': ..., 'seq': ...} mappings into an alignment.

    The 'strain' value is used as both the id and the name of each record.
    """
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    aln = MultipleSeqAlignment([])
    for entry in seq_json:
        strain = entry['strain']
        record = SeqRecord(seq=Seq(entry['seq']), id=strain, name=strain)
        aln.append(record)
    return aln
Пример #21
0
def refactor_title_allmsa(msa):
    """
    Reorder the id fields of every record into the histoneDB seed format.

    An id containing "A|<digits>|C" becomes "C|A|<digits>".  Each record's
    description is printed, the records are changed in place, and they are
    returned collected in a new alignment.
    """
    id_pattern = r"(\S+)\|(\d+)\|(\S+)"
    reordered = MultipleSeqAlignment([])
    for record in msa:
        print(record.description)
        first, number, last = re.search(id_pattern, record.id).groups()
        record.id = "|".join((last, first, number))
        reordered.append(record)
    return reordered
Пример #22
0
def aln_undup(alignment):
    """Return an alignment without records that repeat an earlier sequence.

    Records are compared by the seguid checksum of their sequence.  The
    first record with a given checksum is kept; each later one is reported
    with an "Ignoring <id>" line and skipped.
    """
    aln = MultipleSeqAlignment([])
    checksums = set()
    for record in alignment:
        checksum = seguid(record.seq)
        if checksum in checksums:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("Ignoring %s" % record.id)
            continue
        checksums.add(checksum)
        aln.append(record)

    return aln
Пример #23
0
def maskalignment(arg, percent, percentmissing, filetype):
    """Remove alignment columns that hold too many gaps.

    arg is the alignment file, read with AlignIO in format filetype.  A
    column is dropped when its number of '-' characters exceeds
    numRows - percent * numRows / 100.  Three files are written next to
    the input, named from the part of arg before its first dot:
    "_mask_<percentmissing>.txt" (indices of the dropped columns),
    "_masked_<percentmissing>.fas" (the remaining columns as FASTA) and
    "_missingcharacter.txt" (per-column gap and character counts).
    Nothing is returned.
    """
    stem = arg.split('.')[0]
    # with-blocks make sure all three outputs are flushed and closed.
    with open(stem + '_mask_' + str(percentmissing) + '.txt',
              'w+') as maskedcolumn, \
            open(stem + '_masked_' + str(percentmissing) + '.fas',
                 'w+') as outFile, \
            open(stem + '_missingcharacter.txt', 'w+') as checkgap:
        alignment = AlignIO.read(arg, filetype)
        trimAlign = MultipleSeqAlignment([])
        numRows = len(alignment)
        x = float(percent) * float(numRows) / 100.0
        numGap = numRows - float(x)
        numCol = alignment.get_alignment_length()

        print("Total number of rows: %i" % numRows)
        print("Number of gapped sequences allowed at a given site: %i" % numGap)
        print("Total number of columns: %i" % numCol)
        checkgap.write("Total number of rows: \t" + str(numRows) +
                       '\nNumber of gapped sequences allowed at a given site: \t' +
                       str(numGap) + '\n Total number of columns: \t' +
                       str(numCol) + '\n\n cutoff : \t' + str(x) + '\n\n\n')
        checkgap.write("Position \t Missing Characters \t Characters \n")
        colToKeep = []
        for i in range(numCol):
            # Count once per column; the column text is not kept around.
            gaps = alignment[:, i].count('-')
            chapre = int(numRows) - int(gaps)
            checkgap.write(
                str(i) + '\t' + str(gaps) + '\t' + str(chapre) + '\n')
            if gaps > numGap:
                print("get rid of column %i" % i)
                maskedcolumn.write(str(i) + '\n')
            else:
                colToKeep.append(i)

        for record in alignment:
            newseq = "".join(record[i] for i in colToKeep)
            trimAlign.append(SeqRecord(Seq(newseq), id=record.id))
            outFile.write('>' + record.id + '\n' + newseq + '\n')

    print("Total number of columns remaining: %i" %
          trimAlign.get_alignment_length())
Пример #24
0
def mult_align(sum_dict, align_dict):
    """Assemble a MultipleSeqAlignment from per-position alignment entries.

    The keys of the first position's pos_align_dict define the rows.  For
    positions 1..len(align_dict) the .aa value stored under each key is
    appended to that row.  Rows are emitted in sorted key order and named
    sum_dict[key].pdb2 + sum_dict[key].chain2.
    """
    rows = dict.fromkeys(align_dict.abs(1).pos_align_dict, "")

    for position in range(1, len(align_dict) + 1):
        # one alignment column: a residue for every row key
        column = align_dict.abs(position).pos_align_dict
        for key in column:
            rows[key] += column[key].aa

    fssp_align = MultipleSeqAlignment([])
    for key in sorted(rows):
        summary = sum_dict[key]
        fssp_align.append(
            SeqRecord(Seq(rows[key]), summary.pdb2 + summary.chain2))
    return fssp_align
Пример #25
0
def refactor_title(msa, variant):
    """
    refactors titles of sequence in format needed for histoneDB seeds

    The gi number is taken from each record id ("gi|<digits>|").  The
    genus is the first word inside square brackets in the description; if
    that cannot be extracted it is obtained from get_genus_by_gi(gi).
    Each record is changed in place: id becomes "genus|gi|variant" and
    description "genus_variant_gi".  The records are returned collected
    in a new alignment.
    """
    msa_r = MultipleSeqAlignment([])
    for i in msa:
        gi = re.search(r"gi\|(\d+)\|", i.id).group(1)
        try:
            genus = re.search(r"\[(\S+)\s+.+\S+\]", i.description).group(1)
        except (AttributeError, TypeError):
            # No bracketed species in the description (no match), or the
            # description is not text.  A bare except here also swallowed
            # KeyboardInterrupt and real programming errors.
            genus = get_genus_by_gi(gi)

        i.id = genus + "|" + gi + "|" + variant
        i.description = genus + "_" + variant + "_" + gi
        msa_r.append(i)
    return msa_r
Пример #26
0
 def translate(self, align, offset):
     """
     Translate every row of the alignment in the reading frame that
         begins at 'offset'.

     Rows are upper-cased and "-" is replaced by "N" before translation.
     Only the whole codons between offset and the alignment end are used.
     Returns a new alignment of the translated records, which keep the
     original name and id and get an empty description.
     """
     whole_codons = (align.get_alignment_length() - offset) // 3
     end = whole_codons * 3 + offset
     t_align = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
     for rec in align:
         # upper-casing first means any lower-case "n" is already "N"
         nucleotides = str(rec.seq).upper().replace("-", "N")
         dna = Seq(nucleotides, IUPAC.IUPACAmbiguousDNA())
         protein = dna[offset:end].translate()
         t_align.append(
             SeqRecord(protein, name=rec.name, id=rec.id, description=""))
     return t_align
def mult_align(sum_dict, align_dict):
    """Assemble a MultipleSeqAlignment from per-position alignment entries.

    The keys of the first position's pos_align_dict define the rows.  For
    positions 1..len(align_dict) the .aa value stored under each key is
    appended to that row.  Rows are emitted in sorted key order, named
    sum_dict[key].pdb2 + sum_dict[key].chain2, and use a gapped extended
    protein alphabet.
    """
    rows = dict.fromkeys(align_dict.abs(1).pos_align_dict, '')

    for position in range(1, len(align_dict) + 1):
        # one alignment column: a residue for every row key
        column = align_dict.abs(position).pos_align_dict
        for key in column:
            rows[key] += column[key].aa

    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = MultipleSeqAlignment([], alphabet=alpha)
    for key in sorted(rows):
        summary = sum_dict[key]
        record = SeqRecord(Seq(rows[key], alpha),
                           summary.pdb2 + summary.chain2)
        fssp_align.append(record)
    return fssp_align
def stage_two_trimming(s1_trimmed, window_size, max_divergence, min_len):
    """
    ---------------------------------------------------------------------
    MODIFIED FUNCTION FROM PHYLUCE: generic_align.py
    ---------------------------------------------------------------------
    Alignment row-by-row trimming.  After stage one trimming, iterate
    over rows of alignment to find differences between the alignment
    consensus and the row (taxon) of data.  Trim those ends that differ
    from the consensus with > `divergence` across a `window_size` window.
    Goes to third round of filtering to remove edges that end up with only '----'
    characters to start or end alignment block.

    Returns the trimmed MultipleSeqAlignment, or None as soon as one row
    ends up all '-', all '?', or shorter than min_len.
    """
    s2_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    consensus_array = numpy.array(list(alignment_consensus(s1_trimmed)))
    for sequence in s1_trimmed:
        sequence = sequence.upper()
        start, end = get_ends(sequence)
        orig_seq_array = numpy.array(list(sequence))
        seq_array = orig_seq_array[start:end]
        # defaults used when the row has no positions between its ends
        bad_start = 0
        bad_end = len(sequence)
        # True wherever the row disagrees with the consensus
        compare = (seq_array != consensus_array[start:end])
        # Scan from the 5' end for the first window under the limit.
        # NOTE(review): if no window qualifies, bad_start is left at the
        # last index scanned - confirm that is intended.
        for bad_start in range(compare.size):
            window = compare[bad_start:bad_start + window_size]
            divergence = float(sum(window)) / window.size
            if divergence < max_divergence:
                break
        # Same scan from the 3' end, converted back to a forward index.
        # NOTE(review): if no window qualifies, bad_end keeps the raw
        # reversed index - confirm that is intended.
        reversed_compare = compare[::-1]
        for bad_end in range(reversed_compare.size):
            window = reversed_compare[bad_end:bad_end + window_size]
            divergence = float(sum(window)) / window.size
            if divergence < max_divergence:
                bad_end = reversed_compare.size - bad_end
                break
        orig_seq_array[:start + bad_start] = '-'
        orig_seq_array[start + bad_end:] = '-'
        trim = ''.join(orig_seq_array)
        residues = set(trim)
        # Both sides must be sets: the earlier set-vs-list test for '?'
        # could never match.
        if (residues == set(['-']) or residues == set(['?'])
                or len(trim) < min_len):
            return None
        s2_trimmed.append(record_formatter(trim, sequence.id))

    return s2_trimmed
Пример #29
0
def concatenate_msa(out_dir):
    """Join every msa-*.fasta file in out_dir into one PHYLIP supermatrix.

    The taxa, and their order in the output, are taken from msa-0.fasta.
    For each file matching msa-*.fasta the sequence of every taxon is
    appended to that taxon's running sequence; a taxon missing from a
    file raises KeyError.  The result is written to
    out_dir/supermatrix-msa.phy.
    """
    from collections import OrderedDict

    with open(os.path.join(out_dir, 'supermatrix-msa.phy'), 'w') as fh:
        # set the order of the taxa based on the first MSA file; an
        # OrderedDict keeps that order on every Python version
        taxa = OrderedDict()
        for record in SeqIO.parse(os.path.join(out_dir, 'msa-0.fasta'),
                                  'fasta'):
            taxa[record.id] = Seq('', generic_dna)
        # get each MSA file and concatenate it to the supermatrix
        for msa_file in glob.glob(os.path.join(out_dir, 'msa-*.fasta')):
            alignment = SeqIO.to_dict(SeqIO.parse(msa_file, 'fasta'))
            for taxon in taxa:
                taxa[taxon] += alignment[taxon].seq
        # write the supermatrix to the file
        msa = MultipleSeqAlignment([], alphabet=generic_dna)
        for taxon, seq in taxa.items():
            msa.append(SeqRecord(seq, id=taxon))
        AlignIO.write(msa, fh, 'phylip')
Пример #30
0
 def stage_one_trimming(self,
                        alignment,
                        window_size,
                        proportion,
                        threshold,
                        min_len,
                        replace_ends=False):
     """
     First stage (of 3) alignment trimming to find and trim edges of a given
     alignment.  Calls self.running_average to determine the start and end
     trimming positions for the entire alignment block.

     With replace_ends set, each trimmed row is passed through
     self._replace_ends and rebuilt with self._record_formatter before it
     is stored.  Returns the trimmed MultipleSeqAlignment, or None as soon
     as the trim positions are unusable or a trimmed row is all '-', all
     '?', or shorter than min_len.
     """
     # get the trim positions that we determine begin and end "good"
     # alignments
     start, end = self.running_average(alignment, window_size, proportion,
                                       threshold)
     # create a new alignment object to hold our alignment
     s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna,
                                                  "-?"))
     for sequence in alignment:
         # ensure correct sequence alphabet or we'll get a conflict when
         # we try to generate a consensus
         sequence.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
         if not (start >= 0 and end):
             return None
         trim = sequence[start:end]
         residues = set(trim)
         # ensure we don't just add a taxon with only gaps/missing
         # data and that alignments are >= min_len.  Both sides must be
         # sets: the earlier set-vs-list test for '?' could never match.
         if (residues == set(['-']) or residues == set(['?'])
                 or len(trim) < min_len):
             return None
         if replace_ends:
             # replace end gaps with missing data character ?
             # called on third iteration of trimming
             repl = self._replace_ends(str(trim.seq))
             s1_trimmed.append(self._record_formatter(repl, sequence.id))
         else:
             s1_trimmed.append(trim)
     return s1_trimmed