Пример #1
0
def filter_phyloxml(tree, species_list) :
    """Restrict a PhyloXML tree to the leaves belonging to species_list.

    A terminal node is kept when one of its properties has the reference
    'Compara:genome_db_name' and a value contained in species_list.  The
    single sequence of every kept leaf is converted to a SeqRecord (id set
    to the node name, description blanked) and collected in an alignment.
    All other leaves are pruned from tree in place.

    Returns a tuple: (tuple of matched species names, the alignment after
    being passed through remove_gap_columns, the pruned tree).
    """
    kept_species = set()
    alignment = MultipleSeqAlignment([])

    for leaf in tree.get_terminals() :
        # first property that names one of the requested species, if any
        matched = None
        for prop in leaf.properties :
            if prop.ref == 'Compara:genome_db_name' and prop.value in species_list :
                matched = prop
                break

        if matched is None :
            tree.prune(leaf)
            continue

        kept_species.add(matched.value)
        assert len(leaf.sequences) == 1
        record = leaf.sequences[0].to_seqrecord()
        record.id = leaf.name
        record.description = ""
        alignment.append(record)

    return tuple(kept_species), remove_gap_columns(alignment), tree
Пример #2
0
 def stage_one_trimming(self, alignment, window_size, threshold, proportion):
     """
     First stage (of 3) alignment trimming to find and trim edges of a given
     alignment.  Calls self.running_average to determine reasonable
     alignment start and end trimming for the entire alignment block.

     Returns the trimmed MultipleSeqAlignment, or None when no usable
     start/end was found or when any trimmed row consists solely of gap
     ('-') or solely of missing-data ('?') characters.
     """
     # get the trim positions that we determine begin and end "good"
     # alignments
     start, end = self.running_average(alignment, window_size, threshold, proportion)
     # create a new alignment object to hold our alignment
     s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-"))
     for sequence in alignment:
         if not (start >= 0 and end):
             return None
         trim = sequence[start:end]
         # ensure we don't just add a taxon with only gaps/missing data.
         # Both sides of each comparison must be sets: comparing a set with
         # a list is never equal, which used to let all-'?' rows through.
         residues = set(trim)
         if residues == set(['-']) or residues == set(['?']):
             return None
         s1_trimmed.append(trim)
     return s1_trimmed
Пример #3
0
def clean_seqs(gene):
    '''Return a new alignment holding only the records of gene whose
    N and - characters together make up less than 10% of the sequence.'''
    kept = MultipleSeqAlignment([])
    for record in gene:
        sequence = record.seq
        unusable = sequence.count("N") + sequence.count("-")
        if unusable >= 0.1 * len(sequence):
            continue
        kept.append(record)
    return kept
Пример #4
0
	def maskalignment(self,arg, percent,filetype):
		"""Write a copy of a Guidance alignment with gap-rich columns removed.

		The alignment file `arg` (format `filetype`) is read from
		<PathtoOutput>/Guidance/GuidanceOutput/ and a FASTA file named
		<first 10 characters of arg>_masked_<percent>.fas is written to the
		same directory.  A column is dropped when it contains more than
		numRows - int(percent * numRows / 100) gap ('-') characters; each
		dropped column index is printed.

		Parameters:
		- arg: file name of the alignment inside the GuidanceOutput directory.
		- percent: value convertible to float, used in the threshold above.
		- filetype: alignment format name handed to AlignIO.read.
		"""
		name = arg[0:10]
		guidanceDir = self.PathtoOutput + '/Guidance/GuidanceOutput/'
		maskFileName = guidanceDir + name + '_masked_' + str(percent) + '.fas'
		# The context manager guarantees the output file is flushed and
		# closed, even when reading or filtering the alignment fails.
		with open(maskFileName,'w') as outFile:
			alignment = AlignIO.read(guidanceDir + arg, filetype)
			trimAlign = MultipleSeqAlignment([])
			numRows = len(alignment)
			numGap = numRows - int(float(percent) * float(numRows) / 100.0)
			numCol = alignment.get_alignment_length()

			colToKeep = []
			for i in range(numCol):
				# alignment[:,i] is the i-th column as a string
				if alignment[:,i].count('-') > numGap:
					print("get rid of column %i" % i)
				else:
					colToKeep.append(i)

			for record in alignment:
				newseq = "".join(record[i] for i in colToKeep)
				# NOTE(review): trimAlign is filled but never read in the
				# visible body; kept in case later code relies on it.
				trimAlign.append(SeqRecord(Seq(newseq), id=record.id))
				outFile.write('>' + record.id + '\n' + newseq + '\n')
Пример #5
0
def writing(seqs,seq_descs,seq_ids, filename): #Arguments are sequence, description, ids, filename
    """Write the given protein sequences as a FASTA alignment.

    The output directory comes from sys.argv[3] and the sub-folder name from
    sys.argv[1]; each is created when it does not exist yet.  The alignment
    is written to <outdir>/<sys.argv[1]>/<filename>.output.

    Parameters:
    - seqs: sequence strings, one per record.
    - seq_descs: record descriptions, indexed like seqs.
    - seq_ids: record ids, indexed like seqs.
    - filename: base name of the output file (".output" is appended).

    Raises OSError when a missing directory cannot be created.
    """
    outdir = sys.argv[3]                    #Output directory
    if os.path.isdir(outdir):               #Checks the presence of directory
        print("Directory exists. New directory not created")
    else:
        # os.mkdir instead of a shell "mkdir" string: the path comes from
        # the command line and must not be interpreted by a shell
        os.mkdir(outdir)

    #outpath defines path of the subfolder we want to store results in
    outpath = outdir + '/' + sys.argv[1]
    if not os.path.isdir(outpath):
        os.mkdir(outpath)

    #write the result to output
    output_file = outpath + '/' + filename + '.' + 'output'
    align = MultipleSeqAlignment([])
    for i, seq in enumerate(seqs):
        align.append(SeqRecord(Seq(seq,generic_protein),id=seq_ids[i],description=seq_descs[i]))

    AlignIO.write(align, output_file ,"fasta")
Пример #6
0
    def _concatenate(self, alignments):
        """Return single alignment from list of alignments for multiple genes.

        A list holding one alignment returns that alignment unchanged.
        Otherwise every record ID found in any gene gets one row: its
        sequences are joined in gene order, and a gene lacking the ID
        contributes a run of '-' as long as that gene's alignment.  Rows are
        emitted in sorted ID order so the result is reproducible.
        """
        if len(alignments) == 1:
            return alignments[0]
        # record IDs of each gene, in row order
        alignment_ids = [[rec.id for rec in gene] for gene in alignments]
        # sort IDs (a bare set would give an arbitrary row order)
        all_ids = sorted(set(txid for gene_ids in alignment_ids
                             for txid in gene_ids))
        # concatenate
        alignment = MultipleSeqAlignment([])
        for txid in all_ids:
            sequence = ""
            for gene, gene_ids in zip(alignments, alignment_ids):
                if txid in gene_ids:
                    sequence += gene[gene_ids.index(txid)].seq
                else:
                    sequence += "-" * gene.get_alignment_length()
            sequence = SeqRecord(sequence, id=txid,
                                 description="multigene sequence")
            alignment.append(sequence)
        return alignment
def main():
    """Rewrite nexus alignments with the file-name prefix removed from taxon names.

    For every input file, each record name has a leading "<file stem>" plus
    any following underscores stripped; the result becomes the record id and
    name.  The renamed records are written as nexus to args.output under the
    original file name.  After each file the number of distinct taxon names
    seen so far must equal args.taxa.
    """
    args = get_args()
    # iterate through all the files to determine the longest alignment
    files = get_files(args.input)
    all_taxa = set([])
    for count, f in enumerate(files):
        new_align = MultipleSeqAlignment([], generic_dna)
        # The prefix depends only on the file, so build the pattern once per
        # file.  re.escape keeps regex metacharacters in the file name
        # (".", "+", ...) from being interpreted as pattern syntax.
        fname = os.path.splitext(os.path.basename(f))[0]
        prefix = re.compile("^{}_*".format(re.escape(fname)))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                new_seq_name = prefix.sub("", seq.name)
                all_taxa.add(new_seq_name)
                seq.id = new_seq_name
                seq.name = new_seq_name
                new_align.append(seq)
        assert len(all_taxa) == args.taxa, "Taxon names are not identical"
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            # close the output handle deterministically
            with open(outf, 'w') as handle:
                AlignIO.write(new_align, handle, 'nexus')
        except ValueError:
            # NOTE(review): drops into the debugger on a write failure;
            # left as in the original, confirm this is still wanted.
            pdb.set_trace()
        print(count)
    print("Taxon names in alignments: {0}".format(','.join(list(all_taxa))))
Пример #8
0
 def _prepResults(self):
     """Queue alignment objects for the hits collected for the current query.

     With no hits and the 'eof' state flag unset, one empty
     MultipleSeqAlignment is appended to self.outList.  For every hit and
     every entry of its hit record, a two-row HMMERAlign (query, target) is
     appended, annotated with that entry plus the query and HMM names.
     """
     no_hits = len(self.hitInfo) == 0
     if no_hits and not self.state['eof']:
         self.outList.append(MultipleSeqAlignment([], self.alphabet))

     for hit_name in self.hitInfo:
         for domain in self.hitRecord[hit_name]:
             aligned = self.alignMap[hit_name][domain]
             query_text = "".join(aligned['query'])
             target_text = "".join(aligned['target'])
             query_name = self.state['queryName']

             query_record = SeqRecord(
                 Seq(query_text, self.alphabet),
                 id=query_name,
                 description=self.state.get('desc', ""),
                 annotations={})
             target_record = SeqRecord(
                 Seq(target_text, self.alphabet),
                 id=hit_name,
                 annotations={})

             pair = HMMERAlign([query_record, target_record], self.alphabet)
             # the hit record entry itself becomes the annotation mapping
             # and receives the two name keys
             pair._annotations = self.hitRecord[hit_name][domain]
             pair._annotations['seqName'] = query_name
             pair._annotations['hmmName'] = hit_name

             self.outList.append(pair)
Пример #9
0
def pad_nucleotide_sequences(aln_aa, seq_nuc):
    '''
    Thread unaligned nucleotide sequences onto an amino acid alignment.

    Every gap in an aligned amino acid sequence becomes a gap of 3 (---) in
    the nucleotide sequence; every other position takes the next three
    nucleotides.

    Parameters:
    - aln_aa: amino acid alignment
    - seq_nuc: unaligned nucleotide sequences, looked up by record id.

    Returns:
    - aligned nucleotide sequences with all gaps length 3; records whose id
      is missing from seq_nuc are reported on stdout and skipped
    '''
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq

    padded = MultipleSeqAlignment([])
    for aa_record in aln_aa:
        try:
            nucleotides = str(seq_nuc[aa_record.id].seq)
        except KeyError:
            print(aa_record.id)
            print('Key not found, continue with next sequence')
            continue

        pieces = []
        cursor = 0  # position of the next unused nucleotide
        for residue in aa_record:
            if residue == '-':
                pieces.append('---')
                continue
            pieces.append(nucleotides[cursor:cursor + 3])
            cursor += 3

        padded.append(SeqRecord(seq=Seq(''.join(pieces)), id=aa_record.id))

    return padded
Пример #10
0
def maskResiduesNOMAP(refMSA_file, numseq, alnlen, scores, x, formatout, final_file, seqType):
    ''' Masks poorly aligned residues whose score is <x. Will NOT mask gaps.

    Reads the FASTA alignment refMSA_file and, for the first numseq rows and
    alnlen columns, replaces every non-gap residue whose entry in
    scores[row][position] is below x with '?'.  The masked alignment is
    written to final_file in the format named by formatout.  Records are
    renumbered: their ids become "1", "2", ... in row order.

    seqType selects the sequence alphabet and must be 'protein' or 'dna';
    any other value raises ValueError.
    '''
    new = '?'
    parsed = AlignIO.read(refMSA_file, 'fasta')
    maskedMSA = MultipleSeqAlignment([])
    for row in range(numseq):
        if str(seqType) == 'protein':
            alphabet = generic_protein
        elif str(seqType) == 'dna':
            alphabet = generic_dna
        else:
            # previously this fell through and failed on an unbound variable
            raise ValueError("seqType must be 'protein' or 'dna', got %r" % (seqType,))

        residues = parsed[row].seq
        masked = []
        for position in range(alnlen):
            thispos = str(residues[position])
            # gaps are always kept; residues are masked only below threshold
            if thispos != '-' and float(scores[row][position]) < float(x):
                masked.append(new)
            else:
                masked.append(thispos)
        maskedMSA.append(SeqRecord(Seq(''.join(masked), alphabet), id=str(row + 1), description=''))

    # format first so that a bad format name does not truncate final_file
    text = maskedMSA.format(str(formatout))
    with open(final_file, 'w') as outhandle:
        outhandle.write(text)
Пример #11
0
def NexusIterator(handle, seq_count=None):
    """Yield the alignment of a Nexus file as one MultipleSeqAlignment.

    This uses the Bio.Nexus module to do the hard work.

    You are expected to call this function via Bio.SeqIO or Bio.AlignIO
    (and not use it directly).

    If seq_count is given and differs from the number of taxa found, a
    ValueError is raised.  A file without a matrix yields nothing.

    NOTE - We only expect ONE alignment matrix per Nexus file,
    meaning this iterator will only yield one MultipleSeqAlignment.
    """
    n = Nexus.Nexus(handle)
    if not n.matrix:
        # No alignment found.  End the generator with a plain return:
        # raising StopIteration inside a generator body is turned into a
        # RuntimeError under PEP 479.
        return
    # records first, alphabet second -- passing the alphabet as the only
    # positional argument relies on a deprecated constructor form
    alignment = MultipleSeqAlignment([], n.alphabet)

    #Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
    #The original names and the modified names are kept in these two lists:
    assert len(n.unaltered_taxlabels) == len(n.taxlabels)

    if seq_count and seq_count != len(n.unaltered_taxlabels):
        raise ValueError("Found %i sequences, but seq_count=%i" \
               % (len(n.unaltered_taxlabels), seq_count))

    for old_name, new_name in zip (n.unaltered_taxlabels, n.taxlabels):
        assert new_name.startswith(old_name)
        seq = n.matrix[new_name] #already a Seq object with the alphabet set
        #ToDo - Can we extract any annotation too?
        alignment.append(SeqRecord(seq, id=new_name, name=old_name,
                                   description=""))
    #All done
    yield alignment
Пример #12
0
def main():
    """Filter every input alignment into blocks and write the results.

    For each file name in params['filenames'] the alignment is filtered with
    filterBlocks, summarised with calculateMetadata and written with
    writeAlign.  With params['H'] set, an extra alignment made of a
    "Valid blocks" row, a "Score" row and all filtered records is turned
    into the initial JSON passed to writeAlign.  With params['debug'] set,
    the alignment is also printed through printAlign.
    """
    params = parseArguments(version)

    for filename in params['filenames']:
        alignment, info = filterBlocks(filename, params)
        metadata = calculateMetadata(alignment, info)

        initialJson = ''
        if params['H']:
            # Generate JSON for HTML output, add the "valid blocks" sequence to the initial one.
            validSeq = Seq(metadata['validString'])
            validSeqRecord = SeqRecord(seq = validSeq, id = 'Valid blocks', name = 'Valid Blocks')
            scoreSeq = Seq(metadata['scoreString'])
            scoreSeqRecord = SeqRecord(seq = scoreSeq, id = 'Score', name = 'Heterozygosity Score')
            jsonAlignment = MultipleSeqAlignment([validSeqRecord, scoreSeqRecord])
            for record in alignment:
                jsonAlignment.append(record)
            initialJson = getInitialJson(jsonAlignment)

        if params['debug']:
            printAlign(alignment, info)

        # Output name: base name minus its last extension, plus "-out".
        # A base name without any "." is kept whole; it used to collapse to
        # the bare "-out", making such inputs overwrite each other.
        basename = filename.split('/')[-1]
        stem, dot, _ext = basename.rpartition('.')
        outfile = (stem if dot else basename) + '-out'

        writeAlign(alignment, metadata, outfile, initialJson, info, params)
Пример #13
0
 def stage_one_trimming(self, alignment, window_size, proportion, threshold, min_len, replace_ends=False):
     """
     First stage (of 3) alignment trimming to find and trim edges of a given
     alignment.  Calls self.running_average to determine reasonable
     alignment start and end trimming for the entire alignment block.

     With replace_ends set, each trimmed row is passed through
     self._replace_ends and rebuilt with self._record_formatter before it
     is stored.

     Returns the trimmed MultipleSeqAlignment, or None when no usable
     start/end was found or when any trimmed row is shorter than min_len or
     consists solely of gap ('-') or solely of missing-data ('?') characters.
     Note that the alphabet of every visited input sequence is reset in place.
     """
     # get the trim positions that we determine begin and end "good"
     # alignments
     start, end = self.running_average(alignment, window_size, proportion, threshold)
     # create a new alignment object to hold our alignment
     s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
     for sequence in alignment:
         # ensure correct sequence alphabet or we'll get a conflict when
         # we try to generate a consensus
         sequence.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
         if not (start >= 0 and end):
             return None
         trim = sequence[start:end]
         # ensure we don't just add a taxon with only gaps/missing data and
         # that alignments are >= min_len.  Both sides of each comparison
         # must be sets: a set never equals a list, which used to let
         # all-'?' rows through.
         residues = set(trim)
         if residues == set(['-']) or residues == set(['?']) or len(trim) < min_len:
             return None
         if replace_ends:
             # replace end gaps with missing data character ?
             # called on third iteration of trimming
             repl = self._replace_ends(str(trim.seq))
             s1_trimmed.append(self._record_formatter(repl, sequence.id))
         else:
             s1_trimmed.append(trim)
     return s1_trimmed
def add_gaps_to_align(aln, organisms, check_missing, missing, verbatim=False, min_taxa=3):
    """Return a copy of aln padded with all-'?' rows for absent organisms.

    Parameters:
    - aln: input alignment; record names are rewritten in place.
    - organisms: names expected in the output; every renamed record must be
      in this list (list.remove raises ValueError otherwise).
    - check_missing, missing: when both are truthy, each absent organism is
      asserted to list the locus in missing[org] or missing[org + '*'].
    - verbatim: when false, the part of the record name after the first
      underscore is the organism and the part before it the locus; when
      true, the lower-cased record name is the organism.
    - min_taxa: alignments with fewer rows return None.

    Returns a new MultipleSeqAlignment, or None for too few taxa.
    """
    if len(aln) < min_taxa:
        return None
    local_organisms = copy.deepcopy(organisms)
    new_align = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    overall_length = len(aln[0])
    for seq in aln:
        # strip the reversal marker added by mafft.  Only the exact "_R_"
        # prefix is removed: str.lstrip('_R_') strips any leading run of
        # '_' and 'R' characters and would eat into names starting with R.
        if seq.name.startswith('_R_'):
            seq.name = seq.name[len('_R_'):]
        if not verbatim:
            new_seq_name = '_'.join(seq.name.split('_')[1:])
        else:
            new_seq_name = seq.name.lower()
        new_align.append(record_formatter(str(seq.seq), new_seq_name))
        local_organisms.remove(new_seq_name)

    # the locus label is taken from the last record of the alignment
    if not verbatim:
        loc = seq.name.split('_')[0]
    else:
        loc = seq.name

    missing_string = '?' * overall_length
    for org in local_organisms:
        if check_missing and missing:
            try:
                assert loc in missing[org], "Locus missing"
            except (AssertionError, KeyError):
                # fall back to the starred form of the organism name
                assert loc in missing['{}*'.format(org)], "Locus missing"
        new_align.append(record_formatter(missing_string, org))
    return new_align
Пример #15
0
def propmat(alignment, num_imp, num_changes, transitions, probs):
	"""Apply random substitutions to the trailing rows of an alignment.

	The last num_imp records of alignment are the candidates for change.
	num_changes (row, column) targets are drawn: the row with
	random.randint over those records, the column with wl_one(probs).
	For each target the current symbol is replaced by
	weightselect(transitions[old]), and copies of alignment.pd and
	alignment.distarray are updated to match.

	Returns a tuple (number of targets processed, new MultipleSeqAlignment
	carrying the updated pd and distarray attributes, list of targets).

	NOTE(review): pd looks like a pairwise difference-count matrix and
	distarray a per-residue character array -- confirm against the code
	that sets these attributes.
	"""
	num_changes = int(num_changes)
	# number of leading records that are never modified
	orlen = len(alignment)-num_imp
	record = 0
	# work on copies so the attributes of the input alignment stay untouched
	newpd = copy(alignment.pd)
	newdistarray = copy(alignment.distarray)
	origlist = [i for i in alignment[:orlen]]
	implist = [i for i in alignment[orlen:]]
#	targets = [(random.randint(0,len(implist)-1), random.randint(0,len(alignment[0])-1)) for i in xrange(num_changes)]
	# each target is (index into implist, column chosen by wl_one)
	targets = [(random.randint(0,len(implist)-1), wl_one(probs)) for i in xrange(num_changes)]
	for t in targets:
		old = newdistarray[orlen+t[0],t[1]]
		new = weightselect(transitions[old])
#		new = random.choice(AAS)
		newdistarray[orlen+t[0],t[1]] = new
		# evaluated after the write above: +1 for rows whose symbol in this
		# column equals the old one, -1 for rows equal to the new one
		changes = (newdistarray[:,t[1]]==old).astype(int)-(newdistarray[:,t[1]]==new).astype(int)
#		pdb.set_trace()
		# apply the same delta to the row and the column of the changed record
		newpd[orlen+t[0]]+=changes
		newpd[:,orlen+t[0]]+=changes
		record += 1
#	newpd = np.tril(newpd,-1)
#	newpd += newpd.transpose()
	np.fill_diagonal(newpd,0)
	# distinct implist indices that were targeted at least once
	inds = Counter([t[0] for t in targets]).keys()
	for ind in inds:
		seq = implist[ind]
		# rebuild the record from its updated row, keeping the metadata
		implist[ind] = SeqRecord(Seq(''.join(newdistarray[ind+orlen])), id=seq.id, name=seq.name, description=seq.description, annotations=seq.annotations)
	newalign = MultipleSeqAlignment(origlist+implist)
	newalign.pd, newalign.distarray = newpd, newdistarray
	return record, newalign, targets
def replace_gaps(aln):
    """Return a new DNA alignment in which every sequence of aln has been
    passed through replace_gaps_at_start_and_ends; id, name and description
    of each record are carried over."""
    rebuilt = MultipleSeqAlignment([], generic_dna)
    for record in aln:
        cleaned = replace_gaps_at_start_and_ends(record.seq)
        replacement = SeqRecord(
            cleaned,
            id=record.id,
            name=record.name,
            description=record.description,
        )
        rebuilt.append(replacement)
    return rebuilt
Пример #17
0
    def to_generic(self, alphabet):
        """Retrieve generic alignment object for the given alignment.

        Instead of the tuples, this returns a MultipleSeqAlignment object
        from Bio.Align, through which you can manipulate and query
        the object.

        alphabet is the specified alphabet for the sequences in the code (for
        example IUPAC.IUPACProtein).

        The (name, start, seq, end) tuples of self.alignment are grouped
        into blocks, each opened by a 'QUERY' entry.  The first block
        defines the rows; the fragments of later blocks are appended to
        those rows in order.

        Thanks to James Casbon for the code.
        """
        names = []
        chunks = []
        block_count = 0
        row = 0
        for name, _start, fragment, _end in self.alignment:
            if name == 'QUERY':  # QUERY is the first in each alignment block
                block_count += 1
                row = 0

            if block_count != 1:
                # later blocks extend the rows created by the first block
                chunks[row] += fragment
                row += 1
                continue
            names.append(name)
            chunks.append(fragment)

        generic = MultipleSeqAlignment([], alphabet)
        for name, joined in zip(names, chunks):
            generic.append(SeqRecord(Seq(joined, alphabet), name))

        return generic
Пример #18
0
def json_to_Bio_alignment(seq_json):
    """Build a MultipleSeqAlignment from an iterable of mappings.

    Each entry supplies the record name and id through its 'strain' key and
    the sequence text through its 'seq' key.
    """
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq

    alignment = MultipleSeqAlignment([])
    for entry in seq_json:
        record = SeqRecord(
            name=entry['strain'],
            id=entry['strain'],
            seq=Seq(entry['seq']),
        )
        alignment.append(record)
    return alignment
Пример #19
0
def gap_span(reads, bases):
    """
    Returns a MSA with rows=reads and columns=bases, composed of gaps only.

    Each element of reads becomes the id of one row; every row is a run of
    `bases` gap characters using the module-level alphabet.
    """
    # records first, alphabet second -- passing the alphabet as the only
    # positional argument relies on a deprecated constructor form
    spal = MultipleSeqAlignment([], alphabet)
    span = "-" * bases
    for r in reads:
        spal.append(Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(span, alphabet), id=r))
    return spal
Пример #20
0
 def stage_two_trimming(self, s1_trimmed, window_size=5):
     """
     Alignment row-by-row trimming.  After stage one trimming, iterate
     over rows of alignment to find differences between the alignment
     consensus and the row of data.  Trim those ends coming before
     (or after at 3' end) a block of 5 contiguous highly conserved
     positions.  Goes to third round of filtering to remove edges that
     end up with only '----' characters to start or end alignment block.

     A row in which no such conserved block exists is masked completely.
     Returns the trimmed MultipleSeqAlignment, or None as soon as a
     trimmed row consists solely of '-' or solely of '?' characters.
     """
     # create new alignment object to hold trimmed alignment
     s2_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
     # get consensus of alignment in array form
     consensus_array = numpy.array(list(self._alignment_consensus(s1_trimmed)))
     # uniform weights of the running average; identical for every row
     weight = numpy.repeat(1.0, window_size) / window_size
     # iterate over each alignment sequence
     for sequence in s1_trimmed:
         start, end = self._get_ends(sequence)
         # convert sequence to array
         orig_seq_array = numpy.array(list(sequence))
         # trim down edge gaps so they do not exert undue influence
         # on the running average
         seq_array = orig_seq_array[start:end]
         compare = (seq_array == consensus_array[start:end])
         # compute running average across window size
         running_average = numpy.convolve(compare, weight, 'same')
         # get first 5' and 3' positions where quality > 1 over
         # 5 positions ([True, True, True, True, True]). This helps
         # us find the ends of the alignment where there are likely
         # problems)
         gm = (running_average > 0.99)
         # Reset the trim points for every row.  These defaults mask the
         # whole row when no conserved block is found; previously the
         # names were unbound for the first row or silently reused the
         # values computed for the preceding row.
         bad_start = gm.size
         bad_end = 0
         for i in range(gm.size):
             # get 5 value slices
             if numpy.all(gm[i:i + 5]):
                 bad_start = i
                 break
         reversed_gm = gm[::-1]
         for i in range(reversed_gm.size):
             # get 5 value slices
             if numpy.all(reversed_gm[i:i + 5]):
                 bad_end = reversed_gm.size - i
                 break
         orig_seq_array[:start + bad_start] = '-'
         orig_seq_array[start + bad_end:] = '-'
         trim = ''.join(orig_seq_array)
         # reject the alignment when a row holds only gaps or only
         # missing data; both sides of each comparison must be sets (a set
         # never equals a list, which used to disable the '?' check)
         residues = set(trim)
         if residues == set(['-']) or residues == set(['?']):
             return None
         s2_trimmed.append(self._record_formatter(trim, sequence.id))
     return s2_trimmed
Пример #21
0
    def __init__(self, records="", name=None, alphabet=default_codon_alphabet):
        """Initialize the alignment and check that it consists of whole codons.

        records and alphabet are forwarded to MultipleSeqAlignment.__init__;
        name is accepted but not used here.

        Raises TypeError when a record's sequence is not a CodonSeq and
        ValueError when the alignment length is not a multiple of three.
        """
        MultipleSeqAlignment.__init__(self, records, alphabet=alphabet)

        # check the type of the alignment to be nucleotide
        for rec in self:
            if not isinstance(rec.seq, CodonSeq):
                raise TypeError("CodonSeq objects are expected in each " "SeqRecord in CodonAlignment")

        # an explicit exception instead of assert: assertions are removed
        # when Python runs with -O, which would silently skip this check
        if self.get_alignment_length() % 3 != 0:
            raise ValueError("Alignment length is not a multiple of "
                             "three (i.e. a whole number of codons)")
Пример #22
0
    def test_proteins(self):
        alpha = HasStopCodon(Gapped(generic_protein, "-"), "*")
        a = MultipleSeqAlignment([
                SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-", alpha), id="ID001"),
                SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*", alpha), id="ID002"),
                SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*", alpha), id="ID003")])
        self.assertEqual(32, a.get_alignment_length())

        s = SummaryInfo(a)

        c = s.dumb_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")

        c = s.gap_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")

        m = s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
        self.assertEqual(str(m), """    A   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   W   Y
M  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
H  0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0
X  2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
F  0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
L  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
K  0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
R  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
P  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0
E  0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
N  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
W  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
""")

        ic = s.information_content(chars_to_ignore=['-', '*'])
        self.assertAlmostEqual(ic, 133.061475107, places=6)
Пример #23
0
def bam2Alignment(sam_name, chrom=None, start=None, stop=None, minlen=1):
    """
    Read alignment from samfile and return Alignment object.

    Reads are fetched with sam_name.fetch(chrom, start, stop).  A read is
    converted with getSeqRecord and added when both
    read.rlen - start + read.pos + 1 > minlen and
    stop - read.pos + 1 >= minlen hold.

    NOTE(review): start and stop default to None, but the filter does
    arithmetic on them, so both must be given whenever fetch returns reads.
    """
    it = sam_name.fetch(chrom, start, stop)
    # records first, alphabet second -- passing the alphabet as the only
    # positional argument relies on a deprecated constructor form
    aln = MultipleSeqAlignment([], alphabet)
    for read in it:
        if read.rlen - start + read.pos + 1 > minlen and stop - read.pos + 1 >= minlen:
            aln.append(getSeqRecord(read, start=start, stop=stop))

    return aln
Пример #24
0
def refactor_title_allmsa(msa):
    """
    Rewrite the id of every record from "first|digits|last" to
    "last|first|digits" (the title format needed for histoneDB seeds).

    Each record's description is printed, the record is modified in place
    and then added to the new alignment that is returned.
    """
    refactored = MultipleSeqAlignment([])
    id_pattern = re.compile(r"(\S+)\|(\d+)\|(\S+)")
    for record in msa:
        print(record.description)
        match = id_pattern.search(record.id)
        # third field moves to the front, the other two keep their order
        record.id = "|".join(match.group(3, 1, 2))
        refactored.append(record)
    return refactored
Пример #25
0
    def __init__(self, records='', name=None, alphabet=default_codon_alphabet):
        """Build the alignment and verify that it is made of whole codons.

        records and alphabet are handed to MultipleSeqAlignment.__init__;
        name is accepted but not used here.  Raises TypeError when a
        record's sequence is not a CodonSeq and ValueError when the
        alignment length is not divisible by three.
        """
        MultipleSeqAlignment.__init__(self, records, alphabet=alphabet)

        # every row must hold codon sequence data
        if any(not isinstance(row.seq, CodonSeq) for row in self):
            raise TypeError(
                "CodonSeq objects are expected in each SeqRecord in CodonAlignment"
            )

        leftover = self.get_alignment_length() % 3
        if leftover != 0:
            raise ValueError(
                "Alignment length is not a multiple of three "
                "(i.e. a whole number of codons)"
            )
Пример #26
0
def aln_undup(alignment):
    """Return a new alignment with only the first record of each distinct sequence.

    Sequences are compared through their seguid checksum; the id of every
    dropped record is printed.
    """
    unique = MultipleSeqAlignment([])
    seen = set()
    for record in alignment:
        digest = seguid(record.seq)
        if digest not in seen:
            seen.add(digest)
            unique.append(record)
        else:
            print("Ignoring %s" % record.id)

    return unique
Пример #27
0
def remove_tribolium(fname) :
    """Return the FASTA alignment in fname without tribolium_castaneum records.

    The species of a record is the value of the second whitespace-separated
    "key=value" field of its description.
    """
    # close the input file as soon as the alignment has been parsed
    with open(fname) as handle :
        msa = AlignIO.read(handle, "fasta")

    newmsa = MultipleSeqAlignment([])
    for record in msa :
        header = record.description.split()
        species = header[1].split('=')[1]
        if species != 'tribolium_castaneum' :
            newmsa.append(record)

    return newmsa
Пример #28
0
    def roundTwo(self):
        """Run the leave-one-out placement round after roundOne.

        For every query file in self.query_name and every sequence in it,
        the sequence id up to its first underscore names the reference
        sequence to leave out.  When testing/alignment<minus-id>.phy is
        missing, testing/alignment.phy is rewritten without that sequence,
        converted to FASTA and used to build a RAxML reference tree.  The
        query file is then added with mafft and placed with pplacer; the
        names of the newly produced .jplace files are collected in
        self.jfileMinus.

        Returns the tuple (self.jfile, self.jfileMinus).
        """
        self.roundOne()
        self.jfileMinus = []
        pattern = re.compile(r'_')
        for j in self.query_name:
            clust_id = pattern.split(j)[2]
            for query in SeqIO.parse(j, 'fasta'):
                seq = pattern.split(query.id)[0]
                #Create special identifier for each round of files
                number = 'minus%s' %seq
                id_jfile = '%s_minus%s' %(clust_id,seq)

                rax_name = 'reversatest%s' % number
                fasta_name = 'testing/align%s.fasta' % number
                phy_name = 'testing/alignment%s.phy' % number

                if not os.path.isfile(phy_name):
                    edited = MultipleSeqAlignment([])
                    # read inside a with-block so the handle is closed on
                    # every pass instead of leaking one per sequence
                    with open('testing/alignment.phy') as openPhy:
                        record = AlignIO.read(openPhy, 'phylip')
                    for i in record:
                        if i.id != seq:
                            edited.append(i)

                    #write the alignment minus a sequence
                    with open(phy_name , 'w') as out:
                        AlignIO.write(edited, out, 'phylip')
                    #convert PHYLIP to FASTA format
                    SeqIO.convert(phy_name, 'phylip', fasta_name, 'fasta', )

                    #Create reference tree
                    raxml_line = RaxmlCommandline(sequences=phy_name, model='GTRGAMMA', name=rax_name, working_dir=self.cwPath)
                    raxml_line()

                #Add query sequences to the previous alignment
                multiali_name = 'testing/multiple_ali%s.fasta' %id_jfile

                # NOTE(review): this tests a .phy name built from id_jfile,
                # which nothing in this method creates -- confirm the
                # intended file.
                if not os.path.isfile('testing/alignment%s.phy' %id_jfile):
                    os.system('mafft --add %s --quiet --reorder %s >%s'% (j, fasta_name, multiali_name))

                    jason_name = 'multiple_ali%s.jplace' %id_jfile

                    #wrap pplacer
                    if not os.path.isfile('pplacer/%s' %jason_name):
                        self.jfileMinus.append(jason_name)
                        os.system('pplacer --out-dir pplacer  -p -t testing/RAxML_result.%s -s testing/RAxML_info.%s %s' % (rax_name, rax_name, multiali_name))
        print(self.jfile)
        print(self.jfileMinus)

        return self.jfile, self.jfileMinus
Пример #29
0
def split_msa(fname):
    """Split a FASTA alignment into consecutive blocks at Tribolium records.

    The file is read as a single FASTA alignment.  Records are walked in file
    order; every ``tribolium_castaneum`` record after the first one closes the
    current block and opens a new one (the record itself becomes the first
    entry of the new block).

    The species of a record is taken from the second whitespace-separated
    token of its description, which must look like ``key=species``.

    :param fname: path of the FASTA alignment to read.
    :return: list of ``MultipleSeqAlignment`` objects, in file order.
    """
    # AlignIO.read() returns a fully parsed alignment, so the handle can be
    # closed as soon as it returns.
    with open(fname) as handle:
        msa = AlignIO.read(handle, "fasta")

    msalist = []
    newmsa = MultipleSeqAlignment([])
    seen_tribolium = False

    for record in msa:
        header = record.description.split()
        species = header[1].split('=')[1]

        if species == 'tribolium_castaneum':
            if seen_tribolium:
                # A repeated Tribolium record marks the start of a new block.
                msalist.append(newmsa)
                newmsa = MultipleSeqAlignment([])
            seen_tribolium = True

        newmsa.append(record)

    if newmsa:
        msalist.append(newmsa)

    return msalist
Пример #30
0
 def split_alignment(clc, alignment, genelimit):
     """Cut an alignment into one column block per gene.

     *alignment* may be an alignment object or a dict whose values are wrapped
     in ``MSA`` first.  *genelimit* is an iterable of ``(gene, start, end)``
     triples; each gene receives the columns ``start:end``.

     Returns a dict mapping gene name to its sub-alignment.  Raises
     ``ValueError`` when the block widths do not add up to the full
     alignment length.
     """
     if isinstance(alignment, dict):
         alignment = MSA(alignment.values())

     # Columns not yet accounted for by any gene block.
     remaining = alignment.get_alignment_length()
     sequences = {}
     for gene, start, end in genelimit:
         block = alignment[:, start:end]
         sequences[gene] = block
         remaining -= block.get_alignment_length()

     if remaining:
         raise ValueError("Could not split alignment, wrong gene delimiter")
     return sequences
Пример #31
0
    def get_spliced(self, starts, ends, strand=1):
        """Return a multiple alignment of the exact sequence range provided.

        Accepts two lists of start and end positions on target_seqname, representing
        exons to be spliced in silico.  Returns a *MultipleSeqAlignment* of the
        desired sequences spliced together.

        *starts* should be a list of 0-based start coordinates of segments in the reference.
        *ends* should be the list of the corresponding segment ends
        (in the half-open UCSC convention:
        http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/).

        *strand* must be 1 or -1.  When it differs from the strand annotated
        on the target sequence in the fetched alignments, every returned
        sequence is reverse-complemented.

        To ask for the alignment portion corresponding to the first 100
        nucleotides of the reference sequence, you would use
        ``get_spliced([0], [100])``

        If ``self.search`` yields nothing for the requested intervals, the
        result is a one-record alignment holding a run of "N" of the requested
        total length for the target sequence.

        Raises ValueError when *strand* is invalid, when the target sequence
        is missing from a fetched alignment or lacks consistent strand
        information, or when the assembled sequences fail the internal
        length checks.
        """
        # validate strand
        if strand not in (1, -1):
            raise ValueError("Strand must be 1 or -1, got %s" % str(strand))

        # pull all alignments that span the desired intervals
        fetched = list(self.search(starts, ends))

        # keep track of the expected letter count
        # (sum of lengths of [start, end) segments,
        # where [start, end) half-open)
        expected_letters = sum(end - start for start, end in zip(starts, ends))

        # if there's no alignment, return filler for the assembly of the length given
        if len(fetched) == 0:
            return MultipleSeqAlignment(
                [SeqRecord(Seq("N" * expected_letters), id=self._target_seqname)]
            )

        # find the union of all IDs in these alignments
        all_seqnames = {sequence.id for multiseq in fetched for sequence in multiseq}

        # split every record by base position
        # key: sequence name
        # value: dictionary
        #        key: position in the reference sequence
        #        value: letter(s) (including letters
        #               aligned to the "-" preceding the letter
        #               at the position in the reference, if any)
        split_by_position = {seq_name: {} for seq_name in all_seqnames}

        # keep track of what the total number of (unspliced) letters should be
        total_rec_length = 0

        # track first strand encountered on the target seqname
        ref_first_strand = None

        for multiseq in fetched:
            # find the target_seqname in this MultipleSeqAlignment and use it to
            # set the parameters for the rest of this iteration
            for seqrec in multiseq:
                if seqrec.id == self._target_seqname:
                    try:
                        if ref_first_strand is None:
                            ref_first_strand = seqrec.annotations["strand"]

                            if ref_first_strand not in (1, -1):
                                raise ValueError("Strand must be 1 or -1")
                        elif ref_first_strand != seqrec.annotations["strand"]:
                            raise ValueError(
                                "Encountered strand='%s' on target seqname, "
                                "expected '%s'"
                                % (seqrec.annotations["strand"], ref_first_strand)
                            )
                    except KeyError:
                        # a missing "strand" annotation is reported as a ValueError
                        raise ValueError(
                            "No strand information for target seqname (%s)"
                            % self._target_seqname
                        ) from None
                    # length including gaps (i.e. alignment length)
                    rec_length = len(seqrec)
                    rec_start = seqrec.annotations["start"]
                    ungapped_length = seqrec.annotations["size"]
                    # inclusive end in zero-based coordinates of the reference
                    rec_end = rec_start + ungapped_length - 1
                    # This is length in terms of actual letters in the reference
                    total_rec_length += ungapped_length

                    # blank out these positions for every seqname
                    # (this inner loop reuses the name ``seqrec``; that is
                    # harmless because the outer loop breaks right after it)
                    for seqrec in multiseq:
                        for pos in range(rec_start, rec_end + 1):
                            split_by_position[seqrec.id][pos] = ""

                    break
            # http://psung.blogspot.fr/2007/12/for-else-in-python.html
            # https://docs.python.org/2/tutorial/controlflow.html#break-and-continue-statements-and-else-clauses-on-loops
            else:
                # the for loop finished without ``break``: target not present
                raise ValueError(
                    "Did not find %s in alignment bundle" % (self._target_seqname,)
                )

            # the true, chromosome/contig/etc position in the target seqname
            real_pos = rec_start

            # loop over the alignment to fill split_by_position
            for gapped_pos in range(0, rec_length):
                for seqrec in multiseq:
                    # keep track of this position's value for the target seqname
                    if seqrec.id == self._target_seqname:
                        track_val = seqrec.seq[gapped_pos]

                    # Here, a real_pos that corresponds to just after a series of "-"
                    # in the reference will "accumulate" the letters found in other sequences
                    # in front of the "-"s
                    split_by_position[seqrec.id][real_pos] += seqrec.seq[gapped_pos]

                # increment the real_pos counter only when non-gaps are found in
                # the target_seqname, and we haven't reached the end of the record
                # (so any columns after the last reference letter are appended
                # to the entry for rec_end)
                if track_val != "-" and real_pos < rec_end:
                    real_pos += 1

        # make sure the number of bp entries equals the sum of the record lengths
        if len(split_by_position[self._target_seqname]) != total_rec_length:
            raise ValueError(
                "Target seqname (%s) has %s records, expected %s"
                % (
                    self._target_seqname,
                    len(split_by_position[self._target_seqname]),
                    total_rec_length,
                )
            )

        # translates a position in the target_seqname sequence to its gapped length
        # (only positions whose fragment is longer than one character are stored)
        realpos_to_len = {
            pos: len(gapped_fragment)
            for pos, gapped_fragment in split_by_position[self._target_seqname].items()
            if len(gapped_fragment) > 1
        }

        # splice together the exons
        subseq = {}

        for seqid in all_seqnames:
            seq_split = split_by_position[seqid]
            seq_splice = []

            # the target sequence is padded with "N", all other sequences with "-"
            filler_char = "N" if seqid == self._target_seqname else "-"

            # iterate from start to end, taking bases from split_by_position when
            # they exist, using N or - for gaps when there is no alignment.
            append = seq_splice.append

            for exonstart, exonend in zip(starts, ends):
                # exonend is exclusive
                for real_pos in range(exonstart, exonend):
                    # if this seqname has this position, add it
                    if real_pos in seq_split:
                        append(seq_split[real_pos])
                    # if not, but it's in the target_seqname, add length-matched filler
                    elif real_pos in realpos_to_len:
                        append(filler_char * realpos_to_len[real_pos])
                    # it's not in either, so add a single filler character
                    else:
                        append(filler_char)

            subseq[seqid] = "".join(seq_splice)

        # make sure we're returning the right number of letters
        if len(subseq[self._target_seqname].replace("-", "")) != expected_letters:
            raise ValueError(
                "Returning %s letters for target seqname (%s), expected %s"
                % (
                    len(subseq[self._target_seqname].replace("-", "")),
                    self._target_seqname,
                    expected_letters,
                )
            )

        # check to make sure all sequences are the same length as the target seqname
        ref_subseq_len = len(subseq[self._target_seqname])

        for seqid, seq in subseq.items():
            if len(seq) != ref_subseq_len:
                raise ValueError(
                    "Returning length %s for %s, expected %s"
                    % (len(seq), seqid, ref_subseq_len)
                )

        # finally, build a MultipleSeqAlignment object for our final sequences
        result_multiseq = []

        for seqid, seq in subseq.items():
            seq = Seq(seq)

            # flip to the requested strand when it differs from the strand
            # recorded on the target sequence
            seq = seq if strand == ref_first_strand else seq.reverse_complement()

            result_multiseq.append(SeqRecord(seq, id=seqid, name=seqid, description=""))

        return MultipleSeqAlignment(result_multiseq)
Пример #32
0
# Build polypeptides from the structure and take the sequences of the first two
ppb = Bio.PDB.PPBuilder()
polypeptides = ppb.build_peptides(structure)
seq1, seq2 = polypeptides[0].get_sequence(), polypeptides[1].get_sequence()

# Global alignment scored with BLOSUM62 and affine gap penalties
matrix = matlist.blosum62
gap_open, gap_extend = -10, -0.5
alns = pairwise2.align.globalds(seq1, seq2, matrix, gap_open, gap_extend)
top_aln = alns[0]

# Wrap the two aligned strings of the first alignment in an alignment object
aligned_records = [SeqRecord(Seq(top_aln[0])), SeqRecord(Seq(top_aln[1]))]
alignment = MultipleSeqAlignment(aligned_records)

# Map alignment columns onto residues of chains A and B of the first model
chain_a = structure[0]['A']
chain_b = structure[0]['B']
structure_alignment = Bio.PDB.StructureAlignment(alignment, chain_a, chain_b)

sup = Bio.PDB.Superimposer()

# Collect CA atoms for every pair in which both residues are truthy
ref_atoms = []
mov_atoms = []
for res1, res2 in structure_alignment.duos:
    if not (res1 and res2):
        continue
    ref_atoms.append(res1['CA'])
    mov_atoms.append(res2['CA'])

sup.set_atoms(ref_atoms, mov_atoms)
Пример #33
0
def subset_viruses_nextstrain_build(virus, subtype, gene, window, min_seqs,
                                    year_max, year_min):
    """Group aligned isolates into year windows and build an outgroup consensus.

    File locations and (optionally) the gene coordinates come from the config
    returned by ``readin_virus_config(virus)``.  Metadata rows are filtered by
    date, passage suffix and presence in the alignment, then grouped into year
    windows; for every window the (optionally gene-extracted) sequences of its
    strains are collected from the alignment file.

    :param virus: virus name, used for the config lookup and path templates.
    :param subtype: subtype used in path templates; ``None`` selects the
        ``"location"`` config key instead of ``"location_<subtype>"``.
    :param gene: gene or domain name.
    :param window: ``'all'`` for a single window spanning all years, otherwise
        a number of years added to the window start (sliding by one year).
    :param min_seqs: windows with fewer strains than this are discarded.
    :param year_max: drop rows with a later year (ignored when falsy).
    :param year_min: drop rows with an earlier year (ignored when falsy).
    :return: tuple ``(virus_time_subset, alignment_time_subset, outgroup_seq,
        outgroup_seq_aa, year_windows, seqs_in_window)`` where the first two
        are dicts keyed by ``"start-end"`` year strings, ``outgroup_seq`` is
        the gap consensus of the first window and ``outgroup_seq_aa`` its
        translation.
    """
    configs = readin_virus_config(virus)
    standard_gene = standardize_gene_name(virus, gene)
    gene_config = configs[standard_gene]
    has_specified_location = 'specify_location' in gene_config

    # Find reference, alignment and meta files (some sub-genic regions use the
    # files of a parent gene or of the whole genome).
    if has_specified_location:
        file_gene = gene_config['specify_location']['parent_gene']
    else:
        file_gene = gene
    reference_file = configs['reference_file'].format(virus=virus,
                                                      subtype=subtype,
                                                      gene=file_gene)
    alignment_file = configs['alignment_file'].format(virus=virus,
                                                      subtype=subtype,
                                                      gene=file_gene)
    meta_file = configs['meta_file'].format(virus=virus,
                                            subtype=subtype,
                                            gene=file_gene)
    # some are comma-separated, some are tab-separated
    metafile_sep = configs['metafile_sep']

    # Find gene location, if domain is sub-genic or reference file contains
    # multiple genes.  Stays False when nothing is found, in which case the
    # full aligned sequence is used.
    gene_location = False

    if has_specified_location:
        # Sub-genic domain: position (within genome or parent gene) comes
        # from the config file.
        if subtype is None:
            gene_location_key = "location"
        else:
            gene_location_key = "location_" + str(subtype)

        gene_location_list = ast.literal_eval(
            gene_config['specify_location'][gene_location_key])
        # Domains need not be contiguous
        if len(gene_location_list) == 1:
            gene_location = SeqFeature(
                FeatureLocation(gene_location_list[0][0],
                                gene_location_list[0][1]))
        else:
            gene_location = CompoundLocation([
                FeatureLocation(location[0], location[1])
                for location in gene_location_list
            ])
    else:
        # Find gene location from the reference file: a CDS whose 'gene'
        # qualifier (or, lacking one, 'product' qualifier) matches the gene.
        wanted = gene.lower()
        for seq_record in SeqIO.parse(reference_file, "genbank"):
            for feature in seq_record.features:
                if feature.type != 'CDS':
                    continue
                if 'gene' in feature.qualifiers:
                    if feature.qualifiers['gene'][0].lower() == wanted:
                        gene_location = feature.location
                elif feature.qualifiers['product'][0].lower() == wanted:
                    gene_location = feature.location

    # Subset data based on time windows
    meta = pd.read_csv(meta_file, sep=metafile_sep)
    meta.drop(meta[meta['date'] == '?'].index, inplace=True)
    meta.dropna(subset=['date'], inplace=True)
    meta['year'] = meta['date'].str[:4].astype('int')
    if year_max:
        meta.drop(meta[meta['year'] > year_max].index, inplace=True)
    if year_min:
        meta.drop(meta[meta['year'] < year_min].index, inplace=True)

    # Remove egg- and cell-passaged strains
    meta.drop(meta[meta['strain'].str[-4:] == '-egg'].index, inplace=True)
    meta.drop(meta[meta['strain'].str[-5:] == '-cell'].index, inplace=True)

    # Limit meta data to only strains in alignment file
    with open(alignment_file, "r") as aligned_handle:
        aligned_isolates = [
            isolate.id for isolate in SeqIO.parse(aligned_handle, "fasta")
        ]
    aligned_isolates_df = pd.DataFrame(aligned_isolates, columns=['strain'])
    meta = meta.merge(aligned_isolates_df, on='strain', how='inner')

    # Group viruses by time windows
    first_year = meta['year'].min()
    last_year = meta['year'].max()
    virus_time_subset = {}
    if window == 'all':
        years = str(first_year) + '-' + str(last_year)
        virus_time_subset[years] = meta['strain'].tolist()
    else:
        date_window_start = first_year
        date_window_end = first_year + window
        while date_window_end <= last_year:
            years = str(date_window_start) + '-' + str(date_window_end)
            strains = meta[(meta['year'] >= date_window_start) & (
                meta['year'] < date_window_end)]['strain'].tolist()
            virus_time_subset[years] = strains

            # sliding window
            date_window_end += 1
            date_window_start += 1

    # Only use time points with enough data:
    virus_time_subset = {
        k: v
        for k, v in virus_time_subset.items() if len(v) >= min_seqs
    }

    year_windows = []
    seqs_in_window = []

    # Records of the first window; the outgroup consensus is built from them.
    first_window_sequences = []

    alignment_time_subset = {}

    for window_index, (years, subset_viruses) in enumerate(
            virus_time_subset.items()):
        year_windows.append(years)
        seqs_in_window.append(len(subset_viruses))

        # Set membership keeps the per-isolate lookup O(1).
        window_strains = set(subset_viruses)
        # Collect outgroup records only once, while reading the first window;
        # re-adding them for every later window would only duplicate rows of
        # the outgroup alignment without changing its consensus.
        is_first_window = window_index == 0
        window_sequences = []

        with open(alignment_file, "r") as aligned_handle:
            for isolate in SeqIO.parse(aligned_handle, "fasta"):
                if isolate.id not in window_strains:
                    continue
                if gene_location:
                    gene_seq = gene_location.extract(isolate.seq)
                else:
                    gene_seq = isolate.seq
                window_sequences.append(gene_seq)
                if is_first_window:
                    first_window_sequences.append(
                        SeqRecord(seq=gene_seq,
                                  id=isolate.id,
                                  description=gene))

        alignment_time_subset[years] = window_sequences

    first_window_alignment = MultipleSeqAlignment(first_window_sequences)
    outgroup_seq = AlignInfo.SummaryInfo(first_window_alignment).gap_consensus(
        ambiguous='N')
    outgroup_seq_aa = outgroup_seq.translate()

    return virus_time_subset, alignment_time_subset, outgroup_seq, outgroup_seq_aa, year_windows, seqs_in_window
Пример #34
0
    def __next__(self):
        """Parse the next alignment from the handle and return it.

        Skips forward to a ``#=======...`` header line, reads the ``#`` header
        block to learn the number of sequences, their identifiers and the
        alignment length, then reads the interleaved sequence lines until the
        next ``#-------...`` or ``#=======...`` line (which is stashed in
        ``self._header`` for the following call) or end of file.

        Returns a MultipleSeqAlignment built from the parsed records.

        Raises StopIteration when no further header is found, and ValueError
        when the header lacks the sequence count or length, when the count
        disagrees with ``self.records_per_alignment``, or when a parsed
        sequence does not have the declared length.  Several format
        expectations are enforced with ``assert`` and surface as
        AssertionError.
        """

        handle = self.handle

        try:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            # Nothing stashed: read a fresh line instead.
            line = handle.readline()
        if not line:
            raise StopIteration

        # Skip everything up to the next alignment header marker.
        while line.rstrip() != "#=======================================":
            line = handle.readline()
            if not line:
                raise StopIteration

        length_of_seqs = None
        number_of_seqs = None
        ids = []
        seqs = []

        # NOTE(review): line[0] raises IndexError if the handle ends inside
        # the header block (readline() returns "").
        while line[0] == "#":
            # Read in the rest of this alignment header,
            # try and discover the number of records expected
            # and their length
            parts = line[1:].split(":", 1)
            key = parts[0].lower().strip()
            if key == "aligned_sequences":
                number_of_seqs = int(parts[1].strip())
                assert len(ids) == 0
                # Should now expect the record identifiers...
                for i in range(number_of_seqs):
                    line = handle.readline()
                    # each identifier line is "# <1-based index>: <identifier>"
                    parts = line[1:].strip().split(":", 1)
                    assert i + 1 == int(parts[0].strip())
                    ids.append(parts[1].strip())
                assert len(ids) == number_of_seqs
            if key == "length":
                length_of_seqs = int(parts[1].strip())

            # And read in another line...
            line = handle.readline()

        if number_of_seqs is None:
            raise ValueError("Number of sequences missing!")
        if length_of_seqs is None:
            raise ValueError("Length of sequences missing!")

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (number_of_seqs, self.records_per_alignment))

        # One accumulating string per identifier; ``index`` is the record the
        # next sequence line is expected to belong to.
        seqs = ["" for id in ids]
        seq_starts = []
        index = 0

        # Parse the seqs
        while line:
            if len(line) > 21:
                # first 21 characters: identifier and start; rest: sequence and end
                id_start = line[:21].strip().split(None, 1)
                seq_end = line[21:].strip().split(None, 1)
                if len(id_start) == 2 and len(seq_end) == 2:
                    # identifier, seq start position, seq, seq end position
                    # (an aligned seq is broken up into multiple lines)
                    id, start = id_start
                    seq, end = seq_end
                    if start == end:
                        # Special case, either a single letter is present,
                        # or no letters at all.
                        if seq.replace("-", "") == "":
                            start = int(start)
                            end = int(end)
                        else:
                            start = int(start) - 1
                            end = int(end)
                    else:
                        assert seq.replace("-", "") != "", repr(line)
                        start = int(start) - 1  # python counting
                        end = int(end)

                    # The identifier is truncated...
                    assert 0 <= index and index < number_of_seqs, \
                           "Expected index %i in range [0,%i)" \
                           % (index, number_of_seqs)
                    assert id == ids[index] or id == ids[index][:len(id)]

                    if len(seq_starts) == index:
                        # Record the start
                        seq_starts.append(start)

                    # Check the start...
                    if start == end:
                        assert seq.replace("-", "") == "", line
                    else:
                        assert start - seq_starts[index] == len(seqs[index].replace("-", "")), \
                        "Found %i chars so far for sequence %i (%s, %s), line says start %i:\n%s" \
                            % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]),
                               start, line)

                    seqs[index] += seq

                    # Check the end ...
                    assert end == seq_starts[index] + len(seqs[index].replace("-", "")), \
                        "Found %i chars so far for sequence %i (%s, %s, start=%i), file says end %i:\n%s" \
                            % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]),
                               seq_starts[index], end, line)

                    # Move on to the next record, wrapping round after the last one.
                    index += 1
                    if index >= number_of_seqs:
                        index = 0
                else:
                    # just a start value, this is just alignment annotation (?)
                    # print "Skipping: " + line.rstrip()
                    pass
            elif line.strip() == "":
                # Just a spacer?
                pass
            else:
                # Unexpected short, non-blank line: show it, then fail.
                print(line)
                assert False

            line = handle.readline()
            if line.rstrip() == "#---------------------------------------" \
            or line.rstrip() == "#=======================================":
                # End of alignment
                self._header = line
                break

        # Every block must have supplied a line for each record.
        assert index == 0

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (len(ids), self.records_per_alignment))

        records = []
        for id, seq in zip(ids, seqs):
            if len(seq) != length_of_seqs:
                # EMBOSS 2.9.0 is known to use spaces instead of minus signs
                # for leading gaps, and thus fails to parse.  This old version
                # is still used as of Dec 2008 behind the EBI SOAP webservice:
                # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
                raise ValueError("Error parsing alignment - sequences of "
                                 "different length? You could be using an "
                                 "old version of EMBOSS.")
            records.append(
                SeqRecord(Seq(seq, self.alphabet), id=id, description=id))
        return MultipleSeqAlignment(records, self.alphabet)
Пример #35
0
    def __next__(self):
        """Read one alignment block set from the handle and return it.

        The first line must hold two integers (sequence count, sequence
        length).  It is followed by one line per sequence carrying the
        identifier and the first chunk of residues, then by optional further
        blocks of residue-only lines.  A line recognised by
        ``self._is_header`` inside the data is kept in ``self._header`` as the
        start of the next alignment.

        Raises StopIteration at end of input and ValueError on a malformed
        first line, an unexpected record count, a "." in the sequence data, or
        a truncated block.
        """
        handle = self.handle

        # Prefer a header line stashed by the previous call.
        line = self._header
        if line is None:
            line = handle.readline()
        else:
            self._header = None

        if not line:
            raise StopIteration

        line = line.strip()
        fields = line.split()
        if len(fields) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs, length_of_seqs = int(fields[0]), int(fields[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        assert self._is_header(line)

        expected = self.records_per_alignment
        if expected is not None and expected != number_of_seqs:
            raise ValueError("Found %i records in this alignment, "
                             "told to expect %i" %
                             (number_of_seqs, expected))

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        ids = []
        seqs = []
        for _ in range(number_of_seqs):
            sequence_id, first_chunk = self._split_id(handle.readline().rstrip())
            ids.append(sequence_id)
            if "." in first_chunk:
                raise ValueError(_NO_DOTS)
            seqs.append([first_chunk])

        # Continuation blocks: one residue line per sequence, in order.
        line = ""
        while True:
            # Step over blank separator lines.
            while not line.strip():
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if self._is_header(line):
                # Start of a concatenated alignment; keep it for the next call.
                self._header = line
                break

            for position, chunks in enumerate(seqs):
                residues = line.strip().replace(" ", "")
                if "." in residues:
                    raise ValueError(_NO_DOTS)
                chunks.append(residues)
                line = handle.readline()
                if (not line) and position + 1 < number_of_seqs:
                    raise ValueError("End of file mid-block")
            if not line:
                break  # end of file

        records = (SeqRecord(Seq("".join(chunks), self.alphabet),
                             id=name,
                             name=name,
                             description=name)
                   for (name, chunks) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
Пример #36
0
def createAlignment(sequences, alphabet):
    """Build an alignment whose records are named sequence1, sequence2, ...

    Each entry of *sequences* is wrapped in a ``Seq`` with *alphabet* and
    given an id from its 1-based position.
    """
    numbered = enumerate(sequences, 1)
    records = (SeqRecord(Seq(s, alphabet), id="sequence%i" % position)
               for (position, s) in numbered)
    return MultipleSeqAlignment(records, alphabet)
Пример #37
0
def _thetaEK(records, optimize_dist):
    """Score every variable alignment column, scaled by pairwise distance.

    For each column in which the records do not all carry the same character,
    every unordered pair of records is scored with ``Blossum``, ``KMAT`` and
    the ten per-residue values looked up in ``kidera()``; each value is
    divided by ``optimize_dist["<id_m>-<id_n>"]``.  A "?" is treated as "-"
    and characters are upper-cased before scoring.

    @records - records accepted by MultipleSeqAlignment
    @optimize_dist - mapping from "<id_m>-<id_n>" (ids in alignment order) to
        the divisor for that pair; presumably the divergence time between the
        two records -- TODO confirm with the caller
    @return - dict keyed by "B62", "KMAT", "K1" ... "K10"; each value maps a
        column index to the list of scaled pair values, in
        ``combinations`` order.  Invariant columns have no entry.
    """
    score_names = ["B62", "KMAT"] + ["K" + str(j + 1) for j in range(10)]
    posScore = {name: dict() for name in score_names}

    msaObj = MultipleSeqAlignment(records)
    KIDERA = kidera()

    # Ask the alignment for its width instead of measuring record 1, which
    # raised IndexError for a single-record alignment.
    for i in range(msaObj.get_alignment_length()):
        posVector = msaObj[:, i:i + 1]
        print(i)  # per-column progress output, kept from the original
        residues = [str(x.seq) for x in posVector]
        if len(set(residues)) == 1:
            # Invariant column: nothing to score.
            continue

        for key in posScore:
            posScore[key][i] = list()

        # Normalise once per column rather than once per pair.
        residues = [("-" if r == "?" else r).upper() for r in residues]
        ids = [str(x.id) for x in posVector]

        # combinations() yields each unordered pair once: N*(N-1)/2 pairs for
        # N records.  This is the hot loop if parallelisation is ever needed.
        for m, n in combinations(range(len(residues)), 2):
            res_m = residues[m]
            res_n = residues[n]
            dist = optimize_dist[ids[m] + "-" + ids[n]]

            posScore["B62"][i].append(float(Blossum(res_m, res_n)) / dist)
            posScore["KMAT"][i].append(float(KMAT(res_m, res_n)) / dist)

            kidera_m = KIDERA[res_m]
            kidera_n = KIDERA[res_n]
            for j in range(10):
                posScore["K" + str(j + 1)][i].append(
                    (kidera_m[j] - kidera_n[j]) / dist)

    return posScore
Пример #38
0
def mugration_inference(tree=None,
                        seq_meta=None,
                        field='country',
                        confidence=True,
                        infer_gtr=True,
                        root_state=None,
                        missing='?'):
    """Infer a discrete trait on every clade of a tree using treetime.

    Each distinct value of ``field`` seen on a tip of the tree is encoded as
    one character (``chr(65)``, ``chr(66)``, ...), the tips are assembled
    into a pseudo alignment that is a single column wide, and ``TreeAnc`` is
    asked to reconstruct that column on the whole tree.

    Arguments:
     - tree       - newick tree; handed to ``Phylo.read`` and to ``TreeAnc``.
     - seq_meta   - mapping of sequence name to a mapping of metadata.
     - field      - metadata key holding the trait; also used as the name of
       the attribute set on every node.
     - confidence - when true, ``<field>_entropy`` and ``<field>_confidence``
       are attached to every node as well.
     - infer_gtr  - forwarded to ``infer_ancestral_sequences``.
     - root_state - optional extra state added to the set of states.
     - missing    - value the missing-data character decodes to.

    Returns ``(tt, alphabet)``: the ``TreeAnc`` instance and the mapping from
    encoded character to trait value.  Returns ``(None, None)`` after
    printing a message when there are zero, one, or more than 180 states.
    """
    from treetime import GTR
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio import Phylo

    T = Phylo.read(tree, 'newick')
    tip_names = {tip.name for tip in T.get_terminals()}

    # Only values carried by tips that are actually in the tree count as states.
    states = {
        meta[field]
        for name, meta in seq_meta.items()
        if field in meta and name in tip_names
    }
    if root_state is not None:
        states.add(root_state)

    places = sorted(states)
    nc = len(places)

    # Bail out early on state counts the model cannot (or need not) handle.
    if nc > 180:
        print("ERROR: geo_inference: can't have more than 180 places!")
        return None, None
    if nc == 1:
        print(
            "WARNING: geo_inference: only one place found -- set every internal node to %s!"
            % places[0])
        return None, None
    if nc == 0:
        print("ERROR: geo_inference: list of places is empty!")
        return None, None

    # Flat GTR model over one character per place.
    codes = [chr(65 + offset) for offset in range(nc)]
    alphabet = dict(zip(codes, places))
    model = GTR.custom(pi=np.ones(nc, dtype=float) / nc,
                       W=np.ones((nc, nc)),
                       alphabet=np.array(sorted(codes)))

    # The character right after the last place code stands for missing data
    # and is compatible with every state.
    missing_char = chr(65 + nc)
    alphabet[missing_char] = missing
    model.profile_map[missing_char] = np.ones(nc)
    model.ambiguous = missing_char
    alphabet_rev = {value: code for code, value in alphabet.items()}

    # One single-character record per tip present in the tree.
    pseudo_seqs = [
        SeqRecord(
            Seq(alphabet_rev[meta[field]] if field in meta else missing_char),
            name=name,
            id=name)
        for name, meta in seq_meta.items()
        if name in tip_names
    ]
    aln = MultipleSeqAlignment(pseudo_seqs)

    # Set up treetime and run the reconstruction.
    from treetime import TreeAnc
    tt = TreeAnc(tree=tree,
                 aln=aln,
                 gtr=model,
                 convert_upper=False,
                 verbose=0)
    tt.use_mutation_length = False
    tt.infer_ancestral_sequences(infer_gtr=infer_gtr,
                                 store_compressed=False,
                                 pc=5.0,
                                 marginal=True,
                                 normalized_rate=False)

    # Decode the inferred character back into e.g. node.region = 'africa'.
    for clade in tt.tree.find_clades():
        setattr(clade, field, alphabet[clade.sequence[0]])

    # Optionally attach e.g. node.region_entropy = 0.03 and the top states.
    if confidence:
        n_states = len(tt.gtr.alphabet)
        for clade in tt.tree.find_clades():
            pdis = clade.marginal_profile[0]
            entropy = -np.sum(pdis * np.log(pdis + TINY))

            ranked = sorted(
                ((alphabet[tt.gtr.alphabet[k]], pdis[k])
                 for k in range(n_states)),
                key=lambda pair: pair[1],
                reverse=True)  # most likely state first
            # Keep states above 0.1%, and at most the four most likely.
            likely = [pair for pair in ranked if pair[1] > 0.001][:4]
            setattr(clade, field + "_entropy", entropy)
            setattr(clade, field + "_confidence", dict(likely))

    return tt, alphabet
# Demo script: read, inspect and write alignments with Bio.AlignIO.
# All print calls use the function form so the script parses on Python 3 as
# well as Python 2 (each call passes a single argument, so output is the same).

# Show the first lines of the raw Clustal file, then load it as one alignment.
os.system("head data/muscle-patato_pep.clw")
from Bio import AlignIO
aln_patato = AlignIO.read("data/muscle-patato_pep.clw", "clustal")
print(aln_patato)
for record in aln_patato:
    print("%s - %s" % (record.seq[1:60], record.id))

# A file holding several alignments must be read with parse(), which yields
# one alignment at a time.
os.system("head data/dummy_aln.phy")
aln_dummy = AlignIO.parse("data/dummy_aln.phy", "phylip")
for alignment in aln_dummy:
    print(alignment)
    print("")

# Materialise the alignments in a list to pick one by position.
alignments = list(AlignIO.parse("data/dummy_aln.phy", "phylip"))
second_aln = alignments[1]
print(second_aln)

# Build an alignment by hand from SeqRecord objects.
# NOTE(review): Bio.Alphabet only exists in older Biopython releases
# (removed in 1.78) -- confirm the Biopython version this script targets.
from Bio.Alphabet import generic_dna
from Bio.Align import MultipleSeqAlignment
align1 = MultipleSeqAlignment([
    SeqRecord(Seq("ACTGCTAGCTAG", generic_dna), id="toto"),
    SeqRecord(Seq("ACT-CTAGCTAG", generic_dna), id="titi"),
    SeqRecord(Seq("ACTGCTAGDTAG", generic_dna), id="tata"),
])

print(align1)

# Write both alignments into a single PHYLIP file.
my_alignments = [align1, aln_patato]
AlignIO.write(my_alignments, "mixed.phy", "phylip")

Пример #40
0
    def __next__(self):
        """Parse the next alignment from the handle.

        Reads one GCG MSF alignment from ``self.handle`` (or resumes from the
        header line stored in ``self._header`` by the previous call) and
        returns it as a MultipleSeqAlignment.  Each record stores its MSF
        weight in ``annotations["weight"]``; the gap characters ``~`` and
        ``.`` are mapped to ``-``.

        Raises:
         - StopIteration - the handle has no more data.
         - ValueError - the input does not follow the expected MSF layout.
         - NotImplementedError - a sequence identifier contains a space.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration

        # Whitelisted headers we know about.
        known_headers = [
            "!!NA_MULTIPLE_ALIGNMENT", "!!AA_MULTIPLE_ALIGNMENT", "PileUp"
        ]
        # Examples in "Molecular Biology Software Training Manual GCG version 10"
        # by BBSRC Biosciences IT Services (BITS), Harpenden, UK, Copyright 1996-2001
        # would often start as follows:
        #
        # !!AA_MUTIPLE_ALIGNMENT 1.0
        # PileUp of: @/usr/users2/culhane/...
        #
        # etc with other seemingly free format text before getting to the
        # MSF/Type/Check line and the following Name: lines block and // line.
        #
        # MUSCLE just has a line "PileUp", while other sources just use the line
        # "!!AA_MULTIPLE_ALIGNMENT" (amino acid) or "!!NA_MULTIPLE_ALIGNMENT"
        # (nucleotide).
        header_words = line.strip().split()
        if not header_words:
            # A whitespace-only line has no first word to inspect; report it
            # as a format problem instead of failing with an IndexError.
            raise ValueError(
                "Blank line is not a known GCG MSF header: %s" %
                ", ".join(known_headers))
        if header_words[0] not in known_headers:
            raise ValueError(
                "%s is not a known GCG MSF header: %s" %
                (header_words[0], ", ".join(known_headers)))

        while line and " MSF: " not in line:
            line = handle.readline()

        if not line:
            raise ValueError(
                "Reached end of file without MSF/Type/Check header line")

        # Quoting from "Molecular Biology Software Training Manual GCG version 10"
        # by BBSRC Biosciences IT Services (BITS), Harpenden, UK. Copyright 1996-2001.
        # Page 31:
        #
        # "Header information is before a .. (double dot) in a GCG format file.
        #  The file will also have a checksum specific for that file."
        #
        # This was followed by a single non-aligned sequence, but this convention
        # appears to also be used in the GCG MSF files. Quoting other examples in
        # this reference, page 31:
        #
        # localpileup_17.msf  MSF: 195  Type: P  January 6, 2000 15:41  Check: 4365 ..
        #
        # Except from page 148:
        #
        # localpileup_106.msf  MSF: 457  Type: P  November 28, 2000 16:09  Check: 2396 ..
        #
        # Quoting output from MUSCLE v3.8, have two leading spaces and a zero checksum:
        #
        #   MSF: 689  Type: N  Check: 0000  ..
        #
        # By observation, the MSF value is the column count, type is N (nucleotide)
        # or P (protein / amino acid).
        #
        # In a possible bug, EMBOSS v6.6.0.0 uses CompCheck: rather than Check: as shown,
        #
        # $ seqret -sequence Tests/Fasta/f002 -auto -stdout -osformat msf
        # !!NA_MULTIPLE_ALIGNMENT 1.0
        #
        #   stdout MSF: 633 Type: N 01/08/19 CompCheck: 8543 ..
        #
        #   Name: G26680     Len: 633  Check: 4334 Weight: 1.00
        #   Name: G26685     Len: 633  Check: 3818 Weight: 1.00
        #   Name: G29385     Len: 633  Check:  391 Weight: 1.00
        #
        # //
        #
        parts = line.strip("\n").split()
        offset = parts.index("MSF:")
        # The length test keeps a truncated header line from raising an
        # IndexError on the positional lookups that follow.
        if (len(parts) < offset + 4
                or parts[offset + 2] != "Type:"
                or parts[-3] not in ("Check:", "CompCheck:")
                or parts[-1] != ".."):
            raise ValueError(
                "GCG MSF header line should be "
                "'<optional text> MSF: <int> Type: <letter> <optional date> Check: <int> ..', "
                " not: %r" % line)
        try:
            aln_length = int(parts[offset + 1])
        except ValueError:
            aln_length = -1
        if aln_length < 0:
            raise ValueError(
                "GCG MSF header line should have MSF: <int> for column count, not %r"
                % parts[offset + 1])
        seq_type = parts[offset + 3]
        if seq_type not in ["P", "N"]:
            raise ValueError(
                "GCG MSF header line should have 'Type: P' (protein) "
                "or 'Type: N' (nucleotide), not 'Type: %s'" % seq_type)

        # There should be a blank line after that header line, then the Name: lines
        #
        # In a possible bug, T-COFFEE v12.00 adds 'oo' after the names, as shown here,
        #
        # PileUp
        #
        #
        #
        #    MSF:  628  Type: P    Check:   147   ..
        #
        #  Name: AK1H_ECOLI/1-378 oo  Len:  628  Check:  3643  Weight:  1.000
        #  Name: AKH_HAEIN/1-382 oo  Len:  628  Check:  6504  Weight:  1.000
        #
        # //
        ids = []
        lengths = []
        checks = []  # parsed to validate the field; the values are not verified
        weights = []
        line = handle.readline()
        while line and line.strip() != "//":
            line = handle.readline()
            if line.strip().startswith("Name: "):
                if " Len: " in line and " Check: " in line and " Weight: " in line:
                    rest = line[line.index("Name: ") + 6:].strip()
                    name, rest = rest.split(" Len: ")
                    length, rest = rest.split(" Check: ")
                    check, weight = rest.split(" Weight: ")
                    name = name.strip()
                    if name.endswith(" oo"):
                        # T-COFFEE oddity, ignore this
                        name = name[:-3]
                    if name in ids:
                        raise ValueError("Duplicated ID of %r" % name)
                    if " " in name:
                        raise NotImplementedError("Space in ID %r" % name)
                    ids.append(name)
                    # Expect aln_length <= int(length.strip()), see below
                    lengths.append(int(length.strip()))
                    checks.append(int(check.strip()))
                    weights.append(float(weight.strip()))
                else:
                    raise ValueError("Malformed GCG MSF name line: %r" % line)
        if not line:
            raise ValueError(
                "End of file while looking for end of header // line.")
        if not ids:
            # Without this check max() below fails with an unrelated message.
            raise ValueError(
                "No 'Name:' lines found before the // line in GCG MSF header.")

        max_length = max(lengths)
        if aln_length != max_length:
            # In broken examples from IMGTHLA was possible to continue
            # https://github.com/ANHIG/IMGTHLA/issues/201
            max_count = lengths.count(max_length)
            raise ValueError(
                "GCG MSF header said alignment length %i, but %s of %i sequences said Len: %s"
                % (aln_length, max_count, len(ids), max_length))

        line = handle.readline()
        if not line:
            raise ValueError("End of file after // line, expected sequences.")
        if line.strip():
            raise ValueError(
                "After // line, expected blank line before sequences.")

        # Now load the sequences
        seqs = [[] for _ in ids]  # list of empty lists
        completed_length = 0
        while completed_length < aln_length:
            # Last column covered by this block (blocks are 50 columns wide,
            # the final one may be shorter).
            block_end = min(completed_length + 50, aln_length)
            # Note might have a coordinate header line (seems to be optional)
            for idx, name in enumerate(ids):
                line = handle.readline()
                if idx == 0 and not line.strip():
                    # T-COFFEE uses two blank lines between blocks, rather than one
                    while line and not line.strip():
                        line = handle.readline()
                if not line:
                    raise ValueError(
                        "End of file where expecting sequence data.")
                # print("Looking for seq for %s in line: %r" % (name, line))
                words = line.strip().split()
                # Should we use column numbers, rather than assuming no spaces in names?
                if idx == 0 and words and words[0] != name:
                    # print("Actually have a coord line")
                    # Hopefully this is a coordinate header before the first seq
                    try:
                        i = int(words[0])
                    except ValueError:
                        i = -1
                    if i != completed_length + 1:
                        raise ValueError(
                            "Expected GCG MSF coordinate line starting %i, got: %r"
                            % (completed_length + 1, line))
                    if len(words) > 1:
                        # Final block usually not full 50 chars, so expect start only.
                        if len(words) != 2:
                            i = -1
                        else:
                            try:
                                i = int(words[1])
                            except ValueError:
                                i = -1
                        if i != block_end:
                            raise ValueError(
                                "Expected GCG MSF coordinate line %i to %i, got: %r"
                                % (
                                    completed_length + 1,
                                    block_end,
                                    line,
                                ))
                    line = handle.readline()
                    words = line.strip().split()
                    # print("Still looking for seq for %s in line: %r" % (name, line))
                # Dealt with any coordinate header line, should now be sequence
                if not words:
                    # Should be sequence here, but perhaps its a short one?
                    if (lengths[idx] < aln_length
                            and len("".join(seqs[idx])) == lengths[idx]):
                        # Is this actually allowed in the format? Personally I would
                        # expect a line with name and a block of trailing ~ here.
                        pass
                    else:
                        raise ValueError("Expected sequence for %s, got: %r" %
                                         (name, line))
                elif words[0] == name:
                    if len(words) < 2:
                        # Name with no sequence data after it; raised
                        # explicitly so the check survives python -O.
                        raise ValueError("Expected sequence for %s, got: %r" %
                                         (name, line))
                    # print(i, name, repr(words))
                    seqs[idx].extend(words[1:])
                else:
                    raise ValueError("Expected sequence for %r, got: %r" %
                                     (name, line))
            # TODO - check the sequence lengths thus far are consistent
            # with blocks of 50?
            completed_length += 50
            line = handle.readline()
            if line.strip():
                raise ValueError("Expected blank line, got: %r" % line)

        # Skip over any whitespace at the end...
        while True:
            line = handle.readline()
            if not line:
                # End of file, no more alignments
                break
            elif not line.strip():
                # Blank line, ignore
                pass
            elif line.strip().split()[0] in known_headers:
                # Looks like the start of another alignment:
                self._header = line
                break
            else:
                raise ValueError(
                    "Unexpected line after GCG MSF alignment: %r" % line)

        # Combine list of strings into single string, remap gaps
        seqs = ["".join(s).replace("~", "-").replace(".", "-") for s in seqs]

        # Apply any trailing padding for short sequences
        padded = False
        for idx, (length, s) in enumerate(zip(lengths, seqs)):
            if len(s) < aln_length and len(s) == length:
                padded = True
                seqs[idx] = s + "-" * (aln_length - len(s))
        if padded:
            import warnings
            from Bio import BiopythonParserWarning

            warnings.warn(
                "One of more alignment sequences were truncated and have been gap padded",
                BiopythonParserWarning,
            )

        records = (SeqRecord(
            Seq(s),
            id=i,
            name=i,
            description=i,
            annotations={"weight": w},
        ) for (i, s, w) in zip(ids, seqs, weights))

        # This will check alignment lengths are self-consistent:
        align = MultipleSeqAlignment(records)
        # Check matches the header:
        if align.get_alignment_length() != aln_length:
            raise ValueError(
                "GCG MSF headers said alignment length %i, but have %i" %
                (aln_length, align.get_alignment_length()))
        return align
Пример #41
0
def write(sequences, handle, format):
    """Write a set of sequence records to a file and return how many.

    Arguments:
     - sequences - A list (or iterator) of SeqRecord objects, or a single
       SeqRecord (which is treated as a one element list).
     - handle    - File handle object to write to, or filename as string.
     - format    - lower case string describing the file format to write.

    When a file handle is given the caller remains responsible for closing
    it afterwards (so that the data gets flushed to disk).

    Raises TypeError if ``format`` is not a string or if ``handle`` is a
    SeqRecord or a list (a sign the arguments were swapped), and ValueError
    if the format is empty, not lower case, read-only, or unknown.

    Returns the number of records written (as an integer).
    """
    from Bio import AlignIO

    # Validate the format argument, aiming for helpful error messages.
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    # Catch the common mistake of passing the records as the handle.
    if isinstance(handle, SeqRecord):
        raise TypeError("Check arguments, handle should NOT be a SeqRecord")
    if isinstance(handle, list):
        # e.g. list of SeqRecord objects
        raise TypeError("Check arguments, handle should NOT be a list")

    # A lone record is accepted and handled as a list of one.
    if isinstance(sequences, SeqRecord):
        sequences = [sequences]

    mode = 'wb' if format in _BinaryFormats else 'w'

    with as_handle(handle, mode) as fp:
        if format in _FormatToString:
            # Simple formats: one function turns each record into text.
            to_text = _FormatToString[format]
            count = 0
            for count, record in enumerate(sequences, 1):
                fp.write(to_text(record))
        elif format in _FormatToWriter:
            # Formats with a dedicated writer class.
            count = _FormatToWriter[format](fp).write_file(sequences)
        elif format in AlignIO._FormatToWriter:
            # Alignment-only formats: bundle every record into a single
            # alignment and delegate to Bio.AlignIO.
            alignment = MultipleSeqAlignment(sequences)
            written = AlignIO.write([alignment], fp, format)
            assert written == 1, (
                "Internal error - the underlying writer "
                " should have returned 1, not %r" % written)
            count = len(alignment)
        elif format in _FormatToIterator or format in AlignIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format)
        else:
            raise ValueError("Unknown format '%s'" % format)

        assert isinstance(count, int), (
            "Internal error - the underlying %s "
            "writer should have returned the record count, not %r"
            % (format, count))

    return count
Пример #42
0
    def __next__(self):
        """Parse the next Stockholm alignment from the handle.

        Reads from ``self.handle`` up to the end of the file or the header of
        the following alignment (which is kept in ``self._header`` for the
        next call).  The parsed identifiers, sequences, ``#=GS`` and ``#=GR``
        data are stored on ``self.ids``, ``self.sequences``,
        ``self.seq_annotation`` and ``self.seq_col_annotation``.  ``#=GC``
        lines are skipped; ``#=GF`` lines are parsed but not attached to the
        result here.

        Returns a MultipleSeqAlignment whose ``_annotations`` attribute holds
        the ``#=GR`` markup.

        Raises:
         - StopIteration - no more data, or the block held no sequences.
         - ValueError - missing header, malformed line, sequence data after
           the ``//`` line, inconsistent sequence lengths, or a record count
           that differs from ``self.records_per_alignment``.
        """
        try:
            line = self._header
            del self._header
        except AttributeError:
            line = self.handle.readline()
        if not line:
            # Empty file - just give up.
            raise StopIteration
        if line.strip() != '# STOCKHOLM 1.0':
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = []
        gs = {}
        gr = {}
        gf = {}
        passed_end_alignment = False
        while True:
            line = self.handle.readline()
            if not line:
                break  # end of file
            line = line.strip()  # remove trailing \n
            if line == '# STOCKHOLM 1.0':
                self._header = line
                break
            elif line == "//":
                # The "//" line indicates the end of the alignment.
                # There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                # blank line, ignore
                pass
            elif line[0] != "#":
                # Sequence
                # Format: "<seqname> <sequence>"
                if passed_end_alignment:
                    # Raised explicitly (not asserted) so the check still
                    # runs under python -O instead of silently merging the
                    # stray data into this alignment.
                    raise ValueError("Found sequence data after the // line "
                                     "ending the alignment:\n" + line)
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier "
                                      + "and sequence:\n" + line)
                seq_id, seq = parts
                if seq_id not in ids:
                    ids.append(seq_id)
                seqs.setdefault(seq_id, '')
                # Interleaved files repeat the identifier; join the pieces.
                seqs[seq_id] += seq.replace(".", "-")
            elif len(line) >= 5:
                # Comment line or meta-data
                if line[:5] == "#=GF ":
                    # Generic per-File annotation, free text
                    # Format: #=GF <feature> <free text>
                    feature, text = line[5:].strip().split(None, 1)
                    # Each feature key could be used more than once,
                    # so store the entries as a list of strings.
                    if feature not in gf:
                        gf[feature] = [text]
                    else:
                        gf[feature].append(text)
                elif line[:5] == '#=GC ':
                    # Generic per-Column annotation, exactly 1 char per column
                    # Format: "#=GC <feature> <exactly 1 char per column>"
                    pass
                elif line[:5] == '#=GS ':
                    # Generic per-Sequence annotation, free text
                    # Format: "#=GS <seqname> <feature> <free text>"
                    seq_id, feature, text = line[5:].strip().split(None, 2)
                    # if seq_id not in ids:
                    #    ids.append(seq_id)
                    if seq_id not in gs:
                        gs[seq_id] = {}
                    if feature not in gs[seq_id]:
                        gs[seq_id][feature] = [text]
                    else:
                        gs[seq_id][feature].append(text)
                elif line[:5] == "#=GR ":
                    # Generic per-Sequence AND per-Column markup
                    # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    seq_id, feature, text = line[5:].strip().split(None, 2)
                    # if seq_id not in ids:
                    #    ids.append(seq_id)
                    if seq_id not in gr:
                        gr[seq_id] = {}
                    if feature not in gr[seq_id]:
                        gr[seq_id][feature] = ""
                    gr[seq_id][feature] += text.strip()  # append to any previous entry
                    # TODO - Should we check the length matches the alignment length?
                    #       For interlaced sequences the GR data can be split over
                    #       multiple lines
            # Next line...

        # Internal consistency check on our own bookkeeping, not on the input.
        assert len(seqs) <= len(ids)
        # assert len(gs)   <= len(ids)
        # assert len(gr)   <= len(ids)

        self.ids = ids
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if ids and seqs:

            if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
                raise ValueError("Found %i records in this alignment, told to expect %i"
                                 % (len(ids), self.records_per_alignment))

            alignment_length = len(list(seqs.values())[0])
            records = []  # Alignment obj will put them all in a list anyway
            for seq_id in ids:
                seq = seqs[seq_id]
                if alignment_length != len(seq):
                    raise ValueError("Sequences have different lengths, or repeated identifier")
                name, start, end = self._identifier_split(seq_id)
                # Accession will be overridden by _populate_meta_data if an explicit
                # accession is provided:
                record = SeqRecord(Seq(seq, self.alphabet),
                                   id=seq_id, name=name, description=seq_id,
                                   annotations={"accession": name})

                if start is not None:
                    record.annotations["start"] = start
                if end is not None:
                    record.annotations["end"] = end

                self._populate_meta_data(seq_id, record)
                records.append(record)
            alignment = MultipleSeqAlignment(records, self.alphabet)

            # TODO - Introduce an annotated alignment class?
            # For now, store the annotation a new private property:
            alignment._annotations = gr

            return alignment
        else:
            raise StopIteration
Пример #43
0
def _outgroup_column_is_polymorphic(column_states, iupac_bases):
    """Return True when one alignment column has to be flagged as polymorphic.

    Rules, keyed on the number of distinct states in the column:

    * one state: never flagged;
    * two states: flagged only when neither is a gap ("-") and neither is
      an IUPAC ambiguity code;
    * three states: flagged unless the column holds a gap plus exactly one
      IUPAC ambiguity code;
    * any other count: always flagged.

    The gap test looks at every state in the column.  The previous inline
    code overwrote its gap flag on each loop pass, so a gap was only
    noticed when it happened to be the least frequent state.
    """
    states = set(column_states)
    n_states = len(states)
    if n_states == 1:
        return False

    has_gap = "-" in states
    n_iupac = sum(1 for state in states if state in iupac_bases)

    if n_states == 2:
        return not has_gap and n_iupac == 0
    if n_states == 3:
        return not (has_gap and n_iupac == 1)
    return True


def replace_outgroup_with_gap(seq_directory, outgroup_path, window_size = 20, Max_p_sites_o = 8):
    """Mask dense polymorphic regions of outgroup sequences with gaps.

    Every file found in the ``s1_rm_polymorphism_sites`` directory (derived
    from ``seq_directory``) is read as a FASTA alignment.  Columns are
    classified by ``_outgroup_column_is_polymorphic``; whenever a window
    contains more than ``Max_p_sites_o`` flagged columns, those columns are
    replaced by "-" in every record whose id is listed in the outgroups.
    All other records are copied unchanged.  The result is written, under
    the same file name, to ``s2_rm_polymorphism_in_outgroups``.

    Parameters:
        seq_directory: path containing "s1_Gene/"; that part is rewritten
            to "s7_well_trimal/" to locate the input directory.
        outgroup_path: passed to ``input_outgroup``, whose result is used
            for ``record.id in ...`` membership tests.
        window_size: passed to ``window`` together with the column range
            (presumably the sliding-window width -- confirm against the
            ``window`` helper).
        Max_p_sites_o: a window is masked when it holds strictly more
            flagged columns than this value.

    Returns:
        None.  Output alignments are written to disk and progress is
        printed to stdout.
    """
    ### define iupac (two- and three-base ambiguity codes, both cases)
    iupac_bases = frozenset("mrwsykMRWSYKvhdbVHDB")

    ### input directory from s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")

    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory_1 = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    output_directory_2 = output_directory_1.replace("/s1_rm_polymorphism_sites/","/s2_rm_polymorphism_in_outgroups/")

    if not os.path.isdir(output_directory_2):
        os.makedirs(output_directory_2)

    ### iterate each gene
    for file in os.listdir(output_directory_1):
        if file == ".DS_Store":
            continue

        output_directory_file = output_directory_2 + file

        ### read each alignment
        for sequence in glob(output_directory_1 + file):
            print("sequence: " + sequence)

            alignment = AlignIO.read(sequence, 'fasta')

            ### one row per record, one column per alignment position
            align_array_outgroup = np.array([list(rec) for rec in alignment])
            total_length = alignment.get_alignment_length()

            ### classify each column once; windows overlap, so doing this
            ### inside the window loop repeats the same work many times
            column_is_polymorphic = [
                _outgroup_column_is_polymorphic(align_array_outgroup[:, column], iupac_bases)
                for column in range(total_length)
            ]

            wrong_sites = set()
            for each in window(range(total_length), window_size):
                column_position_outgroup = [column for column in each
                                            if column_is_polymorphic[column]]

                ### too many polymorphic sites in this window: mask them
                if len(column_position_outgroup) > float(Max_p_sites_o):
                    print(column_position_outgroup)
                    wrong_sites.update(column_position_outgroup)

            unique_wrong_sites_ougroup = sorted(wrong_sites)
            print(unique_wrong_sites_ougroup)
            print("outgroup")

            align_2 = MultipleSeqAlignment([])
            for record in alignment:
                if record.id in outgroups:
                    print(record.seq)
                    original_seq = str(record.seq)
                    new_seq = "".join(
                        "-" if i in wrong_sites else original_seq[i]
                        for i in range(total_length)
                    )
                else:
                    new_seq = str(record.seq)

                align_2.append(SeqRecord(Seq(new_seq), id=str(record.id)))

            print(align_2)

            AlignIO.write(align_2, output_directory_file, "fasta")
Пример #44
0
    def __next__(self):
        """Parse the next alignment block from ``self.handle``.

        Leading lines starting with "#" are skipped.  Lines starting with
        ">" are header lines, matched against
        ``XMFA_HEADER_REGEX_BIOPYTHON`` and then ``XMFA_HEADER_REGEX``; the
        lines that follow are appended to that header's sequence.  A line
        starting with "=" (or end of file) ends the block.

        Every id seen so far is remembered in ``self._ids``.  An id with no
        sequence data in this block is padded with gaps, and an id that
        has no header in this block is left out of the result.

        Returns:
            MultipleSeqAlignment built from the records of this block.

        Raises:
            StopIteration: at end of file, or when the block is empty.
            ValueError: on a malformed header line, on sequence data seen
                before any header, or when sequence lengths differ.
        """
        handle = self.handle
        line = handle.readline()

        if not line:
            raise StopIteration

        # Strip out header comments
        while line and line.strip().startswith('#'):
            line = handle.readline()

        seqs = {}
        seq_regions = {}
        latest_id = None

        while line:
            line = line.strip()

            if line.startswith('='):
                # There may be more data, but we've reached the end of this
                # alignment
                break

            if line.startswith('>'):
                m = (XMFA_HEADER_REGEX_BIOPYTHON.match(line)
                     or XMFA_HEADER_REGEX.match(line))
                if not m:
                    # Format the message; passing ``line`` as a second
                    # argument left the "%s" placeholder unfilled.
                    raise ValueError("Malformed header line: %s" % line)

                parsed_id = m.group('id')
                parsed_data = {}
                for key in ('start', 'end', 'id', 'strand', 'name',
                            'realname'):
                    try:
                        value = m.group(key)
                    except IndexError:
                        # The matching regex does not define this group.
                        # It's fine.
                        continue
                    if key == 'start':
                        value = int(value)
                        # Convert to zero based counting
                        if value > 0:
                            value -= 1
                    elif key == 'end':
                        value = int(value)
                    parsed_data[key] = value
                seq_regions[parsed_id] = parsed_data

                if parsed_id not in self._ids:
                    self._ids.append(parsed_id)

                seqs.setdefault(parsed_id, '')
                latest_id = parsed_id
            else:
                if latest_id is None:
                    raise ValueError("Saw sequence before definition line")
                seqs[latest_id] += line

            line = handle.readline()

        assert len(seqs) <= len(self._ids)

        self.ids = self._ids
        self.sequences = seqs

        if not (self._ids and seqs):
            raise StopIteration

        alignment_length = max(map(len, seqs.values()))
        records = []
        for seq_id in self._ids:
            # Missing or empty sequences are represented by an all-gap row.
            seq = seqs.get(seq_id) or '-' * alignment_length

            if alignment_length != len(seq):
                raise ValueError(
                    "Sequences have different lengths, or repeated identifier"
                )

            # Sometimes we don't see a particular sequence in the
            # alignment, so we skip that record since it isn't present in
            # that LCB/alignment
            if seq_id not in seq_regions:
                continue

            region = seq_regions[seq_id]
            has_coordinates = region['start'] != 0 or region['end'] != 0

            if 'realname' in region:
                corrected_id = region['realname']
            else:
                corrected_id = region['name']

            if has_coordinates:
                # Append "/start-end" unless the name already carries it.
                suffix = '/{start}-{end}'.format(**region)
                if corrected_id.count(suffix) == 0:
                    corrected_id += suffix

            record = SeqRecord(Seq(seq, self.alphabet),
                               id=corrected_id,
                               name=seq_id)

            record.annotations["start"] = region['start']
            record.annotations["end"] = region['end']
            record.annotations["strand"] = 1 if region['strand'] == '+' else -1

            records.append(record)
        return MultipleSeqAlignment(records, self.alphabet)
Пример #45
0
        try:
            print(next(SeqIO.parse(h, t_format, given_alpha)))
            h.close()
            assert False, "Forcing wrong alphabet, %s, should fail (%s)" \
                   % (repr(given_alpha), t_filename)
        except ValueError:
            #Good - should fail
            pass
        h.close()
    del good, bad, given_alpha, base_alpha

    if t_alignment:
        print("Testing reading %s format file %s as an alignment" \
              % (t_format, t_filename))

        alignment = MultipleSeqAlignment(
            SeqIO.parse(handle=t_filename, format=t_format))
        assert len(alignment) == t_count

        alignment_len = alignment.get_alignment_length()

        #Check the record order agrees, and double check the
        #sequence lengths all agree too.
        for i in range(t_count):
            assert compare_record(records[i], alignment[i])
            assert len(records[i].seq) == alignment_len

        print(alignment_summary(alignment))

    #Some alignment file formats have magic characters which mean
    #use the letter in this position in the first sequence.
    #They should all have been converted by the parser, but if
Пример #46
0
def write_alignments(alignments,
                     outfile=None,
                     shading_modes=["similar"],
                     logo=True,
                     hideseqs=False,
                     splitN=20,
                     secondary_structure=True,
                     save_dir=""):
    """Render alignments into one LaTeX (texshade) document and build a PDF.

    Arguments:
     - alignments - iterable whose items are either a path to a FASTA file
       or a MultipleSeqAlignment object.
     - outfile    - base name (no extension) of the generated files inside
       save_dir; a random "alignment_<uuid4>" name is used when None.
     - shading_modes, logo, hideseqs, splitN, secondary_structure - passed
       unchanged to write_alignment for every alignment.
     - save_dir   - directory receiving the .tex file and pdflatex output.

    pdflatex is looked up next to the running Python executable.  After it
    has run, the .tex/.aux/.log/.out files are removed, as are files
    matching "<name>_*.fasta" in save_dir for every alignment name
    (presumably FASTA parts left behind by write_alignment -- confirm).

    Returns the path of the expected PDF file.

    Raises RuntimeError for an item that is neither a string nor a
    MultipleSeqAlignment.
    """
    if outfile is None:
        n2 = str(uuid.uuid4())
        outfile = "alignment_{}".format(n2)

    # Every alignment name is remembered so that cleanup covers all of them
    # and does not fail with NameError when `alignments` is empty.
    names = []

    # tex.write() replaces the Python 2 only "print >> tex" statement; each
    # call adds the newline that print appended implicitly.
    with open(os.path.join(save_dir, "{}.tex".format(outfile)), "w") as tex:
        tex.write("\\documentclass[11pt,landscape]{article}\n")
        tex.write("\\usepackage{hyperref}\n")
        tex.write("\\usepackage[paperwidth={}in, paperheight=18in]{{geometry}}\n".format(
            22 / 200. * 200 + 2.5))
        tex.write("\\usepackage{texshade}\n\n")
        tex.write("\\begin{document}\n")
        for aln in alignments:
            if isinstance(aln, str):
                name = os.path.basename(aln)
                msa = MultipleSeqAlignment(list(SeqIO.parse(aln, "fasta")))
            elif isinstance(aln, MultipleSeqAlignment):
                msa = aln
                name = aln.annotations.get("name", "HistoneDB")
            else:
                raise RuntimeError(
                    "Invalid alignments: Must be a path to a FASTA format or a BioPython MultipleSequenceAlignment object."
                )
            names.append(name)

            write_alignment(tex,
                            msa,
                            name,
                            shading_modes=shading_modes,
                            logo=logo,
                            hideseqs=hideseqs,
                            splitN=splitN,
                            secondary_structure=secondary_structure,
                            save_dir=save_dir)
        tex.write("\\end{document}\n")

    #Turn latex into pdf
    pdflatex = os.path.join(os.path.dirname(sys.executable), "pdflatex")
    process = Popen([
        pdflatex, "--file-line-error", "--synctex=1",
        "-output-directory={}".format(save_dir), "--save-size=10000",
        os.path.join(save_dir, "{}.tex".format(outfile))
    ])
    process.communicate()

    # Remove the LaTeX source and the auxiliary files pdflatex produced.
    for extension in ("tex", "aux", "log", "out"):
        os.remove(os.path.join(save_dir, "{}.{}".format(outfile, extension)))

    for name in sorted(set(names)):
        for fasta_part in glob.glob(
                os.path.join(save_dir, "{}_*.fasta".format(name))):
            os.remove(fasta_part)

    return os.path.join(save_dir, "{}.pdf".format(outfile))
Пример #47
0
    def __next__(self):
        """Parse the next alignment from ``self.handle`` and return it.

        The first line must hold two integers: the number of sequences and
        the sequence length.  Each record starts on a line that
        ``self._split_id`` splits into an identifier and the first piece of
        sequence; further lines are appended (spaces removed) until the
        declared length is reached.  After the last record, lines are
        consumed until end of file or until ``self._is_header`` recognises
        the next header, which is kept in ``self._header`` for the
        following call.

        Returns:
            MultipleSeqAlignment holding one record per sequence.

        Raises:
            StopIteration: when there is no further header line.
            ValueError: on a malformed header, a record count differing
                from ``self.records_per_alignment``, a sequence that grows
                past the declared length, or a "." in a sequence.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration
        line = line.strip()
        parts = line.split()
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            length_of_seqs = int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        assert self._is_header(line)

        if self.records_per_alignment is not None and \
                self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, "
                             "told to expect %i" %
                             (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for _ in range(number_of_seqs):
            line = handle.readline().rstrip()
            sequence_id, s = self._split_id(line)
            ids.append(sequence_id)
            while len(s) < length_of_seqs:
                # The sequence may be split into multiple lines
                line = handle.readline().strip()
                if not line:
                    # Once stripped, a blank line and end of file look the
                    # same; either one ends this record.
                    break
                s = "".join([s, line.replace(" ", "")])
                if len(s) > length_of_seqs:
                    raise ValueError("Found a record of length %i, "
                                     "should be %i" % (len(s), length_of_seqs))
            if "." in s:
                raise ValueError(_NO_DOTS)
            seqs.append(s)

        while True:
            # Find other alignments in the file
            line = handle.readline()
            if not line:
                break
            if self._is_header(line):
                self._header = line
                break

        records = (SeqRecord(Seq(seq, self.alphabet),
                             id=seq_id,
                             name=seq_id,
                             description=seq_id)
                   for (seq_id, seq) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
Пример #48
0
def reconstruct_discrete_traits(tree, traits, missing_data='?', pc=1.0, sampling_bias_correction=None,
                                weights=None, verbose=0, iterations=5):
    """Reconstruct ancestral states of a discrete trait on a tree.

    Each distinct trait value is mapped to a single letter, the tips are
    turned into one-letter pseudo sequences, and a GTR model is inferred
    together with the ancestral states.

    Parameters
    ----------
    tree : str, Bio.Phylo.Tree
        name of tree file or Biopython tree object
    traits : dict
        dictionary linking tips to states
    missing_data : str, optional
        string indicating missing data
    pc : float, optional
        number of pseudo-counts to be used during GTR inference, default 1.0
    sampling_bias_correction : float, optional
        factor to inflate overall switching rate by to counteract sampling bias
    weights : str, optional
        name of file with equilibrium frequencies (tab separated when the
        name ends in "tsv", comma separated otherwise); any non-string
        value is ignored
    verbose : int, optional
        level of verbosity in output
    iterations : int, optional
        number of times non-linear optimization of overall rate and
        transmission estimation are iterated

    Returns
    -------
    tuple
        tuple of treeanc object, forward and reverse alphabets; a tuple of
        three ``None`` values when fewer than 2 or more than 180 states
        are found

    Raises
    ------
    TreeTimeError
        raise error if ancestral reconstruction errors out
    """
    unique_states = sorted(set(traits.values()))
    nc = len(unique_states)
    failure = (None, None, None)
    if nc > 180:
        print("mugration: can't have more than 180 states!", file=sys.stderr)
        return failure
    if nc < 2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return failure

    # --- single character alphabet: one letter per state, counting up from
    # --- "A", plus one extra letter standing for missing data
    alphabet = [chr(65 + offset) for offset in range(nc)]
    missing_char = chr(65 + nc)
    letter_to_state = dict(zip(alphabet, unique_states))
    letter_to_state[missing_char] = missing_data
    reverse_alphabet = {state: letter for letter, state in letter_to_state.items()}

    # --- equilibrium frequencies for the GTR model
    if type(weights) is str:
        separator = '\t' if weights[-3:] == 'tsv' else ','
        tmp_weights = pd.read_csv(weights, sep=separator, skipinitialspace=True)
        weights = {row[0]:row[1] for _, row in tmp_weights.iterrows()}
        mean_weight = np.mean(list(weights.values()))
        # states absent from the file receive the mean weight
        weights = np.array([weights.get(c, mean_weight) for c in unique_states], dtype=float)
        weights /= weights.sum()
    else:
        weights = None

    # --- GTR model on top of a dummy all-ones rate matrix
    W = np.ones((nc, nc), dtype=float)
    mugration_GTR = GTR.custom(pi = weights, W=W, alphabet = np.array(alphabet))
    mugration_GTR.profile_map[missing_char] = np.ones(nc)
    mugration_GTR.ambiguous = missing_char

    # --- tree with one-letter pseudo sequences at the tips
    treeanc = TreeAnc(tree, gtr=mugration_GTR, verbose=verbose,
                      convert_upper=False, one_mutation=0.001)
    treeanc.use_mutation_length = False

    pseudo_seqs = []
    for tip in treeanc.tree.get_terminals():
        if tip.name in traits:
            letter = reverse_alphabet[traits[tip.name]]
        else:
            letter = missing_char
        pseudo_seqs.append(SeqRecord(id=tip.name, name=tip.name, seq=Seq(letter)))
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    try:
        treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
            store_compressed=False, pc=pc, marginal=True, normalized_rate=False,
            fixed_pi=weights, reconstruct_tip_states=True)
        treeanc.optimize_gtr_rate()
    except TreeTimeError:
        print("\nAncestral reconstruction failed, please see above for error messages and/or rerun with --verbose 4\n")
        raise

    for _ in range(iterations):
        treeanc.infer_gtr(marginal=True, normalized_rate=False, pc=pc)
        treeanc.optimize_gtr_rate()

    if sampling_bias_correction:
        treeanc.gtr.mu *= sampling_bias_correction

    treeanc.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                 marginal=True, normalized_rate=False, reconstruct_tip_states=True)

    print(fill("NOTE: previous versions (<0.7.0) of this command made a 'short-branch length assumption. "
          "TreeTime now optimizes the overall rate numerically and thus allows for long branches "
          "along which multiple changes accumulated. This is expected to affect estimates of the "
          "overall rate while leaving the relative rates mostly unchanged."))

    return treeanc, letter_to_state, reverse_alphabet
Пример #49
0
def goANI_dnds_calculation(fna1, faa1, fna2, faa2, gedb, debug=False):
    '''
    Threadable command computing raw dn/ds counts between two genomes for
    a list of gene pairs.

    For each row of gedb the two protein sequences are globally aligned
    (BLOSUM62, gap open -12, gap extend -3), the protein alignment is
    turned into a codon alignment using the matching nucleotide records,
    and custom_dn_ds is run on the two codon records.  A pair that raises
    any exception is reported on stdout and stored with all four counts
    set to 0.

    Arguments:
        fna1 : .fna file of genome1
        faa1 : .faa file of genome1
        fna2 : .fna file of genome2
        faa2 : .faa file of genome2
        gedb : datatable listing the genes to align ('qry_id' / 'sbj_id'
               columns)
        debug : when true, stop after 10 successfully processed pairs

    Returns:
        dndb : data-table containing raw dn/ds information
    '''
    def load_fasta(path, alphabet):
        # index the records of a fasta file by record id
        return SeqIO.to_dict(SeqIO.parse(path, 'fasta', alphabet=alphabet))

    # load .fasta files
    g1n = load_fasta(fna1, IUPAC.IUPACUnambiguousDNA())
    g1a = load_fasta(faa1, IUPAC.protein)
    g2n = load_fasta(fna2, IUPAC.IUPACUnambiguousDNA())
    g2a = load_fasta(faa2, IUPAC.protein)

    # set up aligner
    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
    aligner.open_gap_score = -12
    aligner.extend_gap_score = -3

    # set up table
    table = defaultdict(list)
    columns = ('qry_id', 'sbj_id', 'S_changed', 'S_sites', 'N_changed',
               'N_sites')

    def add_row(*values):
        # append one value to each output column, in column order
        for column, value in zip(columns, values):
            table[column].append(value)

    # for every gene-pair to align
    n_processed = 0
    for _, row in gedb.iterrows():
        try:
            # proteins, without a trailing stop symbol
            prot1 = g1a[row['qry_id']]
            if prot1[-1] == '*':
                prot1 = prot1[:-1]
            prot2 = g2a[row['sbj_id']]
            if prot2[-1] == '*':
                prot2 = prot2[:-1]

            # matching nucleotide records
            nuc1 = g1n[row['qry_id']]
            nuc2 = g2n[row['sbj_id']]

            alignments = aligner.align(prot1.seq, prot2.seq)

            # Arbitrary cutoff to make sure this doesn't bug out
            if len(alignments) > 10000:
                print("Ahhh! {0} vs {1} has {2} alignments".format(
                    row['qry_id'], row['sbj_id'], len(alignments)))
                raise Exception('Too many alignments exception')

            # convert to multi-sequence alignment: the first and the
            # second-to-last line of the printed alignment are used as the
            # two aligned sequences
            printed = str(min(alignments)).split('\n')
            msa = MultipleSeqAlignment([
                SeqRecord(Seq(printed[0], alphabet=IUPAC.protein)),
                SeqRecord(Seq(printed[-2], alphabet=IUPAC.protein))
            ])

            # convert to codon alignment
            codon_aln = Bio.codonalign.build(msa, [nuc1, nuc2])

            # calculate dn/ds on the codon alignment
            dS, S, dN, N = custom_dn_ds(codon_aln._records[0],
                                        codon_aln._records[1])

            # save
            add_row(row['qry_id'], row['sbj_id'], dS, S, dN, N)

            n_processed += 1
            if debug and n_processed >= 10:
                break

        except Exception as e:
            print("Alignment exception- {0}".format(e))
            add_row(row['qry_id'], row['sbj_id'], 0, 0, 0, 0)

    return pd.DataFrame(table)
Пример #50
0
# Read every line of the already opened alignment file; each line is
# treated as one aligned sequence.
MSA0 = JAlign.readlines()
'''Next turn the line in to a sequence object'''
MSA = []      # SeqRecords holding the aligned lines as read (gaps kept)
MSAList =[]   # Seq objects holding only the filtered characters of each line
count=1       # running number used as the record id, starting at 1
for seq in MSA0:
    # Drop the last character of the line (presumably the trailing
    # newline -- confirm the file always ends lines with one).
    Seqlist = list(seq[:-1])
    Seq1 = Seq(''.join(Seqlist))
    # Keep characters that are not '-' and that equal their own upper-case
    # form, i.e. discard gaps and lower-case letters.
    Seqlist = [Seqlist[x] for x in range(len(Seqlist)) if (Seqlist[x] != '-' and Seqlist[x].upper()==Seqlist[x])]
    SEQ = Seq(''.join(Seqlist))
    MSA.append(SeqRecord(Seq1,id=str(count)))
    MSAList.append(SEQ)
    count+=1

# Replace the list of records by an alignment object built from them.
MSA = MultipleSeqAlignment(MSA)


'''indices for the maps'''
def AlignIndices(MSAline):
    """Return the positions in MSAline that hold a kept character.

    A character is kept when it is not '-' and equals its own upper-case
    form, so gaps and lower-case letters are left out.
    """
    return [position
            for position, char in enumerate(MSAline)
            if char != '-' and char.upper() == char]


def TreeDistMat(AlignObject):
    """Return the 'identity' model distance matrix computed by
    DistanceCalculator for AlignObject."""
    return DistanceCalculator('identity').get_distance(AlignObject)
Пример #51
0
def write(sequences, handle, format):
    """Write complete set of sequences to a file.

    Arguments:
     - sequences - A list (or iterator) of SeqRecord objects, or a single
       SeqRecord.
     - handle    - File handle object to write to, or filename as string.
     - format    - lower case string describing the file format to write.

    The format is looked up, in order, among the per-record string
    formatters, the sequence writer classes and finally the Bio.AlignIO
    writers (in which case all records are written as one alignment).

    Note if providing a file handle, your code should close the handle
    after calling this function (to ensure the data gets flushed to disk).

    Returns the number of records written (as an integer).
    """
    from Bio import AlignIO

    # Argument checks, phrased to give helpful error messages.
    if not isinstance(format, str):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if not format.islower():
        raise ValueError("Format string '%s' should be lower case" % format)

    # Catch swapped arguments: the handle must not be record data.
    if isinstance(handle, SeqRecord):
        raise TypeError("Check arguments, handle should NOT be a SeqRecord")
    if isinstance(handle, list):
        # e.g. list of SeqRecord objects
        raise TypeError("Check arguments, handle should NOT be a list")

    # A single record is treated as a one-element list.
    if isinstance(sequences, SeqRecord):
        sequences = [sequences]

    # 1) formats with a simple record -> string function
    to_string = _FormatToString.get(format)
    if to_string is not None:
        written = 0
        with as_handle(handle, "w") as stream:
            for written, record in enumerate(sequences, start=1):
                stream.write(to_string(record))
        return written

    # 2) formats with a dedicated writer class
    writer_class = _FormatToWriter.get(format)
    if writer_class is not None:
        written = writer_class(handle).write_file(sequences)
        if not isinstance(written, int):
            raise RuntimeError(
                "Internal error - the underlying %s writer "
                "should have returned the record count, not %r" %
                (format, written))
        return written

    # 3) alignment formats: turn all the records into a single alignment
    #    and write that using Bio.AlignIO
    if format in AlignIO._FormatToWriter:
        alignment = MultipleSeqAlignment(sequences)
        n_alignments = AlignIO.write([alignment], handle, format)
        if n_alignments != 1:
            raise RuntimeError("Internal error - the underlying writer "
                               "should have returned 1, not %r" %
                               n_alignments)
        return len(alignment)

    # Nothing can write this format; say whether it is at least readable.
    if format in _FormatToIterator or format in AlignIO._FormatToIterator:
        raise ValueError("Reading format '%s' is supported, but not writing" %
                         format)

    raise ValueError("Unknown format '%s'" % format)
Пример #52
0
def reconstruct_discrete_traits(tree, traits, missing_data='?', pc=1.0, sampling_bias_correction=None,
                                weights=None, verbose=0):
    """Infer ancestral states of a discrete trait on a tree ("mugration" model).

    Every observed state is encoded as a single character, a GTR model over
    these characters is set up, and the states are reconstructed by treating
    the encoded traits as a one-column alignment handed to ``TreeAnc``.

    Parameters
    ----------
    tree
        Tree passed on unchanged to ``TreeAnc``.
    traits : dict
        Maps terminal node names to their state. Terminals that are absent
        from the mapping are treated as missing data.
    missing_data : str
        Value in ``traits`` that marks an unknown state. It is never counted
        as an observed state.
    pc : float
        Forwarded as ``pc`` to ``infer_ancestral_sequences``.
    sampling_bias_correction : float or None
        When truthy, the inferred rate ``mu`` is multiplied by this factor and
        the reconstruction is repeated without re-inferring the GTR model.
    weights : str or None
        Path of a table whose first column holds states and whose second
        column holds their weights. The file is read tab-separated when the
        path ends in ``tsv`` and comma-separated otherwise. States without an
        entry get the mean weight. Any non-string value is ignored.
    verbose : int
        Forwarded to ``TreeAnc``.

    Returns
    -------
    tuple or int
        ``(treeanc, letter_to_state, reverse_alphabet)`` on success. The
        integer ``1`` is returned when there are fewer than 2 or more than
        180 observed states, or when the reconstruction reports an error.
    """
    # The missing-data marker is not an observable state: giving it its own
    # letter would add a state to the model that no leaf can ever carry.
    unique_states = sorted(set(traits.values()) - {missing_data})
    nc = len(unique_states)
    if nc > 180:
        print("mugration: can't have more than 180 states!", file=sys.stderr)
        return 1
    elif nc < 2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return 1

    ###########################################################################
    ### make a single character alphabet that maps to discrete states
    ###########################################################################
    alphabet = [chr(65 + i) for i in range(nc)]
    # the character right after the last state letter stands for "unknown"
    missing_char = chr(65 + nc)
    letter_to_state = dict(zip(alphabet, unique_states))
    letter_to_state[missing_char] = missing_data
    reverse_alphabet = {state: letter for letter, state in letter_to_state.items()}

    ###########################################################################
    ### construct gtr model
    ###########################################################################
    if isinstance(weights, str):
        tmp_weights = pd.read_csv(weights, sep='\t' if weights.endswith('tsv') else ',',
                                  skipinitialspace=True)
        # Address the two columns by position; integer keys on a
        # label-indexed row are deprecated in pandas.
        state_weights = dict(zip(tmp_weights.iloc[:, 0], tmp_weights.iloc[:, 1]))
        mean_weight = np.mean(list(state_weights.values()))
        weights = np.array([state_weights.get(state, mean_weight) for state in unique_states],
                           dtype=float)
        weights /= weights.sum()
    else:
        # anything that is not a file path is deliberately dropped
        weights = None

    # set up dummy matrix
    W = np.ones((nc, nc), dtype=float)

    mugration_GTR = GTR.custom(pi=weights, W=W, alphabet=np.array(alphabet))
    mugration_GTR.profile_map[missing_char] = np.ones(nc)
    mugration_GTR.ambiguous = missing_char

    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(tree, gtr=mugration_GTR, verbose=verbose,
                      convert_upper=False, one_mutation=0.001)
    treeanc.use_mutation_length = False
    # one single-letter pseudo sequence per terminal node
    pseudo_seqs = [SeqRecord(id=n.name, name=n.name,
                             seq=Seq(reverse_alphabet[traits[n.name]]
                                     if n.name in traits else missing_char))
                   for n in treeanc.tree.get_terminals()]
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    ndiff = treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
            store_compressed=False, pc=pc, marginal=True, normalized_rate=False,
            fixed_pi=weights)

    if ndiff == ttconf.ERROR:  # if reconstruction failed, exit
        return 1

    if sampling_bias_correction:
        treeanc.gtr.mu *= sampling_bias_correction
        treeanc.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                          marginal=True, normalized_rate=False)
    return treeanc, letter_to_state, reverse_alphabet
Пример #53
0
    return features

    #prof=cons_prof(alignment)
    #pylab.plot(prof)


if __name__ == '__main__':
    human_h2a_z_core = Seq(
        'SRSQRAGLQFPVGRIHRHLKSRTTSHGRVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRITPRHLQLAIRGDEELDSLI-KATIAGGGVIPHIHKSLIG'
    )
    xenopus_h2a_core = Seq(
        'TRSSRAGLQFPVGRVHRLLRKGNYAE-RVGAGAPVYLAAVLEYLTAEILELAGNAARDNKKTRIIPRHLQLAVRNDEELNKLLGRVTIAQGGVLPNIQSVLLP'
    )

    # human_h2a_z_core=Seq('SRSQRAGLQFPVGRIHRHLKSRTTSHGRVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRITPRHLQLAIRGDEELDSLIKATIAGGGVIPHIHKSLIG')
    msa = MultipleSeqAlignment(
        [SeqRecord(xenopus_h2a_core, id='H2A', name='H2A')])
    features = get_hist_ss_in_aln_for_shade(msa, below=True)
    # features=[{'style':'fill:$\uparrow$','sel':[5,10],'text':'test'}]
    print(features)
    shade_aln2png(msa,
                  filename='default',
                  shading_modes=['charge_functional'],
                  legend=False,
                  features=features,
                  title='',
                  logo=False,
                  hideseqs=False,
                  splitN=20,
                  setends=[],
                  ruler=False,
                  show_seq_names=False,
Пример #54
0
    def build_hsp():
        """Assemble the two-row (query, match) alignment for the current hit.

        All inputs (sequences, tag dictionaries, ids, descriptions, alphabet
        and the handle) are taken from the enclosing scope.  Raises
        ValueError when neither tag dictionary holds data or when the two
        extracted alignment regions differ in length.
        """
        if not (query_tags or match_tags):
            raise ValueError("No data for query %r, match %r" %
                             (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        evalue = align_tags.get("fa_expect")  # looked up but not used below
        tool = global_tags.get("tool", "").upper()

        q = _extract_alignment_region(query_seq, query_tags)
        # Quick hack until I can work out how -, * and / characters
        # and the apparent mix of aa and bp coordinates works: for TFASTX
        # the match is taken as-is when it already has the query's length.
        m = (match_seq
             if tool == "TFASTX" and len(match_seq) == len(q)
             else _extract_alignment_region(match_seq, match_tags))
        if len(q) != len(m):
            message = """Darn... amino acids vs nucleotide coordinates?
            tool: {0}
            query_seq: {1}
            query_tags: {2}
            {3} length: {4}
            match_seq: {5}
            match_tags: {6}
            {7} length: {8}
            handle.name: {9}
            """.format(tool, query_seq, query_tags, q, len(q), match_seq,
                       match_tags, m, len(m), handle.name)
            raise ValueError(message)

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # See also Bio/AlignIO/MafIO.py for same requirement.
        # For now, store the annotation a new private property.  Header tags
        # go in first so that alignment tags win on a key clash.
        alignment._annotations = {}
        for tag_source in (header_tags, align_tags):
            for key, value in tag_source.items():
                alignment._annotations[key] = value

        def add_row(letters, tags, row_id, row_name, row_descr):
            """Append one record to the alignment, then refine its alphabet."""
            row = SeqRecord(
                Seq(letters, alphabet),
                id=row_id,
                name=row_name,
                description=row_descr,
                annotations={"original_length": int(tags["sq_len"])})
            # TODO - handle start/end coordinates properly. Short term hack for now:
            row._al_start = int(tags["al_start"])
            row._al_stop = int(tags["al_stop"])
            alignment.append(row)

            # TODO - What if a specific alphabet has been requested?
            # TODO - Use an IUPAC alphabet?
            # TODO - Can FASTA output RNA?
            # This is still a very crude way of dealing with the alphabet:
            if alphabet == single_letter_alphabet and "sq_type" in tags:
                seq_type = tags["sq_type"]
                if seq_type == "D":
                    row.seq.alphabet = generic_dna
                elif seq_type == "p":
                    row.seq.alphabet = generic_protein
            if "-" in letters and not hasattr(row.seq.alphabet, "gap_char"):
                row.seq.alphabet = Gapped(row.seq.alphabet, "-")

        add_row(q, query_tags, query_id, "query", query_descr)
        add_row(m, match_tags, match_id, "match", match_descr)

        return alignment
Пример #55
0
    def next(self):
        """Read from the handle and return the next pairwise alignment.

        The query and the match/library sequence are returned as a
        MultipleSeqAlignment object containing two rows, named "query" and
        "match".  The query header tags and the per-hit tags are merged into
        the alignment's private ``_annotations`` dictionary.

        Raises:
            StopIteration: when no further line can be read, or when the
                ">>><<<" marker that ends the alignments is reached.
            ValueError: when an expected '>>' / '>' line is missing, when the
                two extracted regions differ in length, or when the reported
                ``sw_overlap`` disagrees with the extracted length.
        """
        handle = self.handle

        try:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            raise StopIteration

        if line.startswith("#"):
            # Skip the file header before the alignments.
            line = self._skip_file_header(line)
        while ">>>" in line and not line.startswith(">>>"):
            # Moved onto the next query sequence!
            self._query_descr = ""
            self._query_header_annotation = {}
            # Read in the query header
            line = self._parse_query_header(line)
            # Now should be some alignments, but if not we move onto the next query
        if not line:
            # End of file
            raise StopIteration
        if ">>><<<" in line:
            # Reached the end of the alignments, no need to read the footer...
            raise StopIteration

        # Should start >>... and not >>>...
        assert line[0:2] == ">>" and not line[2] == ">", line

        query_seq_parts, match_seq_parts = [], []
        query_annotation, match_annotation = {}, {}
        match_descr = ""
        alignment_annotation = {}

        # This should be followed by the target match ID line, then more tags.
        # e.g.
        #
        #   >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
        #   ; fa_frame: f
        #   ; fa_initn:  52
        #   ; fa_init1:  52
        #   ; fa_opt:  70
        #   ; fa_z-score: 105.5
        #   ; fa_bits: 27.5
        #   ; fa_expect:  0.082
        #   ; sw_score: 70
        #   ; sw_ident: 0.279
        #   ; sw_sim: 0.651
        #   ; sw_overlap: 43
        if (not line[0:2] == ">>") or line[0:3] == ">>>":
            raise ValueError("Expected target line starting '>>'")
        match_descr = line[2:].strip()
        # Handle the following "alignment hit" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, alignment_annotation)
        assert not line[0:2] == "; "

        # Then we have the alignment numbers and sequence for the query
        #
        #   >gi|10955265| ..
        #   ; sq_len: 346
        #   ; sq_offset: 1
        #   ; sq_type: p
        #   ; al_start: 197
        #   ; al_stop: 238
        #   ; al_display_start: 167
        #   DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
        #   QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
        #   GEYFTENKPKYIIREIHQET
        if not (line[0] == ">" and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..'")
        assert self._query_descr.startswith(line[1:].split(None, 1)[0])

        # Handle the following "query alignment" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, query_annotation)
        assert not line[0:2] == "; "

        # Now should have the aligned query sequence (with leading flanking region)
        while not line[0] == ">":
            query_seq_parts.append(line.strip())
            line = handle.readline()

        # Handle the following "match alignment" data
        #
        #   >gi|152973545|ref|YP_001338596.1| ..
        #   ; sq_len: 242
        #   ; sq_type: p
        #   ; al_start: 52
        #   ; al_stop: 94
        #   ; al_display_start: 22
        #   IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
        #   RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
        #   QDFAFTRKMRREARQVEQSW
        #
        # Match identifier
        if not (line[0] == ">" and line.strip().endswith("..")):
            raise ValueError(
                "Expected line starting '>' and ending '..', got '%s'" %
                repr(line))
        assert match_descr.startswith(line[1:].split(None, 1)[0])

        # Tagged data,
        line = handle.readline()
        line = self._parse_tag_section(line, match_annotation)
        assert not line[0:2] == "; "

        # Now should have the aligned match sequence with flanking region...
        # but after that, since FASTA 35.4.1 there can be a consensus here,
        #
        #   ; al_cons:
        #   .::. : :. ---.  :: :. . :  ..-:::-:  :.:  ..:...:
        #   etc
        while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
            match_seq_parts.append(line.strip())
            line = handle.readline()
        if line[0:2] == "; ":
            assert line.strip() == "; al_cons:"
            # The consensus is not used, so its lines are only read past.
            # If we do anything with this in future, must remove any
            # flanking region.
            line = handle.readline()
            while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
                line = handle.readline()
            assert not line[0:2] == "; "
        assert (line[0] == ">" or ">>>" in line)
        # Keep the line that starts the next record for the following call.
        self._header = line

        # We built a list of strings and then joined them because
        # its faster than appending to a string.
        query_seq = "".join(query_seq_parts)
        match_seq = "".join(match_seq_parts)
        del query_seq_parts, match_seq_parts
        # Note, query_seq and match_seq will usually be of different lengths, apparently
        # because in the m10 format leading gaps are added but not trailing gaps!

        # Remove the flanking regions,
        query_align_seq = self._extract_alignment_region(
            query_seq, query_annotation)
        match_align_seq = self._extract_alignment_region(
            match_seq, match_annotation)
        # How can we do this for the (optional) consensus?

        # The "sq_offset" values can be specified with the -X command line option.
        # They appear to just shift the origin used in the calculation of the coordinates.

        if len(query_align_seq) != len(match_align_seq):
            raise ValueError(
                "Problem parsing the alignment sequence coordinates, "
                "following should be the same length but are not:\n"
                "%s - len %i\n%s - len %i" %
                (query_align_seq, len(query_align_seq), match_align_seq,
                 len(match_align_seq)))
        if "sw_overlap" in alignment_annotation:
            if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
                raise ValueError("Specified sw_overlap = %s does not match expected value %i"
                                 % (alignment_annotation["sw_overlap"],
                                    len(query_align_seq)))

        # TODO - Look at the "sq_type" to assign a sensible alphabet?
        alphabet = self.alphabet
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags.
        # items() rather than iteritems(): the latter exists on Python 2 only.
        for key, value in self._query_header_annotation.items():
            alignment._annotations[key] = value
        for key, value in alignment_annotation.items():
            alignment._annotations[key] = value

        # Query
        # =====
        record = SeqRecord(
            Seq(query_align_seq, alphabet),
            id=self._query_descr.split(None, 1)[0].strip(","),
            name="query",
            description=self._query_descr,
            annotations={"original_length": int(query_annotation["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_annotation["al_start"])
        record._al_stop = int(query_annotation["al_stop"])
        alignment.append(record)

        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_annotation:
            if query_annotation["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_annotation["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in query_align_seq:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        # Match
        # =====
        record = SeqRecord(
            Seq(match_align_seq, alphabet),
            id=match_descr.split(None, 1)[0].strip(","),
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_annotation["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_annotation["al_start"])
        record._al_stop = int(match_annotation["al_stop"])
        alignment.append(record)

        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_annotation:
            if match_annotation["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_annotation["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in match_align_seq:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
Пример #56
0
    def setUp(self):
        """Create three fixture sets of a protein alignment plus its DNA records.

        Each set stores the protein alignment as ``self.alnN`` and the list
        of nucleotide records as ``self.seqlistN``; set 3 additionally
        stores a codon table as ``self.codontable3``.
        """

        def dna(letters, ident):
            # every nucleotide record gets its own alphabet instance
            return SeqRecord(
                Seq(letters, alphabet=IUPAC.IUPACUnambiguousDNA()), id=ident)

        def protein(letters, ident):
            return SeqRecord(Seq(letters, alphabet=IUPAC.protein), id=ident)

        # Test set 1
        self.aln1 = MultipleSeqAlignment([
            protein("SGTARTKLLLLLAALCAAGGALE", "pro1"),
            protein("SGTSRTKRLLLLAALGAAGGALE", "pro2"),
        ])
        self.seqlist1 = [
            dna("TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG",
                "pro1"),
            dna("TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG",
                "pro2"),
        ]

        # Test set 2
        # Relative to the first nucleotide record, the second one lacks a
        # base after "...GAGTT" and another after "...TTCAACAA", and the
        # third one lacks two bases after "...CTCACCC".
        self.aln2 = MultipleSeqAlignment([
            protein("MKKHELLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL",
                    "pro1"),
            protein("MKKHEFLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL",
                    "pro2"),
            protein("MKKHELLCQGTSNKLTLLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL",
                    "pro3"),
        ])
        self.seqlist2 = [
            dna("ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC",
                "pro1"),
            dna("ATGAAAAAGCACGAGTTCTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAATGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC",
                "pro2"),
            dna("ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC",
                "pro3"),
        ]

        # Test set 3
        # use Yeast mitochondrial codon table
        self.aln3 = MultipleSeqAlignment([
            protein("MARDHPVGHWYDRVYLQSSNTSFTKTIQ", "pro6"),
            protein("MARHHPVEHWYDRVYLQSSNVSTTKTIQ", "pro7"),
            protein("MAGDHPVGHWYDRVYTQSSNHSFTMTIQ", "pro8"),
        ])
        self.seqlist3 = [
            dna("ATGGCAAGGGACCACCCAGTTGGGCACTGATATGATCGGGTGTATTTGCAGAGTAGTAACCTTTCTTTTCTCAAGACCATCCAG",
                "pro6"),
            dna("ATGGCAAGGCACCATCCAGTTGAGCACTGATATGATCGGGTGTATTTGCAGAGTAGTAACGTGTCTCTGCTCAAGACCATCCAG",
                "pro7"),
            dna("ATGGCAGGGGACCACCCAGTTGGGCACTGATATGATCGTGTGTATCTGCAGAGTAGTAACCACTCTTTTCTCATGACCATCCAG",
                "pro8"),
        ]
        self.codontable3 = CodonTable.unambiguous_dna_by_id[3]
Пример #57
0
def MafIterator(handle, seq_count=None):
    """Yield the alignment blocks of a MAF handle as MultipleSeqAlignment objects.

    ``handle`` is consumed line by line with ``next()``.  An "a" line opens a
    block and supplies its key=value annotations, each following "s" line
    adds one SeqRecord (its id and name are the source field), and a blank
    line or the end of the input closes the block.  When ``seq_count`` is
    given, every block is asserted to hold exactly that many records.

    Raises ValueError for a malformed "a" or "s" line, for a dot/period in
    the first sequence of a block, and for an unrecognised line inside a
    block.
    """
    annotations = []
    records = []
    in_a_bundle = False

    while True:
        # An exhausted handle reads as "", which also acts as the blank line
        # that closes a trailing bundle - no duplicated end-of-file code.
        line = next(handle, "")

        if not in_a_bundle:
            if line.startswith("a"):
                # start a bundle of records
                in_a_bundle = True
                pairs = line.strip().split()[1:]
                if len(pairs) != line.count("="):
                    raise ValueError("Error parsing alignment - invalid key in 'a' line")
                annotations = dict(pair.split("=") for pair in pairs)
            elif not line:
                # end of input outside of any bundle
                break
            # comments and any other line between bundles are ignored
            continue

        if line.startswith("s"):
            # add a SeqRecord to the bundle
            # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
            fields = line.strip().split()
            if len(fields) != 7:
                raise ValueError(
                    "Error parsing alignment - 's' line must have 7 fields"
                )

            # convert MAF-style +/- strand to biopython-type 1/-1
            # TODO: issue warning, set to 0 for anything other than +/-?
            strand = -1 if fields[4] == "-" else 1

            anno = {
                "start": int(fields[2]),
                "size": int(fields[3]),
                "strand": strand,
                "srcSize": int(fields[5]),
            }

            sequence = fields[6]
            # interpret a dot/period to mean the same as the first sequence
            if "." in sequence:
                if not records:
                    raise ValueError(
                        "Found dot/period in first sequence of alignment"
                    )
                reference = str(records[0].seq)
                sequence = "".join(
                    [ref_letter if letter == "." else letter
                     for letter, ref_letter in zip(sequence, reference)]
                )

            source = fields[1]
            records.append(
                SeqRecord(
                    Seq(sequence),
                    id=source,
                    name=source,
                    description="",
                    annotations=anno,
                )
            )
        elif line.startswith(("i", "e", "q", "#")):
            # TODO: "i" - what is in the aligned species DNA before and
            #       after the immediately preceding "s" line
            # TODO: "e" - size of the gap between the alignments that span
            #       the current block
            # TODO: "q" - quality of each aligned base; could be stored in
            #       each SeqRecord's .letter_annotations dictionary
            # "#" - comments (not sure they are in the maf specification)
            pass
        elif not line.strip():
            # end a bundle of records
            assert seq_count is None or len(records) == seq_count

            alignment = MultipleSeqAlignment(records)
            # TODO - Introduce an annotated alignment class?
            # See also Bio/AlignIO/FastaIO.py for same requirement.
            # For now, store the annotation a new private property:
            alignment._annotations = annotations

            yield alignment

            in_a_bundle = False
            annotations, records = [], []
        else:
            raise ValueError(
                "Error parsing alignment - unexpected line:\n%s" % (line,)
            )
Пример #58
0
def _is_suspect_polymorphic_column(base_counts, iupac_bases):
    """Return True when one alignment column should be flagged as polymorphic.

    ``base_counts`` maps each character seen in the column to its count;
    ``iupac_bases`` is the set of IUPAC ambiguity codes.  The rules are:

    * 1 state: never flagged.
    * 2 states: flagged only if neither state is a gap nor an ambiguity code.
    * 3 states: flagged unless the column holds a gap and exactly one
      ambiguity code.
    * any other number of states: always flagged.
    """
    n_states = len(base_counts)
    if n_states == 1:
        return False

    # A gap counts wherever it ranks by frequency.  (Checking only the last,
    # i.e. least frequent, state would miss columns where "-" is the majority.)
    has_gap = "-" in base_counts
    n_iupac = sum(1 for base in base_counts if base in iupac_bases)

    if n_states == 2:
        return not has_gap and n_iupac == 0
    if n_states == 3:
        return not (has_gap and n_iupac == 1)
    return True


def rm_wrong_polymorphism_sites(seq_directory, outgroup_path, window_size = 20, Max_p_sites = 4):
    """Select densely polymorphic columns of each alignment and run trimal on them.

    Input alignments are read from the directory obtained by replacing
    "s1_Gene/" with "s6_trimal/" in *seq_directory*; results go to
    "<s7_well_trimal dir>/s1_rm_polymorphism_sites/", which is created if
    missing.  For every FASTA alignment the records whose id is listed by
    ``input_outgroup(outgroup_path)`` are ignored, and the remaining columns
    are scanned in windows produced by ``window(range(length), window_size)``
    (presumably sliding windows of column indices -- the helper is defined
    elsewhere).  Whenever a window holds more than *Max_p_sites* flagged
    columns, those columns are collected.  The first and last 10 columns are
    always added, and the combined list is passed to ``trimal -selectcols``.

    Args:
        seq_directory: path containing the "s1_Gene/" component.
        outgroup_path: path handed to ``input_outgroup``.
        window_size: number of columns per window.
        Max_p_sites: a window must exceed this many flagged columns before
            its flagged columns are selected.

    Returns:
        None.  The work is done through the ``trimal`` / ``cp`` commands.
    """
    ### define iupac
    iupac_bases = frozenset("mrwsykMRWSYKvhdbVHDB")
    # Number of leading and trailing columns that are always selected.
    edge_columns = 10

    ### input files are from s6
    genes_result_s6 = seq_directory.replace("s1_Gene/", "s6_trimal/")

    ### mkdir output directory for s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")

    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    ### iterate each gene
    for file_name in os.listdir(genes_result_s6):
        if file_name == ".DS_Store":
            continue

        output_directory_file = output_directory + file_name
        fasta_name = genes_result_s6 + file_name

        ### read each alignment sequences
        for sequence in glob(fasta_name):
            print("sequence: " +sequence)

            alignment = AlignIO.read(sequence, 'fasta')

            ### generate a new alignment sequences without outgroups.
            align = MultipleSeqAlignment([])
            for record in alignment:
                if record.id not in outgroups:
                    align.append(SeqRecord(Seq(str(record.seq)), id=str(record.id)))

            print(align)

            ### change alignment to an array (one row per record).
            align_array = np.array([list(rec) for rec in align])

            ### calculate the whole length of the alignment
            total_length = align.get_alignment_length()

            total_wrong_poly_sites = []
            for each in window(range(total_length), window_size):
                column_position = [
                    column
                    for column in each
                    if _is_suspect_polymorphic_column(
                        Counter(align_array[:, column]), iupac_bases
                    )
                ]

                ### too many polymorphic sites in this window: select them.
                if len(column_position) > float(Max_p_sites):
                    print(column_position)
                    total_wrong_poly_sites.extend(column_position)

            # Always select the alignment edges.  Both ranges are clamped so
            # that alignments shorter than ``edge_columns`` never produce
            # negative or out-of-range column indices.
            total_wrong_poly_sites.extend(range(min(edge_columns, total_length)))
            total_wrong_poly_sites.extend(
                range(max(total_length - edge_columns, 0), total_length)
            )

            ### generate the unique positions as plain ints, sorted.
            # Plain ints keep the command text independent of how NumPy
            # scalars are rendered by repr().
            unique_wrong_sites = sorted({int(site) for site in total_wrong_poly_sites})
            print(len(unique_wrong_sites))

            ### operating: if any columns were selected, hand them to trimal;
            ### otherwise, copy the gene to the new folder unchanged.
            # NOTE(review): paths are interpolated into a shell command
            # unquoted, so names containing spaces or shell metacharacters
            # will break these commands.
            if len(unique_wrong_sites) > 0:
                # The braces are backslash-escaped for the shell.
                cmd_selected_col = (
                    "\\{ " + ",".join(str(site) for site in unique_wrong_sites) + " \\}"
                )
                print(cmd_selected_col)

                cmd = "trimal -in " + fasta_name + " -out " + output_directory_file + " -selectcols " + cmd_selected_col

                print(cmd)
                os.system(cmd)

            else:
                cmd_2 = "cp " + fasta_name + " " + output_directory_file
                print(cmd_2)
                os.system(cmd_2)
Пример #59
0
    def __next__(self):
        """Parse the next alignment from the handle.

        Expects a "# STOCKHOLM 1.0" header, taken from ``self._header`` when
        the previous call already read it, otherwise from the handle.  Lines
        are then consumed until end of file or the next header.  Sequence
        lines for the same identifier are concatenated (interleaved blocks)
        with "." converted to "-"; "#=GF", "#=GC", "#=GS" and "#=GR" markup
        lines are collected into dictionaries.

        Side effects: sets ``self.ids``, ``self.sequences``,
        ``self.seq_annotation`` and ``self.seq_col_annotation``, and stores a
        following header line in ``self._header``.

        Returns:
            A MultipleSeqAlignment whose ``column_annotations`` hold the
            "#=GC" markup and whose private ``_annotations`` holds the
            "#=GR" markup dictionary.

        Raises:
            StopIteration: at end of file, or when the block contains no
                sequences.
            ValueError: for a missing header, a sequence line that cannot be
                split, inconsistent sequence or "#=GC" lengths, or a record
                count differing from ``self.records_per_alignment``.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            # Empty file - just give up.
            raise StopIteration
        if line.strip() != "# STOCKHOLM 1.0":
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = {}  # Really only need an OrderedSet, but python lacks this
        gs = {}
        gr = {}
        gf = {}  # collected below but not attached to the alignment here
        gc = {}
        passed_end_alignment = False
        while True:
            line = handle.readline()
            if not line:
                break  # end of file
            line = line.strip()  # remove trailing \n
            if line == "# STOCKHOLM 1.0":
                # Start of the next alignment: keep it for the next call.
                self._header = line
                break
            elif line == "//":
                # The "//" line indicates the end of the alignment.
                # There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                # blank line, ignore
                pass
            elif line[0] != "#":
                # Sequence
                # Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError(
                        "Could not split line into identifier and sequence:\n"
                        + line)
                seq_id, seq = parts
                if seq_id not in ids:
                    ids[seq_id] = True
                seqs.setdefault(seq_id, "")
                seqs[seq_id] += seq.replace(".", "-")
            elif len(line) >= 5:
                # Comment line or meta-data
                if line[:5] == "#=GF ":
                    # Generic per-File annotation, free text
                    # Format: #=GF <feature> <free text>
                    # The free text may be empty (the line was stripped
                    # above), so do not insist on two fields; this mirrors
                    # the "#=GS" handling below.
                    fields = line[5:].strip().split(None, 1)
                    feature = fields[0]
                    text = fields[1] if len(fields) > 1 else ""
                    # Each feature key could be used more than once,
                    # so store the entries as a list of strings.
                    gf.setdefault(feature, []).append(text)
                elif line[:5] == "#=GC ":
                    # Generic per-Column annotation, exactly 1 char per column
                    # Format: "#=GC <feature> <exactly 1 char per column>"
                    # Markup containing whitespace yields three fields and
                    # fails to unpack with ValueError.
                    feature, text = line[5:].strip().split(None, 2)
                    if feature not in gc:
                        gc[feature] = ""
                    gc[feature] += text.strip()  # append to any previous entry
                    # Might be interleaved blocks, so can't check length yet
                elif line[:5] == "#=GS ":
                    # Generic per-Sequence annotation, free text
                    # Format: "#=GS <seqname> <feature> <free text>"
                    try:
                        seq_id, feature, text = line[5:].strip().split(None, 2)
                    except ValueError:
                        # Free text can sometimes be empty, which a one line split throws an error for.
                        # See https://github.com/biopython/biopython/issues/2982 for more details
                        seq_id, feature = line[5:].strip().split(None, 1)
                        text = ""
                    gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
                elif line[:5] == "#=GR ":
                    # Generic per-Sequence AND per-Column markup
                    # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    seq_id, feature, text = line[5:].strip().split(None, 2)
                    if seq_id not in gr:
                        gr[seq_id] = {}
                    if feature not in gr[seq_id]:
                        gr[seq_id][feature] = ""
                    # append to any previous entry
                    gr[seq_id][feature] += text.strip()
                    # Might be interleaved blocks, so can't check length yet
            # Next line...

        assert len(seqs) <= len(ids)

        self.ids = ids.keys()
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if not (ids and seqs):
            # Header (and maybe markup) but no sequences: nothing to return.
            raise StopIteration

        if (self.records_per_alignment is not None
                and self.records_per_alignment != len(ids)):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (len(ids), self.records_per_alignment))

        # Every sequence must match the length of the first one seen.
        alignment_length = len(next(iter(seqs.values())))
        records = []  # Alignment obj will put them all in a list anyway
        for seq_id in ids:
            seq = seqs[seq_id]
            if alignment_length != len(seq):
                raise ValueError(
                    "Sequences have different lengths, or repeated identifier"
                )
            name, start, end = self._identifier_split(seq_id)
            # Accession will be overridden by _populate_meta_data if an
            # explicit accession is provided.
            record = SeqRecord(
                Seq(seq),
                id=seq_id,
                name=name,
                description=seq_id,
                annotations={"accession": name},
            )

            if start is not None:
                record.annotations["start"] = start
            if end is not None:
                record.annotations["end"] = end

            self._populate_meta_data(seq_id, record)
            records.append(record)

        # Interleaved "#=GC" pieces are complete now, so lengths can be checked.
        for k, v in gc.items():
            if len(v) != alignment_length:
                raise ValueError("%s length %i, expected %i" %
                                 (k, len(v), alignment_length))
        alignment = MultipleSeqAlignment(records)

        for k, v in sorted(gc.items()):
            if k in self.pfam_gc_mapping:
                alignment.column_annotations[self.pfam_gc_mapping[k]] = v
            elif k.endswith("_cons") and k[:-5] in self.pfam_gr_mapping:
                alignment.column_annotations[self.pfam_gr_mapping[k[:-5]]] = v
            else:
                # Unmapped feature: keep it under a "GC:"-prefixed key.
                alignment.column_annotations["GC:" + k] = v

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = gr

        return alignment
Пример #60
0
    def next(self):
        """Parse and return the next alignment from a CLUSTAL-style handle.

        The header line comes from ``self._header`` when the previous call
        already read it (concatenated alignments), otherwise from the handle.
        The first block fixes the identifiers and the column range of the
        sequence text; later blocks must repeat the identifiers in the same
        order and are appended to the sequences (and to the consensus line,
        when the first block had one).

        Returns:
            A MultipleSeqAlignment built with ``self.alphabet``.  The header
            version, if found, is stored as ``_version`` and the consensus
            line, if any, as ``_star_info``.

        Raises:
            StopIteration: at end of file, when nothing follows the header,
                or when no sequence data was found.
            ValueError: for an unknown header, an unparsable sequence line,
                identifiers out of order, a wrong letter count, or a record
                count differing from ``self.records_per_alignment``.
            AssertionError: for the layout checks done with ``assert``.
        """
        handle = self.handle
        try:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            raise StopIteration

        # Whitelisted headers we know about
        known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS']
        header_words = line.split()
        # A whitespace-only line has no words; report it as a bad header
        # instead of failing on an index lookup.
        program = header_words[0] if header_words else ""
        if program not in known_headers:
            raise ValueError(
                "%s is not a known CLUSTAL header: %s" %
                (program, ", ".join(known_headers)))

        # find the clustal version in the header line
        version = None
        for word in header_words:
            if word[0] == '(' and word[-1] == ')':
                word = word[1:-1]
            # "()" leaves an empty word, which has no first character.
            if word and word[0] in '0123456789':
                version = word
                break

        # There should be two blank lines after the header line
        line = handle.readline()
        while line and line.strip() == "":
            line = handle.readline()
        if not line:
            # readline() keeps returning "" at end of file, so stop here
            # rather than looping forever on a header with nothing after it.
            raise StopIteration

        # If the alignment contains entries with the same sequence
        # identifier (not a good idea - but seems possible), then this
        # dictionary based parser will merge their sequences.  Fix this?
        ids = []
        seqs = []
        consensus = ""
        seq_cols = None  # Used to extract the consensus

        # Use the first block to get the sequence identifiers
        while True:
            if line[0] != " " and line.strip() != "":
                # Sequences identifier...
                fields = line.rstrip().split()

                # We expect there to be two fields, there can be an optional
                # "sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                ids.append(fields[0])
                seqs.append(fields[1])

                # Record the sequence position to get the consensus
                if seq_cols is None:
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end
                assert fields[1] == line[seq_cols]

                if len(fields) == 3:
                    # This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    if len(fields[1].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)
            elif line[0] == " ":
                # Sequence consensus line...
                assert len(ids) == len(seqs)
                assert len(ids) > 0
                assert seq_cols is not None
                consensus = line[seq_cols]
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                # Check for blank line (or end of file)
                line = handle.readline()
                assert line.strip() == ""
                break
            else:
                # No consensus
                break
            line = handle.readline()
            if not line:
                break  # end of file

        assert line.strip() == ""
        assert seq_cols is not None

        # Confirm all same length
        for s in seqs:
            assert len(s) == len(seqs[0])
        if consensus:
            assert len(consensus) == len(seqs[0])

        # Loop over any remaining blocks...
        done = False
        while not done:
            # There should be a blank line between each block.
            # Also want to ignore any consensus line from the
            # previous block.
            while (not line) or line.strip() == "":
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if line.split(None, 1)[0] in known_headers:
                # Found concatenated alignment.
                done = True
                self._header = line
                break

            for i, expected_id in enumerate(ids):
                assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
                fields = line.rstrip().split()

                # We expect there to be two fields, there can be an optional
                # "sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % repr(line))

                if fields[0] != expected_id:
                    raise ValueError(
                        "Identifiers out of order? Got '%s' but expected '%s'"
                        % (fields[0], expected_id))

                if fields[1] != line[seq_cols]:
                    # Block width changed: same start column, new end.
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    assert start == seq_cols.start, 'Old location %s -> %i:XX' % (
                        seq_cols, start)
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end

                # Append the sequence
                seqs[i] += fields[1]
                assert len(seqs[i]) == len(seqs[0])

                if len(fields) == 3:
                    # This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    if len(seqs[i].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)

                # Read in the next line
                line = handle.readline()
            # There should now be a consensus line
            if consensus:
                assert line[0] == " "
                assert seq_cols is not None
                consensus += line[seq_cols]
                assert len(consensus) == len(seqs[0])
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                # Read in the next line
                line = handle.readline()

        assert len(ids) == len(seqs)
        if len(seqs) == 0 or len(seqs[0]) == 0:
            raise StopIteration

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (len(ids), self.records_per_alignment))

        records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i)
                   for (i, s) in zip(ids, seqs))
        alignment = MultipleSeqAlignment(records, self.alphabet)
        # TODO - Handle alignment annotation better, for now
        # mimic the old parser in Bio.Clustalw
        if version:
            alignment._version = version
        if consensus:
            alignment_length = len(seqs[0])
            assert len(consensus) == alignment_length, \
                   "Alignment length is %i, consensus length is %i, '%s'" \
                   % (alignment_length, len(consensus), consensus)
            alignment._star_info = consensus
        return alignment