Пример #1
0
def concatenate(alignments, padding_length=0, partitions=None):
    '''
    Concatenate alignments based on the Seq ids; row order does not
    matter. If one alignment contains a Seq id that another one does
    not, gaps will be introduced in place of the missing Seq.

    Neither ``alignments`` nor ``partitions`` is modified; the function
    works on copies.

    Args:
        alignments: (tuple, list) Alignments to be concatenated.

        padding_length: Insert this many 'N' characters between
            concatenated alignments.

        partitions: (list or None) Partition coordinates accumulated so
            far, as 1-based ``(start, end)`` tuples. When empty or None
            the list is seeded with ``(1, length of first alignment)``.

    Returns:
        For two or more alignments: a tuple ``(alignment, partitions)``
        where ``alignment`` is a MultipleSeqAlignment sorted by record id
        and ``partitions`` is a list of ``(start, end)`` tuples. Every
        appended partition spans the padding plus the alignment that
        follows it.

        For exactly one alignment: that single element is returned
        unchanged (NOT wrapped in a tuple). This asymmetry is kept for
        backward compatibility.

    Raises:
        ValueError: If ``alignments`` is not a list or tuple, is empty,
            or contains something that is not a MultipleSeqAlignment.
    '''

    # Validate the container first so that bad input fails fast, before
    # the Biopython imports below are attempted.
    if not isinstance(alignments, (list, tuple)):
        raise ValueError('Argument must be a list or a tuple.')
    if not alignments:
        raise ValueError('At least one alignment is required.')
    if len(alignments) == 1:
        return alignments[0]

    # NOTE(review): Bio.Alphabet only exists in older Biopython releases;
    # this function needs such a release to run.
    from Bio import Alphabet
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Align import MultipleSeqAlignment

    if len(alignments) > 2:
        # Fold left: concatenate everything but the last alignment first,
        # then append the last one. Slicing (instead of pop()) leaves the
        # caller's sequence untouched.
        aln1, partitions = concatenate(alignments=alignments[:-1],
                                       padding_length=padding_length,
                                       partitions=partitions)
        aln2 = alignments[-1]
    else:
        aln1, aln2 = alignments

    if (not isinstance(aln1, MultipleSeqAlignment) or
            not isinstance(aln2, MultipleSeqAlignment)):
        raise ValueError(
            'Argument must inherit from Bio.Align.MultipleSeqAlignment.')

    alphabet = Alphabet._consensus_alphabet([aln1._alphabet, aln2._alphabet])

    # Index records by id; if an id occurs twice the last record wins.
    aln1_dict = {record.id: record for record in aln1}
    aln2_dict = {record.id: record for record in aln2}

    aln1_length = aln1.get_alignment_length()
    aln2_length = aln2.get_alignment_length()

    # Stand-ins for ids that are missing from one of the two alignments.
    aln1_gaps = SeqRecord(Seq('-' * aln1_length, alphabet))
    aln2_gaps = SeqRecord(Seq('-' * aln2_length, alphabet))
    padding = SeqRecord(Seq('N' * padding_length, alphabet))

    # Copy before appending so a list supplied by the caller is not
    # changed behind their back.
    partitions = list(partitions) if partitions else [(1, aln1_length)]
    partitions.append((1 + aln1_length,
                       padding_length + aln1_length + aln2_length))

    def _merge(left, right, seq_id):
        # Join two records around the padding and reset the metadata.
        merged = left + padding + right
        merged.id = seq_id
        merged.name = ''
        merged.description = ''
        return merged

    merged_records = []
    for seq_id, record in aln1_dict.items():
        # pop() removes the matched ids so that only ids unique to the
        # second alignment remain for the loop below.
        partner = aln2_dict.pop(seq_id, aln2_gaps)
        merged_records.append(_merge(record, partner, seq_id))
    for seq_id, record in aln2_dict.items():
        merged_records.append(_merge(aln1_gaps, record, seq_id))

    result_alignment = MultipleSeqAlignment(merged_records, alphabet)
    result_alignment.sort()
    return (result_alignment, partitions)
Пример #2
0
    dictionary[key] = valuelist[0:10]
from pprint import pprint

# Dump the dictionary to disk for inspection; the context manager closes
# the file even if pprint raises.
with open("global.dict", "w") as fielddict_file:
    pprint(dictionary, fielddict_file)

reference = []

# For every key, join all of its FASTA alignments into one alignment,
# write it to "<key>.ref" and log the number of joined files.
for i, j in dictionary.items():
    n = 0  # stays 0 when j is empty
    # Start from five empty records, one per expected sequence id.
    combined_seq = MultipleSeqAlignment([
        SeqRecord(Seq('', generic_dna), id=species)
        for species in ("hg19", "panTro4", "gorGor3", "rheMac3", "ponAbe2")
    ])
    combined_seq.sort()
    for n, ref in enumerate(j, 1):
        seq_records = AlignIO.read(ref, 'fasta')
        # NOTE(review): this sets an attribute on the alignment object
        # itself, not on its records -- confirm it has any effect.
        seq_records.description = ""
        # Both sides are sorted before they are added; presumably so that
        # rows pair up by id -- confirm against Biopython's `+` semantics.
        seq_records.sort()
        combined_seq = combined_seq + seq_records
        combined_seq.description = ""
    with open('%s.ref' % i, 'w') as write_file:
        AlignIO.write(combined_seq, write_file, 'fasta')
    # Append one "<key>\t<count>" line; the handle is closed right away
    # instead of being left open on every iteration.
    with open('reference.list', 'a') as referencelist:
        referencelist.write('%s\t%i\n' % (i, n))
Пример #3
0
class virus_clean(object):
	"""Cleaning steps for a collection of aligned virus sequences.

	The methods read attributes that this class never assigns itself
	(``self.viruses``, ``self.outgroup``, ``self.sequence_lookup``,
	``self.date_format``, ``self.verbose``); they have to be provided by
	whatever code combines this class with the rest of the pipeline.
	"""
	def __init__(self, n_iqd=5, **kwargs):
		'''
		parameters
		n_iqd	-- number of interquartile distances accepted in molecular clock filter
		kwargs	-- accepted and ignored here
		'''
		self.n_iqd = n_iqd

	def remove_insertions(self):
		'''
		remove all columns from the alignment in which the outgroup is gapped
		'''
		# boolean mask over alignment columns: True where the outgroup has no gap
		outgroup_ok = np.array(self.sequence_lookup[self.outgroup['strain']]) != '-'
		for seq in self.viruses:
			seq.seq = Seq("".join(np.array(seq.seq)[outgroup_ok]).upper())

	def clean_gaps(self):
		'''
		remove viruses with gaps -- not part of the standard pipeline
		'''
		# keep only gap-free sequences (the previous predicate was inverted
		# and kept exactly the gapped ones)
		self.viruses = [v for v in self.viruses if '-' not in v.seq]

	def clean_ambiguous(self):
		'''
		substitute all ambiguous characters with '-',
		ancestral inference will interpret this as missing data
		'''
		# every upper-case letter other than A, C, G and T is replaced
		for v in self.viruses:
			v.seq = Seq(re.sub(r'[BDEFHIJKLMNOPQRSUVWXYZ]', '-', str(v.seq)))

	def unique_date(self):
		'''
		add a unique numerical date to each leaf. uniqueness is achieved adding a small number
		'''
		from date_util import numerical_date
		og = self.sequence_lookup[self.outgroup['strain']]
		if hasattr(og, 'date'):
			# NOTE(review): the outgroup date is parsed without
			# self.date_format['fields'], unlike the viruses below -- confirm
			try:
				og.num_date = numerical_date(og.date)
			except Exception:
				# best effort: flag the date instead of aborting, but let
				# KeyboardInterrupt/SystemExit propagate
				print("cannot parse date")
				og.num_date = "undefined"
		for ii, v in enumerate(self.viruses):
			if hasattr(v, 'date'):
				try:
					# the index-dependent offset makes otherwise equal dates distinct
					v.num_date = numerical_date(v.date, self.date_format['fields']) + 1e-7*(ii+1)
				except Exception:
					print("cannot parse date")
					v.num_date = "undefined"

	def times_from_outgroup(self):
		'''
		return the num_date difference to the outgroup for every virus with a truthy strain
		'''
		outgroup_date = self.sequence_lookup[self.outgroup['strain']].num_date
		return np.array([x.num_date-outgroup_date for x in self.viruses if x.strain])

	def distance_from_outgroup(self):
		'''
		return the hamming distance to the outgroup for every virus with a truthy strain
		'''
		from seq_util import hamming_distance
		outgroup_seq = self.sequence_lookup[self.outgroup['strain']].seq
		return np.array([hamming_distance(x.seq, outgroup_seq) for x in self.viruses if x.strain])

	def clean_distances(self):
		"""Remove viruses that don't follow a loose clock """
		times = self.times_from_outgroup()
		distances = self.distance_from_outgroup()
		slope, intercept, r_value, p_value, std_err = stats.linregress(times, distances)
		residuals = slope*times + intercept - distances
		# interquartile distance of the regression residuals
		r_iqd = stats.scoreatpercentile(residuals, 75) - stats.scoreatpercentile(residuals, 25)
		if self.verbose:
			print("\tslope: " + str(slope))
			print("\tr: " + str(r_value))
			print("\tresiduals iqd: " + str(r_iqd))
		new_viruses = []
		# NOTE(review): residuals only cover viruses with a truthy strain,
		# while this loop pairs them with ALL viruses by position -- the
		# pairing shifts if any virus lacks a strain; confirm that cannot happen
		for (v, r) in zip(self.viruses, residuals):
			# keep viruses whose residual is within n_iqd interquartile
			# distances; the outgroup is always kept
			if np.abs(r) < self.n_iqd * r_iqd or v.id == self.outgroup["strain"]:
				new_viruses.append(v)
			elif self.verbose > 1:
				# spacing reproduces the output of the former print statement
				print("\t\tresidual: %s \nremoved  %s" % (r, v.strain))
		self.viruses = MultipleSeqAlignment(new_viruses)

	def clean_generic(self):
		'''
		run the standard cleaning steps and sort the remaining viruses by num_date
		'''
		print("Number of viruses before cleaning: " + str(len(self.viruses)))
		self.unique_date()
		self.remove_insertions()
		self.clean_ambiguous()
		self.clean_distances()
		self.viruses.sort(key=lambda x: x.num_date)
		print("Number of viruses after outlier filtering: " + str(len(self.viruses)))