def build_local_consensus(seqs, VERBOSE=0, store_allele_counts=False, full_cover=True):
    '''Build a local consensus sequence from a multiple sequence alignment.

    The input sequences are aligned with muscle, every symbol of the
    alphabet is counted column by column, and each column contributes its
    most frequent symbol (ties are broken at random). Columns whose
    consensus is a gap are removed from the result.

    There is only ONE tricky point: what to do if some reads do not cover
    the whole block, e.g. at the end of a fragment because of low coverage?
    If full_cover is False, the gaps at the end of too short reads are
    overwritten with 'X' before counting, and columns that end up with no
    counts at all are discarded.

    Args:
      seqs (list of SeqRecords): seqs to build consensus from
      VERBOSE (int): verbosity level (not used in this function)
      store_allele_counts (bool): return also allele counts from the alignment
      full_cover (bool): if True, assume the reads fully cover the region
        (no gaps at edges)

    Returns:
      the consensus as a string or, if store_allele_counts is True, the
      tuple (consensus, allele_counts) where allele_counts is restricted to
      the columns that made it into the consensus.
    '''
    import numpy as np
    from hivwholeseq.utils.miseq import alpha
    from hivwholeseq.utils.mapping import align_muscle

    # One row per aligned sequence, one single-character cell per column
    ali = np.array(align_muscle(*seqs, sort=True), 'S1', ndmin=2)

    if full_cover:
        allele_counts = np.array([(ali == symbol).sum(axis=0) for symbol in alpha],
                                 int, ndmin=2)
    else:
        allele_counts = np.zeros((len(alpha), len(ali[0])), int)
        for row_index in xrange(len(seqs)):
            # A row of a 2D array is a view, so the masking below edits ali
            row = ali[row_index]
            if row[-1] == '-':
                n_before_final_gap = len(row.tostring().rstrip('-'))
                # NOTE(review): 'X' is presumably not a symbol of alpha, so
                # the masked cells are not counted -- confirm against alpha
                row[n_before_final_gap:] = 'X'
            for symbol_index, symbol in enumerate(alpha):
                allele_counts[symbol_index] += row == symbol

        # Drop the columns in which no symbol of the alphabet was counted
        covered = allele_counts.sum(axis=0) > 0
        allele_counts = allele_counts[:, covered]

    consensus_chars = []
    for column_counts in allele_counts.T:
        # The last entry of alpha (N, according to the original comment) can
        # never be picked: if it alone holds the maximum, no candidate is
        # left and the column is marked as a gap
        best = (column_counts[:-1] == column_counts.max()).nonzero()[0]
        if len(best) == 0:
            consensus_chars.append('-')
            continue
        # Pick a random nucleotide in case of a tie
        if len(best) > 1:
            np.random.shuffle(best)
        consensus_chars.append(alpha[best[0]])
    consensus_chars = np.array(consensus_chars, 'S1')

    keep = consensus_chars != '-'
    cons_local = ''.join(consensus_chars[keep])

    if not store_allele_counts:
        return cons_local
    return (cons_local, allele_counts[:, keep])
def _find_seed(target, seed, positions, matches_min, max_trials=3):
    '''Return the first position at which seed approximately matches target.

    Args:
      target: numpy array of characters to search in
      seed: numpy array of characters to look for
      positions: re-iterable collection of start positions to try, in order
      matches_min (int): matching characters required during the first scan
      max_trials (int): number of scans; the requirement is lowered by one
        after every unsuccessful scan

    Returns:
      the first matching start position, or None if every scan fails.
    '''
    seed_len = len(seed)
    for trial in xrange(max_trials):
        required = matches_min - trial
        for pos in positions:
            if (target[pos: pos + seed_len] == seed).sum() >= required:
                return pos
    return None


def get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=0):
    '''Find the overlap coordinates for the two fragments.

    The consensus of both fragments is read from file. The first 20 bases
    of frag2 are searched for in the last 700 bases of frag1, and the last
    20 bases of frag1 in the first 700 bases of frag2. At least 16 bases
    must match; this is relaxed to 15 and then 14 if nothing is found.
    The two overlapping stretches are finally aligned with muscle.

    Args:
      data_folder: folder passed on to get_consensus_filename
      adaID: adapter ID passed on to get_consensus_filename
      frag1: name of the upstream fragment
      frag2: name of the downstream fragment
      VERBOSE (int): verbosity level; messages are printed at 3 or higher

    Returns:
      None if either seed cannot be located, otherwise the tuple
      (start_s2, end_s1, ali): the start of frag2 in frag1 coordinates, the
      end of frag1 in frag2 coordinates, and the alignment of
      seq1[start_s2:] with seq2[:end_s1].
    '''
    from hivwholeseq.utils.mapping import align_muscle

    seq1 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag1), 'fasta')
    seq2 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag2), 'fasta')
    sm1 = np.array(seq1)
    sm2 = np.array(seq2)

    seed_len = 20
    matches_min = 16
    search_len = 700

    # A sequence shorter than the seed cannot host or provide a full seed
    if (len(seq1) < seed_len) or (len(seq2) < seed_len):
        return None

    # Find the beginning of s2 in s1. The window start is clamped at 0: a
    # negative start would make the slices wrap around the end of sm1 and
    # yield negative coordinates or windows shorter than the seed.
    positions = xrange(max(0, len(seq1) - search_len), len(seq1) - seed_len)
    start_s2 = _find_seed(sm1, sm2[:seed_len], positions, matches_min)
    if start_s2 is None:
        return None
    if VERBOSE >= 3:
        print('Beginning of '+frag2+' found in '+frag1)

    # In an ideal world, the overlap is a holy place in which no indels happen.
    # We cannot assume that, sadly. However, we can search from the other side
    # and align: find the end of s1 in s2. The window stops where a full seed
    # no longer fits into sm2, so only equally long stretches are compared.
    positions = xrange(min(search_len, len(seq2) - seed_len + 1))
    seed_start = _find_seed(sm2, sm1[-seed_len:], positions, matches_min)
    if seed_start is None:
        return None
    end_s1 = seed_start + seed_len
    if VERBOSE >= 3:
        print('End of '+frag1+' found in '+frag2)

    # Align
    ali = align_muscle(seq1[start_s2:], seq2[:end_s1])
    return (start_s2, end_s1, ali)
def _find_seed(target, seed, positions, matches_min, max_trials=3):
    """Return the first position at which seed approximately matches target.

    Args:
      target: numpy array of characters to search in
      seed: numpy array of characters to look for
      positions: re-iterable collection of start positions to try, in order
      matches_min (int): matching characters required during the first scan
      max_trials (int): number of scans; the requirement is lowered by one
        after every unsuccessful scan

    Returns:
      the first matching start position, or None if every scan fails.
    """
    seed_len = len(seed)
    for trial in xrange(max_trials):
        required = matches_min - trial
        for pos in positions:
            if (target[pos : pos + seed_len] == seed).sum() >= required:
                return pos
    return None


def get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=0):
    """Find the overlap coordinates for the two fragments.

    The consensus of both fragments is read from file. The first 20 bases
    of frag2 are searched for in the last 700 bases of frag1, and the last
    20 bases of frag1 in the first 700 bases of frag2. At least 16 bases
    must match; this is relaxed to 15 and then 14 if nothing is found.
    The two overlapping stretches are finally aligned with muscle.

    Args:
      data_folder: folder passed on to get_consensus_filename
      adaID: adapter ID passed on to get_consensus_filename
      frag1: name of the upstream fragment
      frag2: name of the downstream fragment
      VERBOSE (int): verbosity level; messages are printed at 3 or higher

    Returns:
      None if either seed cannot be located, otherwise the tuple
      (start_s2, end_s1, ali): the start of frag2 in frag1 coordinates, the
      end of frag1 in frag2 coordinates, and the alignment of
      seq1[start_s2:] with seq2[:end_s1].
    """
    from hivwholeseq.utils.mapping import align_muscle

    seq1 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag1), "fasta")
    seq2 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag2), "fasta")
    sm1 = np.array(seq1)
    sm2 = np.array(seq2)

    seed_len = 20
    matches_min = 16
    search_len = 700

    # A sequence shorter than the seed cannot host or provide a full seed
    if (len(seq1) < seed_len) or (len(seq2) < seed_len):
        return None

    # Find the beginning of s2 in s1. The window start is clamped at 0: a
    # negative start would make the slices wrap around the end of sm1 and
    # yield negative coordinates or windows shorter than the seed.
    positions = xrange(max(0, len(seq1) - search_len), len(seq1) - seed_len)
    start_s2 = _find_seed(sm1, sm2[:seed_len], positions, matches_min)
    if start_s2 is None:
        return None
    if VERBOSE >= 3:
        print("Beginning of " + frag2 + " found in " + frag1)

    # In an ideal world, the overlap is a holy place in which no indels happen.
    # We cannot assume that, sadly. However, we can search from the other side
    # and align: find the end of s1 in s2. The window stops where a full seed
    # no longer fits into sm2, so only equally long stretches are compared.
    positions = xrange(min(search_len, len(seq2) - seed_len + 1))
    seed_start = _find_seed(sm2, sm1[-seed_len:], positions, matches_min)
    if seed_start is None:
        return None
    end_s1 = seed_start + seed_len
    if VERBOSE >= 3:
        print("End of " + frag1 + " found in " + frag2)

    # Align
    ali = align_muscle(seq1[start_s2:], seq2[:end_s1])
    return (start_s2, end_s1, ali)
def build_msa(htseqs, VERBOSE=0):
    '''Align a cluster of haplotype sequences with muscle.

    Args:
      htseqs: iterable of haplotype sequences; each one is wrapped into a
        SeqRecord with ambiguous DNA alphabet, labelled '#' followed by its
        position in the iterable
      VERBOSE (int): verbosity level (not used in this function)

    Returns:
      the result of align_muscle called on the records with sort=True.
    '''
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.Alphabet.IUPAC import ambiguous_dna

    records = []
    for index, haplotype in enumerate(htseqs):
        label = '#'+str(index)
        records.append(SeqRecord(Seq(haplotype, ambiguous_dna),
                                 id=label, name=label))

    from hivwholeseq.utils.mapping import align_muscle
    return align_muscle(*records, sort=True)
def build_msa(htseqs, VERBOSE=0):
    '''Build a multiple sequence alignment from a cluster of haplotypes.

    Args:
      htseqs: iterable of haplotype sequences; the n-th one becomes a
        SeqRecord (ambiguous DNA alphabet) with id and name '#n'
      VERBOSE (int): verbosity level (not used in this function)

    Returns:
      whatever align_muscle returns for the records, called with sort=True.
    '''
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.Alphabet.IUPAC import ambiguous_dna

    def make_record(rank, sequence):
        # id and name carry the same '#<rank>' tag
        tag = '#' + str(rank)
        return SeqRecord(Seq(sequence, ambiguous_dna), id=tag, name=tag)

    haplotype_records = [make_record(rank, sequence)
                         for rank, sequence in enumerate(htseqs)]

    from hivwholeseq.utils.mapping import align_muscle
    alignment = align_muscle(*haplotype_records, sort=True)
    return alignment
def build_msa_haplotypes(haploc, VERBOSE=0, label=''):
    '''Align counted haplotypes with muscle, most common first.

    Args:
      haploc: object whose most_common() method yields (sequence, count)
        pairs (presumably a collections.Counter -- confirm with callers)
      VERBOSE (int): verbosity level (not used in this function)
      label (str): prefix for the id and name of every record

    Each record is called '<label>count_<count>_rank_<rank>', where rank is
    the position in the most_common() order, and has an empty description.

    Returns:
      the result of align_muscle called on the records with sort=True.
    '''
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.Alphabet.IUPAC import ambiguous_dna

    records = []
    for rank, (haplotype, count) in enumerate(haploc.most_common()):
        record_label = label+'count_'+str(count)+'_rank_'+str(rank)
        records.append(SeqRecord(Seq(haplotype, ambiguous_dna),
                                 id=record_label,
                                 name=record_label,
                                 description=''))

    from hivwholeseq.utils.mapping import align_muscle
    return align_muscle(*records, sort=True)
def write_consensus_final(seq_run, adaID, fragment, consensus):
    '''Write the final consensus and realign the collection of references.

    The output uses only the first two characters of the fragment name
    (fragments are now called F5 instead of F5ai).

    Args:
      seq_run: key of the sequencing run in MiSeq_runs; also part of the
        record name
      adaID: adapter ID, used to look up the sample name and the file names
      fragment: full fragment name
      consensus: consensus sequence to store

    Side effects:
      writes the consensus as FASTA to the primer-trimmed consensus file,
      then reads the sequences in the "reference all" file of this
      fragment, aligns them with muscle and overwrites that file with the
      alignment.
    '''
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']
    sample_names = dataset['samples']
    samplename = sample_names[dataset['adapters'].index(adaID)]

    frag_out = fragment[:2]
    name = ''.join([samplename, '_seqrun_', seq_run, '_adaID_', adaID,
                    '_', frag_out, '_consensus'])
    record = SeqRecord(Seq(consensus), id=name, name=name)
    SeqIO.write(record,
                get_consensus_filename(data_folder, adaID, frag_out,
                                       trim_primers=True),
                'fasta')

    # Align all consensi via muscle and store
    references = SeqIO.parse(get_reference_all_filename(data_folder, adaID, fragment),
                             'fasta')
    # The file is read completely before it is overwritten below
    ali = align_muscle(*list(references))
    AlignIO.write(ali,
                  get_reference_all_filename(data_folder, adaID, fragment),
                  'fasta')
def align_consensi_dataset(dataset, adaIDs, fragments, VERBOSE=0):
    '''Align consensi from different samples in a dataset

    Args:
      dataset: mapping with at least the keys 'folder', 'samples' and
        'adapters'; the sample of an adapter ID sits at the same index in
        'samples' as the adapter ID in 'adapters'
      adaIDs: adapter IDs of the samples to include
      fragments: container of two-character fragment names to align; if it
        contains 'genomewide', the merged consensi are aligned as well
      VERBOSE (int): verbosity level; progress is printed at 1 or higher,
        the fragment being aligned at 2 or higher

    Returns:
      dict that maps each fragment name (or 'genomewide') for which at
      least one consensus file exists to the alignment returned by
      align_muscle.
    '''
    data_folder = dataset['folder']

    # Collect consensi
    if VERBOSE >= 1:
        # Trailing comma: Python 2 print without the final newline
        print 'Collecting consensi...',
    # consensi[fragment][adaID] -> consensus record read from file
    consensi = defaultdict(dict)
    for adaID in adaIDs:
        samplename = dataset['samples'][dataset['adapters'].index(adaID)]
        # NOTE(review): 'samples' is not defined in this function; presumably
        # a module-level table of sample properties -- confirm
        fragments_sample = samples[samplename]['fragments']
        for frag in fragments_sample:
            # Only the first two characters identify the generic fragment
            frag_gen = frag[:2]
            if frag_gen not in fragments:
                continue
            con_fn = get_consensus_filename(data_folder, adaID, frag_gen)
            # Samples without a consensus file are silently left out
            if os.path.isfile(con_fn):
                con = SeqIO.read(con_fn, 'fasta')
                consensi[frag_gen][adaID] = con

        if 'genomewide' in fragments:
            frag_gens = [frag[:2] for frag in fragments_sample]
            con_gw_fn = get_merged_consensus_filename(data_folder, adaID, frag_gens)
            if os.path.isfile(con_gw_fn):
                con = SeqIO.read(con_gw_fn, 'fasta')
                consensi['genomewide'][adaID] = con

    if VERBOSE >= 1:
        print 'done.'
        print 'Aligning...',

    # Align
    alis = {}
    for (frag, con_dict) in consensi.iteritems():
        if VERBOSE >= 2:
            print frag,
        # One alignment per fragment, across all samples that have it
        ali_frag = align_muscle(*(con_dict.values()))
        alis[frag] = ali_frag

    if VERBOSE >= 1:
        print 'done.'

    return alis
def write_consensus_final(seq_run, adaID, fragment, consensus):
    '''Store the final consensus, then realign the reference collection.

    Only the first two characters of the fragment name are used for the
    output (fragments are now called F5 instead of F5ai).

    Args:
      seq_run: key of the sequencing run in MiSeq_runs; also part of the
        record name
      adaID: adapter ID, used to look up the sample name and the file names
      fragment: full fragment name
      consensus: consensus sequence to store

    Side effects:
      writes the consensus as FASTA to the primer-trimmed consensus file,
      then reads the sequences in the "reference all" file of this
      fragment, aligns them with muscle and overwrites that file with the
      alignment.
    '''
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']
    all_samples = dataset['samples']
    samplename = all_samples[dataset['adapters'].index(adaID)]
    frag_out = fragment[:2]

    # Name shared by the id and the name of the record
    name = samplename + '_seqrun_' + seq_run
    name = name + '_adaID_' + adaID
    name = name + '_' + frag_out + '_consensus'

    consensus_record = SeqRecord(Seq(consensus), id=name, name=name)
    consensus_path = get_consensus_filename(data_folder, adaID, frag_out,
                                            trim_primers=True)
    SeqIO.write(consensus_record, consensus_path, 'fasta')

    # Align all consensi via muscle and store
    parsed = SeqIO.parse(
        get_reference_all_filename(data_folder, adaID, fragment), 'fasta')
    # Materialise the records first: the same file is overwritten below
    seqs = list(parsed)
    alignment = align_muscle(*seqs)
    AlignIO.write(
        alignment,
        get_reference_all_filename(data_folder, adaID, fragment),
        'fasta')
accept_holes=(fragment == 'genomewide'), store_allele_counts=store_allele_counts) if store_allele_counts: (consensus, allele_counts) = consensus # Store to file if VERBOSE: print 'Store to file' name = samplename + '_seqrun_' + seq_run + '_adaID_' + adaID + '_' + frag_out + '_consensus' consensusseq = SeqRecord(Seq(consensus, ambiguous_dna), id=name, name=name) # Align consensus to reference via muscle and trim end gaps in ref # (improper primer trimming in trim_and_divide) ali = align_muscle(refseq, consensusseq, sort=True) if ali[0][-1] == '-': start_nongap = len(ali[0]) - len(ali[0].seq.lstrip('-')) end_nongap = len(ali[0].seq.rstrip('-')) ali = ali[:, start_nongap:end_nongap] if VERBOSE >= 2: print ali[:, :30] print ali[:, -30:] print 'Lenghts: ref', len(refseq), 'consensus', len( consensusseq) len_ali = ali.get_alignment_length() n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali)) print 'Differences from ref:', n_diff, '(' + '{:3.1f}'.format( 100.0 * n_diff / len_ali) + '%)'
block_len_initial=block_len_initial, reads_per_alignment=n_reads_per_ali, accept_holes=(fragment == 'genomewide'), store_allele_counts=store_allele_counts) if store_allele_counts: (consensus, allele_counts) = consensus # Store to file if VERBOSE: print 'Store to file' name = samplename+'_seqrun_'+seq_run+'_adaID_'+adaID+'_'+frag_out+'_consensus' consensusseq = SeqRecord(Seq(consensus, ambiguous_dna), id=name, name=name) # Align consensus to reference via muscle and trim end gaps in ref # (improper primer trimming in trim_and_divide) ali = align_muscle(refseq, consensusseq, sort=True) if ali[0][-1] == '-': start_nongap = len(ali[0]) - len(ali[0].seq.lstrip('-')) end_nongap = len(ali[0].seq.rstrip('-')) ali = ali[:, start_nongap: end_nongap] if VERBOSE >= 2: print ali[:, :30] print ali[:, -30:] print 'Lenghts: ref', len(refseq), 'consensus', len(consensusseq) len_ali = ali.get_alignment_length() n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali)) print 'Differences from ref:', n_diff, '('+'{:3.1f}'.format(100.0 * n_diff / len_ali)+'%)' # Ungap consensus
cons_seq.description = ', '.join([ 'Patient: ' + patient.code, time, 'region: ' + region, 'consensus' ]) cons_seq.cell_count = sample['CD4+ count'] cons_seq.viral_load = sample['viral load'] cons_seq.subtype = patient['Subtype'] seqs.append(cons_seq) if use_joint: seqs_all.append(cons_seq) if VERBOSE >= 2: print 'OK' if VERBOSE >= 2: print 'Align', ali = align_muscle(*seqs, sort=True) if VERBOSE >= 2: print 'OK' if use_save: if VERBOSE >= 2: print 'Save alignment', fn_out = patient.get_consensi_alignment_filename(region) mkdirs(os.path.dirname(fn_out)) AlignIO.write(ali, fn_out, 'fasta') if VERBOSE >= 2: print 'OK' if VERBOSE >= 2: print 'Build local tree' tree = build_tree_fasttree(ali, VERBOSE=VERBOSE)
cons_seq.description = ', '.join(['Patient: '+patient.code, time, 'region: '+region, 'consensus']) cons_seq.cell_count = sample['CD4+ count'] cons_seq.viral_load = sample['viral load'] cons_seq.subtype = patient['Subtype'] seqs.append(cons_seq) if use_joint: seqs_all.append(cons_seq) if VERBOSE >= 2: print 'OK' if VERBOSE >= 2: print 'Align', ali = align_muscle(*seqs, sort=True) if VERBOSE >= 2: print 'OK' if use_save: if VERBOSE >= 2: print 'Save alignment', fn_out = patient.get_consensi_alignment_filename(region) mkdirs(os.path.dirname(fn_out)) AlignIO.write(ali, fn_out, 'fasta') if VERBOSE >= 2: print 'OK' if VERBOSE >= 2: print 'Build local tree' tree = build_tree_fasttree(ali, VERBOSE=VERBOSE)