def check_coverage(data_folder, adaID, fragment, seq_run, qual_min=35,
                   reference='HXB2', maxreads=-1, VERBOSE=0,
                   rescue=False, minor_allele=False):
    '''Plot the coverage of the reads mapped onto one fragment.

    The consensus of the fragment is read only for its length; the allele
    counts are collected from the mapped BAM file and passed on to
    plot_coverage, with a title listing run, adaID, fragment and maxreads.

    NOTE: qual_min and reference are accepted but not used in this function.
    '''
    consensus_fn = get_consensus_filename(data_folder, adaID, fragment)
    consensus = SeqIO.read(consensus_fn, 'fasta')

    bam_fn = get_mapped_filename(data_folder, adaID, fragment,
                                 type='bam', rescue=rescue)
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        bam_fn, len(consensus), maxreads=maxreads, VERBOSE=VERBOSE)

    # Plot results: the title is a comma-separated list of "label value"
    labelled = (('run', seq_run),
                ('adaID', adaID),
                ('fragment', fragment),
                ('maxreads', maxreads))
    title = ', '.join([label + ' ' + str(value) for label, value in labelled])
    plot_coverage(counts, suptitle=title, minor_allele=minor_allele)
def make_consensus(data_folder, adaID, fragment, n_iter, qual_min=20, VERBOSE=0,
                   coverage_min=10, summary=True):
    '''Build a consensus from the reads mapped at iteration n_iter.

    The reference of this iteration is read from its FASTA file, the allele
    counts and insertions are collected from the mapped BAM file (converted
    from SAM first if the BAM file is missing) and given to build_consensus.
    If summary is true, one line is appended to the summary file.

    Returns the pair (refseq, consensus_final): the reference as read by
    SeqIO.read and the output of build_consensus.
    '''
    if VERBOSE:
        print(''.join(['Build consensus: ', adaID, ' ', fragment,
                       ' iteration ', str(n_iter)]))

    # Reference of the current iteration
    ref_fn = get_reference_filename(data_folder, adaID, fragment, n_iter)
    refseq = SeqIO.read(ref_fn, 'fasta')
    # Array version of the reference; not used further down in this function
    ref = np.array(refseq)

    # Mapped reads: make sure the BAM file is there
    bam_fn = get_mapped_filename(data_folder, adaID, fragment, n_iter)
    if not os.path.isfile(bam_fn):
        convert_sam_to_bam(bam_fn)

    # NOTE(review): match_len_min is not an argument of this function, so it
    # presumably comes from module scope -- confirm it is defined there.
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        bam_fn, len(refseq),
        qual_min=qual_min, match_len_min=match_len_min)

    consensus_final = build_consensus(counts, inserts,
                                      coverage_min=coverage_min,
                                      VERBOSE=VERBOSE)

    if summary:
        summary_fn = get_summary_fn(data_folder, adaID, fragment)
        with open(summary_fn, 'a') as f:
            f.write(''.join(['Consensus built for iteration ',
                             str(n_iter), '\n']))

    return refseq, consensus_final
def get_allele_frequency_trajectories(pname, samples, fragment, qual_min=30, VERBOSE=0):
    '''Collect allele counts and filtered frequencies for every sample.

    For each sample, the reads mapped to the initial reference of the patient
    are scanned; the counts summed over their first axis go into cos_traj and
    the output of filter_nus on the counts goes into nus_traj.

    Returns the pair (cos_traj, nus_traj), both of shape
    (len(samples), len(alpha), len(refseq)); cos_traj has integer dtype.
    '''
    if VERBOSE >= 1:
        print('%s %s %s' % ('Getting allele frequency trajectories:',
                            pname, fragment))

    from hivwholeseq.patients.filenames import (
        get_initial_reference_filename,
        get_mapped_to_initial_filename,
        get_allele_frequency_trajectories_filename,
        get_allele_count_trajectories_filename)
    from hivwholeseq.utils.one_site_statistics import (
        get_allele_counts_insertions_from_file,
        get_allele_counts_insertions_from_file_unfiltered,
        filter_nus)

    refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta')
    n_sites = len(refseq)

    # Output arrays, one slice per sample
    traj_shape = (len(samples), len(alpha), n_sites)
    cos_traj = np.zeros(traj_shape, int)
    nus_traj = np.zeros(traj_shape)

    for it, sample in enumerate(samples):
        if VERBOSE >= 2:
            print('%s %s %s' % (pname, it, sample))

        bam_fn = get_mapped_to_initial_filename(pname, sample, fragment,
                                                type='bam')
        counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
            bam_fn, n_sites, qual_min=qual_min, VERBOSE=VERBOSE)

        # Total counts: sum over the first axis (the read types, according to
        # the original comment)
        cos_traj[it] = counts.sum(axis=0)

        # Filtered frequencies, again blending in the read types
        nus_traj[it] = filter_nus(counts)

    #FIXME: test, etc.

    return (cos_traj, nus_traj)
def make_consensus(data_folder, adaID, fragment, n_iter, qual_min=20, VERBOSE=0,
                   coverage_min=10, summary=True):
    '''Build a consensus from the reads mapped at iteration n_iter.

    The reference of this iteration is read from its FASTA file, the allele
    counts and insertions are collected from the mapped BAM file (converted
    from SAM first if the BAM file is missing) and given to build_consensus.
    If summary is true, one line is appended to the summary file.

    Returns the pair (refseq, consensus_final): the reference as read by
    SeqIO.read and the output of build_consensus.
    '''
    if VERBOSE:
        print(''.join(['Build consensus: ', adaID, ' ', fragment,
                       ' iteration ', str(n_iter)]))

    # Reference of the current iteration
    ref_fn = get_reference_filename(data_folder, adaID, fragment, n_iter)
    refseq = SeqIO.read(ref_fn, 'fasta')
    # Array version of the reference; not used further down in this function
    ref = np.array(refseq)

    # Mapped reads: make sure the BAM file is there
    bam_fn = get_mapped_filename(data_folder, adaID, fragment, n_iter)
    if not os.path.isfile(bam_fn):
        convert_sam_to_bam(bam_fn)

    # NOTE(review): match_len_min is not an argument of this function, so it
    # presumably comes from module scope -- confirm it is defined there.
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        bam_fn, len(refseq),
        qual_min=qual_min, match_len_min=match_len_min)

    consensus_final = build_consensus(counts, inserts,
                                      coverage_min=coverage_min,
                                      VERBOSE=VERBOSE)

    if summary:
        summary_fn = get_summary_fn(data_folder, adaID, fragment)
        with open(summary_fn, 'a') as f:
            f.write(''.join(['Consensus built for iteration ',
                             str(n_iter), '\n']))

    return refseq, consensus_final
def check_coverage(data_folder, adaID, fragment, seq_run, qual_min=35,
                   reference='HXB2', maxreads=-1, VERBOSE=0,
                   rescue=False, minor_allele=False):
    '''Plot the coverage of the reads mapped onto one fragment.

    The consensus of the fragment is read only for its length; the allele
    counts are collected from the mapped BAM file and passed on to
    plot_coverage, with a title listing run, adaID, fragment and maxreads.

    NOTE: qual_min and reference are accepted but not used in this function.
    '''
    consensus_fn = get_consensus_filename(data_folder, adaID, fragment)
    consensus = SeqIO.read(consensus_fn, 'fasta')

    bam_fn = get_mapped_filename(data_folder, adaID, fragment,
                                 type='bam', rescue=rescue)
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        bam_fn, len(consensus), maxreads=maxreads, VERBOSE=VERBOSE)

    # Plot results: the title is a comma-separated list of "label value"
    labelled = (('run', seq_run),
                ('adaID', adaID),
                ('fragment', fragment),
                ('maxreads', maxreads))
    title = ', '.join([label + ' ' + str(value) for label, value in labelled])
    plot_coverage(counts, suptitle=title, minor_allele=minor_allele)
def check_premap(data_folder, adaID, fragments, seq_run, samplename,
                 qual_min=30, match_len_min=10,
                 maxreads=-1, VERBOSE=0,
                 savefig=True,
                 title=None):
    '''Check premap to reference: coverage, etc.

    Parameters:
       data_folder, adaID: identify the premap reference, the fragment
           positions file, the premapped BAM file and the figure folder.
       fragments: names of the fragments whose positions are looked up in the
           fragment positions file (if that file exists).
       seq_run, samplename: only used for the default title and, for
           samplename, the figure filename.
       qual_min, match_len_min, maxreads, VERBOSE: passed through to
           get_allele_counts_insertions_from_file_unfiltered.
       savefig: if true, save the figure as a PNG in the adapter folder.
       title: title of the plot; if None, one is built from the run, adaID,
           sample name and number of reads.

    Returns:
       (counts, inserts) as returned by
       get_allele_counts_insertions_from_file_unfiltered, or (None, None) if
       the premapped BAM file does not exist.
    '''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID), 'fasta')

    # FIXME: do this possibly better than parsing the description!
    # The start coordinate is taken three fields before the '(indices' token;
    # any description that cannot be parsed this way falls back to 550.
    try:
        fields = refseq.description.split()
        refseq_start = int(fields[fields.index('(indices') - 3])
    except (ValueError, IndexError):
        refseq_start = 550

    fragpos_filename = get_fragment_positions_filename(data_folder, adaID)
    if os.path.isfile(fragpos_filename):
        # Load the fragment positions, considering mixed fragments (e.g. F5a+b)
        fragtmp = []
        postmp = []
        with open(fragpos_filename, 'r') as f:
            f.readline() #HEADER
            for line in f:
                # Strip only the line terminator: slicing off the last
                # character would truncate a final line without newline
                fields = line.rstrip('\r\n').split('\t')
                fragtmp.append(fields[0])
                if 'inner' not in fields[1]:
                    postmp.append([fields[1], fields[4]])
                else:
                    # Dict-like field: take the value after ': ' in its
                    # second comma-separated entry
                    start = int(fields[1].split(',')[1].split(': ')[1].rstrip('}'))
                    end = int(fields[4].split(',')[1].split(': ')[1].rstrip('}'))
                    postmp.append([start, end])

        postmp = np.array(postmp, int)

        # NOTE: In a lot of old files, it says F3o instead of F3ao
        if 'F3o' in fragtmp:
            fragtmp[fragtmp.index('F3o')] = 'F3ao'
        elif 'F3i' in fragtmp:
            fragtmp[fragtmp.index('F3i')] = 'F3ai'

        # One column per requested fragment, rows are (start, end)
        frags_pos = np.array([postmp[fragtmp.index(fr)] for fr in fragments],
                             int).T

    else:
        frags_pos = None

    frags_pos_out = None

    # Open BAM and scan reads
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        if VERBOSE:
            print('Premapped BAM file not found')
        return (None, None)

    # Count reads (always, the number is needed for the default title)
    n_reads = get_number_reads(input_filename)
    if VERBOSE:
        print('%s %s' % ('N. of reads:', n_reads))

    # Get counts
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        input_filename, len(refseq),
        qual_min=qual_min, match_len_min=match_len_min,
        maxreads=maxreads, VERBOSE=VERBOSE)

    # Plot results
    if title is None:
        # A negative maxreads (the default is -1) is taken to mean no cap on
        # the reads -- TODO confirm against the counting function. Without
        # this check the title would read e.g. "reads -1/1000".
        if maxreads < 0:
            n_reads_used = n_reads
        else:
            n_reads_used = min(maxreads, n_reads)
        title = ', '.join(['run ' + seq_run + ' ' + adaID,
                           'sample ' + samplename,
                           'reads ' + str(n_reads_used) + '/' + str(n_reads),
                          ])
    plot_coverage(counts,
                  offset_x=refseq_start,
                  frags_pos=frags_pos, frags_pos_out=frags_pos_out,
                  title=title)

    if savefig:
        from hivwholeseq.sequencing.adapter_info import foldername_adapter
        plt.savefig(data_folder + foldername_adapter(adaID) +
                    'figures/coverage_premapped_' + samplename + '.png')

    return (counts, inserts)
def check_premap(data_folder, adaID, fragments, seq_run, samplename,
                 qual_min=30, match_len_min=10,
                 maxreads=-1, VERBOSE=0,
                 savefig=True,
                 title=None):
    '''Check premap to reference: coverage, etc.

    Parameters:
       data_folder, adaID: identify the premap reference, the fragment
           positions file, the premapped BAM file and the figure folder.
       fragments: names of the fragments whose positions are looked up in the
           fragment positions file (if that file exists).
       seq_run, samplename: only used for the default title and, for
           samplename, the figure filename.
       qual_min, match_len_min, maxreads, VERBOSE: passed through to
           get_allele_counts_insertions_from_file_unfiltered.
       savefig: if true, save the figure as a PNG in the adapter folder.
       title: title of the plot; if None, one is built from the run, adaID,
           sample name and number of reads.

    Returns:
       (counts, inserts) as returned by
       get_allele_counts_insertions_from_file_unfiltered, or (None, None) if
       the premapped BAM file does not exist.
    '''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID), 'fasta')

    # FIXME: do this possibly better than parsing the description!
    # The start coordinate is taken three fields before the '(indices' token;
    # any description that cannot be parsed this way falls back to 550.
    try:
        fields = refseq.description.split()
        refseq_start = int(fields[fields.index('(indices') - 3])
    except (ValueError, IndexError):
        refseq_start = 550

    fragpos_filename = get_fragment_positions_filename(data_folder, adaID)
    if os.path.isfile(fragpos_filename):
        # Load the fragment positions, considering mixed fragments (e.g. F5a+b)
        fragtmp = []
        postmp = []
        with open(fragpos_filename, 'r') as f:
            f.readline() #HEADER
            for line in f:
                # Strip only the line terminator: slicing off the last
                # character would truncate a final line without newline
                fields = line.rstrip('\r\n').split('\t')
                fragtmp.append(fields[0])
                if 'inner' not in fields[1]:
                    postmp.append([fields[1], fields[4]])
                else:
                    # Dict-like field: take the value after ': ' in its
                    # second comma-separated entry
                    start = int(fields[1].split(',')[1].split(': ')[1].rstrip('}'))
                    end = int(fields[4].split(',')[1].split(': ')[1].rstrip('}'))
                    postmp.append([start, end])

        postmp = np.array(postmp, int)

        # NOTE: In a lot of old files, it says F3o instead of F3ao
        if 'F3o' in fragtmp:
            fragtmp[fragtmp.index('F3o')] = 'F3ao'
        elif 'F3i' in fragtmp:
            fragtmp[fragtmp.index('F3i')] = 'F3ai'

        # One column per requested fragment, rows are (start, end)
        frags_pos = np.array([postmp[fragtmp.index(fr)] for fr in fragments],
                             int).T

    else:
        frags_pos = None

    frags_pos_out = None

    # Open BAM and scan reads
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        if VERBOSE:
            print('Premapped BAM file not found')
        return (None, None)

    # Count reads (always, the number is needed for the default title)
    n_reads = get_number_reads(input_filename)
    if VERBOSE:
        print('%s %s' % ('N. of reads:', n_reads))

    # Get counts
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        input_filename, len(refseq),
        qual_min=qual_min, match_len_min=match_len_min,
        maxreads=maxreads, VERBOSE=VERBOSE)

    # Plot results
    if title is None:
        # A negative maxreads (the default is -1) is taken to mean no cap on
        # the reads -- TODO confirm against the counting function. Without
        # this check the title would read e.g. "reads -1/1000".
        if maxreads < 0:
            n_reads_used = n_reads
        else:
            n_reads_used = min(maxreads, n_reads)
        title = ', '.join(['run ' + seq_run + ' ' + adaID,
                           'sample ' + samplename,
                           'reads ' + str(n_reads_used) + '/' + str(n_reads),
                          ])
    plot_coverage(counts,
                  offset_x=refseq_start,
                  frags_pos=frags_pos, frags_pos_out=frags_pos_out,
                  title=title)

    if savefig:
        from hivwholeseq.sequencing.adapter_info import foldername_adapter
        plt.savefig(data_folder + foldername_adapter(adaID) +
                    'figures/coverage_premapped_' + samplename + '.png')

    return (counts, inserts)