def make_consensus(data_folder, adaID, fragment, n_iter, qual_min=20, VERBOSE=0,
                   coverage_min=10, summary=True):
    '''Make consensus sequence from the mapped reads.

    The reference of iteration n_iter is read from its FASTA file, allele
    counts and insertions are collected from the reads mapped in the same
    iteration, and a new consensus is built from them.

    Parameters:
       data_folder: forwarded to the filename helpers
       adaID, fragment: sample identifiers; they are concatenated to strings
          in the verbose message and forwarded to the filename helpers
       n_iter: iteration number
       qual_min: forwarded to the allele counting routine
       VERBOSE: verbosity level (any true value prints a progress line)
       coverage_min: forwarded to build_consensus
       summary: if true, append a line to the summary file

    Returns:
       (refseq, consensus_final): the reference record read from file and
       the output of build_consensus.
    '''
    if VERBOSE:
        print('Build consensus: '+adaID+' '+fragment+' iteration '+str(n_iter))

    # Read reference
    reffilename = get_reference_filename(data_folder, adaID, fragment, n_iter)
    refseq = SeqIO.read(reffilename, 'fasta')

    # Open BAM file (if it is not on disk yet, run the SAM -> BAM conversion)
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, n_iter)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # NOTE: match_len_min is not an argument of this function, it is looked up
    # in the enclosing (module) scope
    (counts, inserts) = get_allele_counts_insertions_from_file_unfiltered(
        bamfilename, len(refseq),
        qual_min=qual_min, match_len_min=match_len_min)

    consensus_final = build_consensus(counts, inserts,
                                      coverage_min=coverage_min,
                                      VERBOSE=VERBOSE)

    if summary:
        with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f:
            f.write('Consensus built for iteration '+str(n_iter))
            f.write('\n')

    return refseq, consensus_final
def make_index_and_hash(data_folder, adaID, fragment, n_iter, VERBOSE=0, summary=True):
    '''Run stampy twice: build the genome index (-G), then the hash (-H).

    Both files are built for the reference of iteration n_iter. The exit
    codes of the stampy calls are not inspected. If summary is true, a line
    is appended to the summary file afterwards.
    '''
    if VERBOSE:
        print('Build stampy hashes: '+adaID+' '+fragment+' iteration '+str(n_iter))

    # 1. Genome index, built from the reference file of this iteration
    index_cmd = [stampy_bin,
                 '--species="HIV adaID '+adaID+' fragment '+fragment+'"',
                 '--overwrite',
                 '-G', get_index_file(data_folder, adaID, fragment, n_iter, ext=False),
                 get_reference_filename(data_folder, adaID, fragment, n_iter),
                ]
    if VERBOSE >= 3:
        print(' '.join(index_cmd))
    sp.call(index_cmd)

    # 2. Hash file, built from the index made in step 1
    hash_cmd = [stampy_bin,
                '--overwrite',
                '-g', get_index_file(data_folder, adaID, fragment, n_iter, ext=False),
                '-H', get_hash_file(data_folder, adaID, fragment, n_iter, ext=False),
               ]
    if VERBOSE >= 3:
        print(' '.join(hash_cmd))
    sp.call(hash_cmd)

    if not summary:
        return

    with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f:
        f.write('Made index and hash files for iteration '+str(n_iter)+'\n')
def map_stampy(data_folder, adaID, fragment, n_iter, VERBOSE=0, summary=True):
    '''Call stampy to map the BAM of iteration n_iter - 1 into the SAM of n_iter.

    Index and hash files of iteration n_iter are used. The stampy settings
    (substitution rate, gap penalties, sensitive mode) are not arguments:
    they are taken from the enclosing scope. If summary is true, a line is
    appended to the summary file afterwards.
    '''
    if VERBOSE:
        print('Map via stampy: '+adaID+' '+fragment+' iteration '+str(n_iter))

    # Reads to map (previous iteration) and destination (this iteration)
    reads_bam = get_mapped_filename(data_folder, adaID, fragment,
                                    n_iter - 1, type='bam')
    mapped_sam = get_mapped_filename(data_folder, adaID, fragment,
                                     n_iter, type='sam')

    # Assemble the stampy command line
    stampy_args = [stampy_bin,
                   '-g', get_index_file(data_folder, adaID, fragment, n_iter, ext=False),
                   '-h', get_hash_file(data_folder, adaID, fragment, n_iter, ext=False),
                   '-o', mapped_sam,
                   '--overwrite',
                   '--substitutionrate='+subsrate,
                   '--gapopen', stampy_gapopen,
                   '--gapextend', stampy_gapextend]
    if stampy_sensitive:
        stampy_args.append('--sensitive')
    stampy_args.extend(['-M', reads_bam])

    # Every argument is converted to a string before joining and calling
    command = [str(arg) for arg in stampy_args]
    if VERBOSE >= 2:
        print(' '.join(command))
    sp.call(command)

    if not summary:
        return

    with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f:
        f.write('Mapped completed for iteration '+str(n_iter)+'\n')
def extract_reads_subsample(data_folder, adaID, fragment, n_reads, VERBOSE=0, summary=True):
    '''Copy a subsample of reads from the divided BAM into the iteration-1 BAM.

    The work is delegated to extract_mapped_reads_subsample; the value it
    returns is reported in the summary file when summary is true.
    '''
    from hivwholeseq.utils.mapping import extract_mapped_reads_subsample

    source_bam = get_divided_filename(data_folder, adaID, fragment, type='bam')
    target_bam = get_mapped_filename(data_folder, adaID, fragment,
                                     n_iter=1, type='bam')

    n_written = extract_mapped_reads_subsample(source_bam, target_bam,
                                               n_reads,
                                               VERBOSE=VERBOSE)

    if not summary:
        return

    with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f:
        # An empty line before the message, a newline after it
        f.write('\n'+'Subsample of reads copied: '+str(n_written)+'\n')
def extract_reads_subsample(data_folder, adaID, fragment, n_reads, VERBOSE=0, summary=True):
    '''Copy a subsample of reads from the divided BAM into the iteration-1 BAM.

    The work is delegated to extract_mapped_reads_subsample; the value it
    returns is reported in the summary file when summary is true.
    '''
    from hivwholeseq.utils.mapping import extract_mapped_reads_subsample

    source_bam = get_divided_filename(data_folder, adaID, fragment, type='bam')
    target_bam = get_mapped_filename(data_folder, adaID, fragment,
                                     n_iter=1, type='bam')

    n_written = extract_mapped_reads_subsample(source_bam, target_bam,
                                               n_reads,
                                               VERBOSE=VERBOSE)

    if not summary:
        return

    with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f:
        # An empty line before the message, a newline after it
        f.write('\n'+'Subsample of reads copied: '+str(n_written)+'\n')
def make_index_and_hash(data_folder, adaID, fragment, n_iter, VERBOSE=0, summary=True):
    '''Run stampy twice: build the genome index (-G), then the hash (-H).

    Both files are built for the reference of iteration n_iter. The exit
    codes of the stampy calls are not inspected. If summary is true, a line
    is appended to the summary file afterwards.
    '''
    if VERBOSE:
        print('Build stampy hashes: ' + adaID + ' ' + fragment + ' iteration ' + str(n_iter))

    # 1. Genome index, built from the reference file of this iteration
    index_cmd = [
        stampy_bin,
        '--species="HIV adaID ' + adaID + ' fragment ' + fragment + '"',
        '--overwrite',
        '-G', get_index_file(data_folder, adaID, fragment, n_iter, ext=False),
        get_reference_filename(data_folder, adaID, fragment, n_iter),
    ]
    if VERBOSE >= 3:
        print(' '.join(index_cmd))
    sp.call(index_cmd)

    # 2. Hash file, built from the index made in step 1
    hash_cmd = [
        stampy_bin,
        '--overwrite',
        '-g', get_index_file(data_folder, adaID, fragment, n_iter, ext=False),
        '-H', get_hash_file(data_folder, adaID, fragment, n_iter, ext=False),
    ]
    if VERBOSE >= 3:
        print(' '.join(hash_cmd))
    sp.call(hash_cmd)

    if not summary:
        return

    with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f:
        f.write('Made index and hash files for iteration ' + str(n_iter) + '\n')
def make_consensus(data_folder, adaID, fragment, n_iter, qual_min=20, VERBOSE=0,
                   coverage_min=10, summary=True):
    '''Make consensus sequence from the mapped reads.

    The reference of iteration n_iter is read from its FASTA file, allele
    counts and insertions are collected from the reads mapped in the same
    iteration, and a new consensus is built from them.

    Parameters:
       data_folder: forwarded to the filename helpers
       adaID, fragment: sample identifiers; they are concatenated to strings
          in the verbose message and forwarded to the filename helpers
       n_iter: iteration number
       qual_min: forwarded to the allele counting routine
       VERBOSE: verbosity level (any true value prints a progress line)
       coverage_min: forwarded to build_consensus
       summary: if true, append a line to the summary file

    Returns:
       (refseq, consensus_final): the reference record read from file and
       the output of build_consensus.
    '''
    if VERBOSE:
        print('Build consensus: ' + adaID + ' ' + fragment + ' iteration ' + str(n_iter))

    # Read reference
    reffilename = get_reference_filename(data_folder, adaID, fragment, n_iter)
    refseq = SeqIO.read(reffilename, 'fasta')

    # Open BAM file (if it is not on disk yet, run the SAM -> BAM conversion)
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, n_iter)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # NOTE: match_len_min is not an argument of this function, it is looked up
    # in the enclosing (module) scope
    (counts, inserts) = get_allele_counts_insertions_from_file_unfiltered(
        bamfilename, len(refseq),
        qual_min=qual_min, match_len_min=match_len_min)

    consensus_final = build_consensus(counts, inserts,
                                      coverage_min=coverage_min,
                                      VERBOSE=VERBOSE)

    if summary:
        with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f:
            f.write('Consensus built for iteration ' + str(n_iter))
            f.write('\n')

    return refseq, consensus_final
def map_stampy(data_folder, adaID, fragment, n_iter, VERBOSE=0, summary=True):
    '''Call stampy to map the BAM of iteration n_iter - 1 into the SAM of n_iter.

    Index and hash files of iteration n_iter are used. The stampy settings
    (substitution rate, gap penalties, sensitive mode) are not arguments:
    they are taken from the enclosing scope. If summary is true, a line is
    appended to the summary file afterwards.
    '''
    if VERBOSE:
        print('Map via stampy: ' + adaID + ' ' + fragment + ' iteration ' + str(n_iter))

    # Reads to map (previous iteration) and destination (this iteration)
    reads_bam = get_mapped_filename(data_folder, adaID, fragment,
                                    n_iter - 1, type='bam')
    mapped_sam = get_mapped_filename(data_folder, adaID, fragment,
                                     n_iter, type='sam')

    # Assemble the stampy command line
    stampy_args = [
        stampy_bin,
        '-g', get_index_file(data_folder, adaID, fragment, n_iter, ext=False),
        '-h', get_hash_file(data_folder, adaID, fragment, n_iter, ext=False),
        '-o', mapped_sam,
        '--overwrite',
        '--substitutionrate=' + subsrate,
        '--gapopen', stampy_gapopen,
        '--gapextend', stampy_gapextend,
    ]
    if stampy_sensitive:
        stampy_args.append('--sensitive')
    stampy_args.extend(['-M', reads_bam])

    # Every argument is converted to a string before joining and calling
    command = [str(arg) for arg in stampy_args]
    if VERBOSE >= 2:
        print(' '.join(command))
    sp.call(command)

    if not summary:
        return

    with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f:
        f.write('Mapped completed for iteration ' + str(n_iter) + '\n')
refseq, consensus = make_consensus(data_folder, adaID, fragment, n_iter, VERBOSE=VERBOSE, summary=summary) write_consensus_intermediate(data_folder, adaID, fragment, n_iter, consensus) match = check_new_old_consensi(refseq, consensus) # Start a new round if not converged if (not match) and (n_iter < iterations_max): n_iter += 1 if VERBOSE: print 'Starting again for iteration '+str(n_iter) if summary: with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f: f.write('\n') f.write('Starting new iteration '+str(n_iter)) f.write('\n') # or terminate else: write_consensus_final(seq_run, adaID, fragment, consensus) if VERBOSE: if match: print 'Consensus converged at iteration '+str(n_iter)+\ ': adaID', adaID, fragment else: print 'Maximal number of iterations reached: adaID', \ adaID, fragment
VERBOSE=VERBOSE, summary=summary) write_consensus_intermediate(data_folder, adaID, fragment, n_iter, consensus) match = check_new_old_consensi(refseq, consensus) # Start a new round if not converged if (not match) and (n_iter < iterations_max): n_iter += 1 if VERBOSE: print 'Starting again for iteration ' + str(n_iter) if summary: with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f: f.write('\n') f.write('Starting new iteration ' + str(n_iter)) f.write('\n') # or terminate else: write_consensus_final(seq_run, adaID, fragment, consensus) if VERBOSE: if match: print 'Consensus converged at iteration '+str(n_iter)+\ ': adaID', adaID, fragment else: print 'Maximal number of iterations reached: adaID', \ adaID, fragment