def process_sample_coverage(job, addresses, keyspace, auth, sample, program, samples):
    """Store the per-amplicon coverage of one library in the database.

    Reads ``<library_name>.sambamba_coverage.bed`` (tab-delimited with a single
    header row, path relative to the current directory) and creates one
    ``SampleCoverage`` and one ``AmpliconCoverage`` record for every data row.
    Every header column whose name starts with ``percentage`` is treated as a
    coverage-threshold column; the integer that follows the prefix is the
    threshold.

    :param job: job handle supplied by the caller; not used here.
    :param addresses: forwarded to ``connection.setup``.
    :param keyspace: forwarded to ``connection.setup``.
    :param auth: forwarded to ``connection.setup`` as ``auth_provider``.
    :param sample: key into ``samples`` selecting the library to load.
    :param program: value stored as ``program_name`` on every record.
    :param samples: mapping of sample key to a dict providing
        ``library_name``, ``sample_name``, ``run_id``,
        ``num_libraries_in_run``, ``sequencer``, ``extraction``, ``panel``
        and ``target_pool``.
    :raises StopIteration: if the coverage file has no header row.
    :raises KeyError: if the sample entry lacks one of the keys listed above.
    :raises ValueError: if a ``percentage`` column suffix is not an integer.
    :raises IndexError: if a data row has fewer columns than are read.
    """
    connection.setup(addresses, keyspace, auth_provider=auth)

    info = samples[sample]
    coverage_path = "{}.sambamba_coverage.bed".format(info['library_name'])

    # Binary mode is what the Python 2 csv module expects; under Python 3 the
    # file would have to be opened in text mode with newline='' instead.
    with open(coverage_path, 'rb') as coverage:
        reader = csv.reader(coverage, delimiter='\t')
        header = next(reader)

        # (threshold, column position) for every "percentage<N>" header cell.
        threshold_columns = [
            (int(column.replace('percentage', '')), position)
            for position, column in enumerate(header)
            if column.startswith("percentage")
        ]
        thresholds = [threshold for threshold, _ in threshold_columns]

        # Fields that are the same for every row of this library.
        shared = dict(
            sample=info['sample_name'],
            library_name=info['library_name'],
            run_id=info['run_id'],
            num_libraries_in_run=info['num_libraries_in_run'],
            sequencer_id=info['sequencer'],
            program_name=program,
            extraction=info['extraction'],
            panel=info['panel'],
            target_pool=info['target_pool'],
        )

        for row in reader:
            # NOTE(review): values are stored exactly as read from the file
            # (strings); presumably the model columns coerce them - confirm.
            threshold_data = defaultdict(float)
            for threshold, position in threshold_columns:
                threshold_data[threshold] = row[position]

            # Both models receive the same set of keyword arguments.
            record = dict(
                shared,
                amplicon=row[3],
                num_reads=row[4],
                mean_coverage=row[5],
                thresholds=thresholds,
                perc_bp_cov_at_thresholds=threshold_data,
            )
            SampleCoverage.create(**record)
            AmpliconCoverage.create(**record)
def subsample_bam(job, addresses, keyspace, auth, name, samples, config, seed, fraction, iteration):
    """Use samtools view to subsample an input file to the specified fraction.

    Runs, in order: ``samtools view -s`` on
    ``<library_name>.recalibrated.sorted.bam``, ``samtools index`` on the
    subsampled BAM, and ``sambamba depth region`` on it. The resulting
    tab-delimited coverage file is then read and one ``SampleCoverage`` plus
    one ``AmpliconCoverage`` record is created per data row.

    NOTE(review): this file contains a second, later definition of
    ``subsample_bam``; the later one replaces this one when the module loads.

    :param job: job handle; ``job.fileStore.logToMaster`` is used for logging.
    :param addresses: forwarded to ``connection.setup``.
    :param keyspace: forwarded to ``connection.setup``.
    :param auth: forwarded to ``connection.setup`` as ``auth_provider``.
    :param name: key into ``samples`` selecting the library to subsample.
    :param samples: mapping of sample key to a dict providing
        ``library_name``, ``sample_name``, ``num_libraries_in_run``,
        ``regions``, ``sequencer``, ``extraction``, ``panel`` and
        ``target_pool``.
    :param config: mapping providing ``sambamba.bin``, ``sambamba.num_cores``,
        ``coverage_threshold`` and ``coverage_threshold2``.
    :param seed: integer part of the value given to ``samtools view -s``.
    :param fraction: subsampling percentage; also pasted in as the decimal
        part of the ``-s`` value.
    :param iteration: replicate label used in the output file names.
    :raises ZeroDivisionError: if ``fraction`` is zero.
    :raises ValueError: if ``fraction`` or the library count is not numeric,
        or a ``percentage`` column suffix is not an integer.
    :raises StopIteration: if the coverage file has no header row.
    """
    info = samples[name]

    # Computed before any external command runs so that a bad ``fraction``
    # fails immediately. ``fraction`` is a percentage here: the library count
    # is scaled by 1 / (fraction / 100).
    num_libs = (float(info['num_libraries_in_run']) *
                (1 / (float(fraction) / 100.00)))

    library_name = "subsample-{}-{}-{}".format(info['library_name'], fraction, iteration)
    run_id = "subsample-{}".format(fraction)

    sublog = "subsample-{}-{}-{}.log".format(name, fraction, iteration)
    input_bam = "{}.recalibrated.sorted.bam".format(info['library_name'])
    subsampled_bam = "{}.bam".format(library_name)
    index_log = "{}.index.log".format(subsampled_bam)
    output = "{}.sambamba_coverage.bed".format(subsampled_bam)
    logfile = "{}.sambamba_coverage.log".format(subsampled_bam)

    # NOTE(review): ``fraction`` becomes the digits after the decimal point,
    # so a single-digit value such as 5 yields ".5" while ``num_libs`` above
    # treats it as 5% - confirm callers always pass two-digit values.
    samcommand = "samtools view -s {seed}.{fraction} -b {input} > {output}".format(
        seed=seed, fraction=fraction, input=input_bam, output=subsampled_bam)
    index_command = "samtools index {}".format(subsampled_bam)
    coverage_command = " ".join((
        "{}".format(config['sambamba']['bin']),
        "depth region",
        "-L", "{}".format(info['regions']),
        "-t", "{}".format(config['sambamba']['num_cores']),
        "-T", "{}".format(config['coverage_threshold']),
        "-T", "{}".format(config['coverage_threshold2']),
        "{}".format(subsampled_bam),
        ">", "{}".format(output),
    ))

    job.fileStore.logToMaster("Samtools ViewCommand: {}\n".format(samcommand))
    pipeline.run_and_log_command(samcommand, sublog)

    job.fileStore.logToMaster("Samtools Index Command: {}\n".format(index_command))
    pipeline.run_and_log_command(index_command, index_log)

    # Log the exact command line that is executed, not a tuple repr.
    job.fileStore.logToMaster("SamBamba Coverage Command: {}\n".format(coverage_command))
    pipeline.run_and_log_command(coverage_command, logfile)

    connection.setup(addresses, keyspace, auth_provider=auth)
    # Report the file being loaded; this message previously repeated the
    # samtools command.
    job.fileStore.logToMaster("Adding coverage data: {}\n".format(output))

    # Binary mode is what the Python 2 csv module expects; under Python 3 the
    # file would have to be opened in text mode with newline='' instead.
    with open(output, 'rb') as coverage:
        reader = csv.reader(coverage, delimiter='\t')
        header = next(reader)

        # (threshold, column position) for every "percentage<N>" header cell.
        threshold_columns = [
            (int(column.replace('percentage', '')), position)
            for position, column in enumerate(header)
            if column.startswith("percentage")
        ]
        thresholds = [threshold for threshold, _ in threshold_columns]

        # Fields that are the same for every row of this subsample.
        shared = dict(
            sample=info['sample_name'],
            library_name=library_name,
            run_id=run_id,
            num_libraries_in_run=num_libs,
            sequencer_id=info['sequencer'],
            program_name="sambamba",
            extraction=info['extraction'],
            panel=info['panel'],
            target_pool=info['target_pool'],
        )

        for row in reader:
            # NOTE(review): values are stored exactly as read from the file
            # (strings); presumably the model columns coerce them - confirm.
            threshold_data = defaultdict(float)
            for threshold, position in threshold_columns:
                threshold_data[threshold] = row[position]

            # Both models receive the same set of keyword arguments.
            record = dict(
                shared,
                amplicon=row[3],
                num_reads=row[4],
                mean_coverage=row[5],
                thresholds=thresholds,
                perc_bp_cov_at_thresholds=threshold_data,
            )
            SampleCoverage.create(**record)
            AmpliconCoverage.create(**record)
def subsample_bam(job, addresses, keyspace, auth, name, samples, config, seed, fraction, iteration):
    """Use samtools view to subsample an input file to the specified fraction.

    Runs, in order: ``samtools view -s`` on
    ``<library_name>.recalibrated.sorted.bam``, ``samtools index`` on the
    subsampled BAM, and ``sambamba depth region`` on it. The resulting
    tab-delimited coverage file is then read and one ``SampleCoverage`` plus
    one ``AmpliconCoverage`` record is created per data row.

    NOTE(review): an earlier definition of ``subsample_bam`` exists in this
    file; this later definition is the one in effect once the module loads.

    :param job: job handle; ``job.fileStore.logToMaster`` is used for logging.
    :param addresses: forwarded to ``connection.setup``.
    :param keyspace: forwarded to ``connection.setup``.
    :param auth: forwarded to ``connection.setup`` as ``auth_provider``.
    :param name: key into ``samples`` selecting the library to subsample.
    :param samples: mapping of sample key to a dict providing
        ``library_name``, ``sample_name``, ``num_libraries_in_run``,
        ``regions``, ``sequencer``, ``extraction``, ``panel`` and
        ``target_pool``.
    :param config: mapping providing ``sambamba.bin``, ``sambamba.num_cores``,
        ``coverage_threshold`` and ``coverage_threshold2``.
    :param seed: integer part of the value given to ``samtools view -s``.
    :param fraction: subsampling percentage; also pasted in as the decimal
        part of the ``-s`` value.
    :param iteration: replicate label used in the output file names.
    :raises ZeroDivisionError: if ``fraction`` is zero.
    :raises ValueError: if ``fraction`` or the library count is not numeric,
        or a ``percentage`` column suffix is not an integer.
    :raises StopIteration: if the coverage file has no header row.
    """
    info = samples[name]

    # Computed before any external command runs so that a bad ``fraction``
    # fails immediately. ``fraction`` is a percentage here: the library count
    # is scaled by 1 / (fraction / 100).
    num_libs = (float(info['num_libraries_in_run']) *
                (1 / (float(fraction) / 100.00)))

    library_name = "subsample-{}-{}-{}".format(info['library_name'], fraction, iteration)
    run_id = "subsample-{}".format(fraction)

    sublog = "subsample-{}-{}-{}.log".format(name, fraction, iteration)
    input_bam = "{}.recalibrated.sorted.bam".format(info['library_name'])
    subsampled_bam = "{}.bam".format(library_name)
    index_log = "{}.index.log".format(subsampled_bam)
    output = "{}.sambamba_coverage.bed".format(subsampled_bam)
    logfile = "{}.sambamba_coverage.log".format(subsampled_bam)

    # NOTE(review): ``fraction`` becomes the digits after the decimal point,
    # so a single-digit value such as 5 yields ".5" while ``num_libs`` above
    # treats it as 5% - confirm callers always pass two-digit values.
    samcommand = "samtools view -s {seed}.{fraction} -b {input} > {output}".format(
        seed=seed, fraction=fraction, input=input_bam, output=subsampled_bam)
    index_command = "samtools index {}".format(subsampled_bam)
    coverage_command = " ".join((
        "{}".format(config['sambamba']['bin']),
        "depth region",
        "-L", "{}".format(info['regions']),
        "-t", "{}".format(config['sambamba']['num_cores']),
        "-T", "{}".format(config['coverage_threshold']),
        "-T", "{}".format(config['coverage_threshold2']),
        "{}".format(subsampled_bam),
        ">", "{}".format(output),
    ))

    job.fileStore.logToMaster("Samtools ViewCommand: {}\n".format(samcommand))
    pipeline.run_and_log_command(samcommand, sublog)

    job.fileStore.logToMaster("Samtools Index Command: {}\n".format(index_command))
    pipeline.run_and_log_command(index_command, index_log)

    # Log the exact command line that is executed, not a tuple repr.
    job.fileStore.logToMaster("SamBamba Coverage Command: {}\n".format(coverage_command))
    pipeline.run_and_log_command(coverage_command, logfile)

    connection.setup(addresses, keyspace, auth_provider=auth)
    # Report the file being loaded; this message previously repeated the
    # samtools command.
    job.fileStore.logToMaster("Adding coverage data: {}\n".format(output))

    # Binary mode is what the Python 2 csv module expects; under Python 3 the
    # file would have to be opened in text mode with newline='' instead.
    with open(output, 'rb') as coverage:
        reader = csv.reader(coverage, delimiter='\t')
        header = next(reader)

        # (threshold, column position) for every "percentage<N>" header cell.
        threshold_columns = [
            (int(column.replace('percentage', '')), position)
            for position, column in enumerate(header)
            if column.startswith("percentage")
        ]
        thresholds = [threshold for threshold, _ in threshold_columns]

        # Fields that are the same for every row of this subsample.
        shared = dict(
            sample=info['sample_name'],
            library_name=library_name,
            run_id=run_id,
            num_libraries_in_run=num_libs,
            sequencer_id=info['sequencer'],
            program_name="sambamba",
            extraction=info['extraction'],
            panel=info['panel'],
            target_pool=info['target_pool'],
        )

        for row in reader:
            # NOTE(review): values are stored exactly as read from the file
            # (strings); presumably the model columns coerce them - confirm.
            threshold_data = defaultdict(float)
            for threshold, position in threshold_columns:
                threshold_data[threshold] = row[position]

            # Both models receive the same set of keyword arguments.
            record = dict(
                shared,
                amplicon=row[3],
                num_reads=row[4],
                mean_coverage=row[5],
                thresholds=thresholds,
                perc_bp_cov_at_thresholds=threshold_data,
            )
            SampleCoverage.create(**record)
            AmpliconCoverage.create(**record)