def _symlink_vcfs(callers, datestamp_var_dirpath):
    errory = []
    for caller in callers:
        info(caller.name)
        for sample in caller.samples:
            info(sample.name)
            filt_vcf_fpath = sample.find_filt_vcf_by_callername(caller.name)
            if not verify_file(filt_vcf_fpath):
                errory.append([sample.name, caller.name, filt_vcf_fpath])
            else:
                base_filt_fpath = filt_vcf_fpath[:-3] if filt_vcf_fpath.endswith('.gz') else filt_vcf_fpath
                for fpath in [base_filt_fpath + '.gz',
                              base_filt_fpath + '.idx',
                              base_filt_fpath + '.gz.tbi']:
                    if verify_file(fpath, silent=True):
                        _symlink_to_dir(fpath, sample.dirpath)
                        # _symlink_to_dir(fpath, datestamp_var_dirpath)
                BCBioStructure.move_vcfs_to_var(sample)
    return errory

def _correct_qualimap_insert_size_histogram(cnf, samples):
    """ Replace the Qualimap insert size histogram with the Picard one. """
    for s in samples:
        qualimap1_dirname = dirname(s.qualimap_ins_size_hist_fpath).replace('raw_data_qualimapReport', 'raw_data')
        qualimap2_dirname = dirname(s.qualimap_ins_size_hist_fpath)
        if exists(qualimap1_dirname):
            if not exists(qualimap2_dirname):
                shutil.move(qualimap1_dirname, qualimap2_dirname)
            else:
                shutil.rmtree(qualimap1_dirname)
        elif not exists(qualimap2_dirname):
            continue  # no data from either Qualimap v.1 or Qualimap v.2

        # if the Qualimap histogram already exists and reuse_intermediate is set, skip
        if verify_file(s.qualimap_ins_size_hist_fpath, silent=True) and cnf.reuse_intermediate:
            pass
        else:
            if verify_file(s.picard_ins_size_hist_txt_fpath):
                with open(s.picard_ins_size_hist_txt_fpath, 'r') as picard_f:
                    one_line_to_stop = False
                    for line in picard_f:
                        if one_line_to_stop:
                            break
                        if line.startswith('## HISTOGRAM'):
                            one_line_to_stop = True
                    with file_transaction(cnf.work_dir, s.qualimap_ins_size_hist_fpath) as tx:
                        with open(tx, 'w') as qualimap_f:
                            for line in picard_f:
                                qualimap_f.write(line)

def main():
    info(' '.join(sys.argv))
    info()
    cnf, bcbio_structure = bcbio_summary_script_proc_params(
        'expression', BCBioStructure.expression_dir)
    step_greetings('Gene expression heatmaps summary for all samples')

    report_caption_names = ['Gene counts', 'Exon counts', 'Gene TPM', 'Isoform TPM']
    genes_dict, transcripts_dict = _get_gene_transcripts_id(cnf)

    for counts_fname, report_caption_name in zip(bcbio_structure.counts_names, report_caption_names):
        counts_fpath = join(bcbio_structure.expression_dirpath, counts_fname)
        if not verify_file(counts_fpath, silent=True):
            raw_counts_fpath = join(bcbio_structure.expression_dirpath, 'raw',
                                    'combined.' + counts_fname.replace('.tsv', ''))
            info('Annotating ' + report_caption_name + ' from ' + raw_counts_fpath)
            annotate_gene_counts(cnf, raw_counts_fpath, counts_fpath, genes_dict)
        verify_file(counts_fpath, is_critical=True, description=counts_fname)

        isoforms_found = counts_fname == 'isoform.sf.tpm' and counts_fpath
        used_dict = transcripts_dict if isoforms_found else genes_dict
        report_fpath = join(safe_mkdir(join(bcbio_structure.expression_dirpath, 'html')),
                            counts_fname.replace('.tsv', '') + '.html')
        make_gene_expression_heatmaps(cnf, bcbio_structure, counts_fpath, used_dict, report_fpath,
                                      report_caption_name, keep_gene_names=isoforms_found)
    info('Done')

def index_vcf(cnf, sample_name, filt_vcf_fpath, caller_name=None):
    if cnf is None:
        global glob_cnf
        cnf = glob_cnf

    info()
    info(sample_name + ((', ' + caller_name) if caller_name else '') + ': indexing')

    # for fpath in [pass_vcf_fpath, filt_vcf_fpath]:
    #     if not cnf.reuse_intermediate and not verify_file(fpath, silent=True):
    #         err(fpath + ' does not exist - cannot IGV index')
    #     else:
    #         if cnf.reuse_intermediate and verify_file(fpath + '.idx', silent=True):
    #             info('Reusing existing ' + fpath + '.idx')
    #         else:
    #             igvtools_index(cnf, fpath)

    if not cnf.reuse_intermediate and not verify_file(filt_vcf_fpath, silent=True):
        err(filt_vcf_fpath + ' does not exist - cannot gzip and tabix')
    else:
        if cnf.reuse_intermediate and verify_file(filt_vcf_fpath + '.gz', silent=True) \
                and verify_file(filt_vcf_fpath + '.gz.tbi', silent=True):
            info(filt_vcf_fpath + '.gz and .gz.tbi exist; reusing')
        else:
            bgzip_and_tabix(cnf, filt_vcf_fpath)

def get_chr_lengths_from_seq(seq_fpath):
    chr_lengths = []

    if seq_fpath.endswith('.fai'):
        seq_fpath = splitext(seq_fpath)[0]

    if verify_file(seq_fpath + '.fai', silent=True):
        info('Reading genome index file (.fai) to get chromosome lengths')
        with open(adjust_path(seq_fpath + '.fai'), 'r') as handle:
            for line in handle:
                line = line.strip()
                if line:
                    # cast to int so lengths from the .fai match the int lengths from the fasta branch
                    chrom, length = line.split()[0], int(line.split()[1])
                    chr_lengths.append((chrom, length))
    elif verify_file(seq_fpath, silent=True):
        info('Reading genome sequence (.fa) to get chromosome lengths')
        with open(adjust_path(seq_fpath), 'r') as handle:
            from Bio import SeqIO
            reference_records = SeqIO.parse(handle, 'fasta')
            for record in reference_records:
                chrom = record.id
                chr_lengths.append((chrom, len(record.seq)))
    else:
        critical('Can\'t find ' + seq_fpath + ' and ' + seq_fpath + '.fai')

    return chr_lengths

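# Example usage of get_chr_lengths_from_seq (paths are hypothetical; assumes this module's helpers are importable):
#
#     chr_lengths = get_chr_lengths_from_seq('/refs/hg38/seq/hg38.fa')  # a '.fa.fai' index path works too
#     chr_lengths_dict = dict(chr_lengths)
#     info('chr1 length: ' + str(chr_lengths_dict.get('chr1')))
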
def __final_seq2c_scripts(cnf, read_stats_fpath, combined_gene_depths_fpath, output_fpath):
    cov2lr = get_script_cmdline(cnf, 'perl', join('Seq2C', 'cov2lr.pl'), is_critical=True)
    cov2lr_output = join(cnf.work_dir, splitext(basename(output_fpath))[0] + '.cov2lr.tsv')

    controls = ''
    lr2gene_opt = ''
    if cnf.controls:
        controls = '-c ' + cnf.controls  # ':'.join([adjust_path(fpath) for fpath in cnf.controls.split(':')])
        lr2gene_opt = '-c'

    cmdline = '{cov2lr} -a {controls} {read_stats_fpath} {combined_gene_depths_fpath}'.format(**locals())
    call(cnf, cmdline, cov2lr_output, exit_on_error=False)
    info()

    if not verify_file(cov2lr_output):
        return None

    seq2c_opts = cnf.seq2c_opts or ''
    lr2gene = get_script_cmdline(cnf, 'perl', join('Seq2C', 'lr2gene.pl'), is_critical=True)
    cmdline = '{lr2gene} {lr2gene_opt} {seq2c_opts} {cov2lr_output}'.format(**locals())
    res = call(cnf, cmdline, output_fpath, exit_on_error=False)
    info()

    if not verify_file(output_fpath):
        return None
    return res

def extract_variant_from_bams(cnf, out_dirpath, transcripts, chr_length, samples, chrom, variant, bams_created_before):
    padding = 500
    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    pos, ref, alt, variant_transcripts = variant['pos'], variant['ref'], variant['alt'], variant['transcripts']

    bam_prefix = None
    for transcript in variant_transcripts:
        transcript_name = sorted(variant_transcripts)[0]
        transcript_exons = transcripts[(transcript, chrom)]
        for idx, exon in enumerate(transcript_exons):
            if exon['start'] <= pos <= exon['stop']:
                start, end = exon['start'], exon['stop']
                bam_prefix = '{chrom}-{transcript_name}-{idx}-'.format(**locals())
        if bam_prefix:
            break
    if not bam_prefix:
        start, end = max(1, pos - padding), min(chr_length, pos + padding)
        ref_ = ref[:20]
        alt_ = alt[:20]
        bam_prefix = '{chrom}-{pos}-{ref_}-{alt_}-'.format(**locals())

    bams_by_sample = dict()
    for sample in samples:
        sample_name = sample.name.replace('-', '_')
        output_bam_fpath = join(out_dirpath, bam_prefix + '{sample_name}.bam'.format(**locals()))
        if output_bam_fpath in bams_created_before:
            continue
        if cnf.reuse_intermediate and verify_file(output_bam_fpath, silent=True):
            bams_by_sample[sample.name] = output_bam_fpath
        else:
            cmdline = '{sambamba} slice {sample.bam} {chrom}:{start}-{end} -o {output_bam_fpath}'.format(**locals())
            call(cnf, cmdline, silent=not cnf.verbose)
            if verify_file(output_bam_fpath, silent=True):
                cmdline = '{sambamba} index {output_bam_fpath}'.format(**locals())
                call(cnf, cmdline, silent=not cnf.verbose)
                bams_by_sample[sample.name] = output_bam_fpath
    return bams_by_sample

def launch_bedcoverage_hist(work_dir, bed, bam, chr_lengths_fpath, bedcov_output_fpath=None, bedtools='bedtools'):
    if not bedcov_output_fpath:
        bedcov_output_fpath = join(work_dir,
                                   splitext_plus(basename(bed))[0] + '__' +
                                   splitext_plus(basename(bam))[0] + '_bedcov_output.txt')

    if bam.endswith('bam'):
        bam = bam_to_bed_nocnf(bam, bedtools)
        verify_file(bam, is_critical=True, description='BAM to BED conversion result')

    v = bedtools_version(bedtools)
    if v and v >= 24:
        cmdline = '{bedtools} coverage -sorted -g {chr_lengths_fpath} -a {bed} -b {bam} -hist'.format(**locals())
    else:
        cmdline = '{bedtools} coverage -a {bam} -b {bed} -hist'.format(**locals())
    cmdline += ' > ' + bedcov_output_fpath
    info(cmdline)
    os.system(cmdline)
    res = verify_file(bedcov_output_fpath)
    if res:
        info('Done, saved to ' + bedcov_output_fpath)
    else:
        err('Error, result is non-existent or empty')

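# For reference, the command built by launch_bedcoverage_hist looks roughly like this
# (file names are hypothetical; the input BAM is first converted to BED by bam_to_bed_nocnf):
#
#     bedtools coverage -sorted -g hg38.genome -a target.bed -b sample.bed -hist > target__sample_bedcov_output.txt
#
# and, for bedtools versions older than 2.24:
#
#     bedtools coverage -a sample.bed -b target.bed -hist > target__sample_bedcov_output.txt
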
def find_fastq_pairs_by_sample_names(fastq_fpaths, sample_names):
    fastq_by_sn = OrderedDict()

    for sn in sample_names:
        sn_fastq_fpaths = sorted([f for f in fastq_fpaths if basename(f).startswith(sn + '_R')])
        if len(sn_fastq_fpaths) == 0:
            err('Error: no fastq found for ' + sn)
            fastq_by_sn[sn] = None
        elif len(sn_fastq_fpaths) > 2:
            critical('Error: more than 2 fastq files starting with ' + sn + '_R: ' + ', '.join(sn_fastq_fpaths))
        elif len(sn_fastq_fpaths) == 1:
            warn('Warning: only a single fastq file was found for ' + sn + '. Treating as single-end reads.')
            fastq_by_sn[sn] = [verify_file(sn_fastq_fpaths[0],
                                           description='sn_fastq_fpaths[0] for ' + str(sn)), None]
        else:
            fastq_by_sn[sn] = [verify_file(fpath, description='fpath from sn_fastq_fpaths for ' + str(sn))
                               for fpath in sn_fastq_fpaths]

    return fastq_by_sn

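# find_fastq_pairs_by_sample_names matches files named <sample>_R*; a sketch with hypothetical paths:
#
#     pairs = find_fastq_pairs_by_sample_names(
#         ['/data/S1_R1.fastq.gz', '/data/S1_R2.fastq.gz', '/data/S2_R1.fastq.gz'],
#         ['S1', 'S2'])
#     # -> {'S1': [S1_R1, S1_R2], 'S2': [S2_R1, None]}  (S2 is treated as single-end)
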
def check_genome_resources(cnf):
    if cnf.genome is None:
        critical('Please specify a genome build (one of those available in ' + cnf.sys_cnf +
                 ') using the --genome option (e.g., --genome hg38).')

    if not cnf.genomes:
        critical('"genomes" section is not specified in the system config ' + cnf.sys_cnf)

    info('Genome: ' + str(cnf.genome.name))

    for key in cnf.genome.keys():
        if key != 'name' and isinstance(cnf.genome[key], basestring):
            cnf.genome[key] = adjust_system_path(cnf.genome[key])
            if not verify_obj_by_path(cnf.genome[key], key, silent=True):
                if not cnf.genome[key].endswith('.gz') and verify_file(cnf.genome[key] + '.gz', silent=True):
                    gz_fpath = cnf.genome[key] + '.gz'
                    if verify_file(gz_fpath, silent=True):
                        cnf.genome[key] = gz_fpath

    if not cnf.genome.features or not cnf.genome.bed_annotation_features or not cnf.genome.cds:
        warn('Warning: features, bed_annotation_features, and cds must be specified in the system config (' +
             cnf.sys_cnf + ').')

    if not cnf.transcripts_fpath:
        cnf.transcripts_fpath = cnf.transcripts_fpath or get_canonical_transcripts(cnf.genome.name, ensembl=True)

def submit_job(cnf, cmdline, job_name, wait_for_steps=None, threads=1,
               output_fpath=None, stdout_to_outputfile=True, run_on_chara=False, **kwargs):
    prefix = str(cnf.project_name) + '_'
    if job_name:
        prefix += job_name + '_'
    prefix += datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + '_'

    f, done_marker_fpath = make_tmpfile(cnf, prefix=prefix, suffix='.done')
    f, error_marker_fpath = make_tmpfile(cnf, prefix=prefix, suffix='.error')
    if isfile(done_marker_fpath):
        os.remove(done_marker_fpath)
    if isfile(error_marker_fpath):
        os.remove(error_marker_fpath)
    job_id = basename(splitext(done_marker_fpath)[0])

    tx_output_fpath = None
    if output_fpath:
        if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            j = JobRunning(None, None, None, None, None, output_fpath=output_fpath, **kwargs)
            j.is_done = True
            return j
        if stdout_to_outputfile:
            tx_output_fpath = output_fpath + '.tx'
            if isfile(tx_output_fpath):
                os.remove(tx_output_fpath)
            cmdline += ' > ' + tx_output_fpath
        else:
            if isfile(output_fpath):
                os.remove(output_fpath)

    qsub = get_system_path(cnf, 'qsub', is_critical=True)
    bash = get_system_path(cnf, 'bash', is_critical=True)

    if cnf.log_dir:
        err_fpath = log_fpath = join(cnf.log_dir, job_id + '.log')
    else:
        fd, fpath = make_tmpfile(cnf, suffix=job_id + '.log', text=True)
        err_fpath = log_fpath = fpath

    queue = cnf.queue
    runner_script = adjust_system_path(cnf.qsub_runner)
    verify_file(runner_script, is_critical=True, description='qsub_runner')
    hold_jid_line = '-hold_jid ' + ','.join(wait_for_steps or ['_'])
    mem = threads * 15
    priority = 0
    if cnf.qsub_priority:
        priority = cnf.qsub_priority
    extra_qsub_opts = ''
    if run_on_chara and is_us():
        extra_qsub_opts += '-l h="chara|rask"'

    cmdline = cmdline.replace('"', '\\"').replace('\\\\"', '\\"')
    qsub_cmdline = (
        '{qsub} -pe smp {threads} {extra_qsub_opts} -S {bash} -q {queue} -p {priority} '
        '-j n -o {log_fpath} -e {err_fpath} {hold_jid_line} '
        '-N {job_id} {runner_script} {done_marker_fpath} {error_marker_fpath} "{cmdline}"'.format(**locals()))

    info('Submitting job ' + job_id)
    info(qsub_cmdline)

    job = JobRunning(job_id, log_fpath, qsub_cmdline, done_marker_fpath, error_marker_fpath,
                     output_fpath=output_fpath, tx_output_fpath=tx_output_fpath,
                     stdout_to_outputfile=stdout_to_outputfile, **kwargs)
    call(cnf, qsub_cmdline, silent=True)
    return job

def proc_args(argv):
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls',
                      help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        parser.print_usage()
        sys.exit(1)

    if len(args) == 1 and not args[0].endswith('.bam'):
        sample_names, bam_fpaths = read_samples(
            verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict()
        for s, b in zip(sample_names, bam_fpaths):
            bam_by_sample[s] = b
    else:
        bam_by_sample = find_bams(args)

    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
        for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info(' ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided in the sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir

def calculate_coverage_use_grid(cnf, samples, output_dirpath):
    assert len(samples) > 0

    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    chr_len_fpath = get_chr_len_fpath(cnf)

    jobs_to_wait = []

    for sample in samples:
        sample_output_dirpath = join(output_dirpath, sample.name)
        safe_mkdir(sample_output_dirpath)

    for chrom in chromosomes:
        info('Processing chromosome ' + chrom)
        avg_cov_output_fpath = join(output_dirpath, chrom + '.txt.gz')
        sample_output_fpaths = [join(output_dirpath, sample.name, chrom + '.txt.gz') for sample in samples]
        sample_names = ','.join(sample.name for sample in samples)

        chrom_bams = []
        for sample in samples:
            if not verify_file(sample.bam):
                err('BAM for ' + sample.name + ' does not exist!')
                continue
            output_bam_fpath = join(cnf.work_dir, basename(sample.name) + '_' + str(chrom) + '.bam')
            cmdline = '{sambamba} slice {sample.bam} {chrom}'.format(**locals())
            call(cnf, cmdline, output_fpath=output_bam_fpath)
            if verify_file(output_bam_fpath):
                chrom_bams.append(output_bam_fpath)
        bam_fpaths = ','.join(chrom_bams)

        if cnf.reuse_intermediate and verify_file(avg_cov_output_fpath, silent=True) and \
                all(verify_file(output_fpath, silent=True) for output_fpath in sample_output_fpaths):
            info(avg_cov_output_fpath + ' exists, reusing')
        else:
            j = _submit_region_cov(cnf, cnf.work_dir, chrom, bam_fpaths, sample_names,
                                   output_dirpath, chr_len_fpath)
            if j and not j.is_done:
                jobs_to_wait.append(j)
            info()

        if len(jobs_to_wait) >= cnf.threads:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
            jobs_to_wait = []
        elif not jobs_to_wait:
            info('No jobs to submit.')

    if jobs_to_wait:
        wait_for_jobs(cnf, jobs_to_wait)

def determine_sys_cnf(opts):
    if 'sys_cnf' in opts.__dict__ and opts.sys_cnf:
        return verify_file(opts.sys_cnf, is_critical=True)
    else:
        opts.__dict__['sys_cnf'] = verify_file(detect_sys_cnf_by_location(), is_critical=True)

    debug('Using system configuration ' + opts.sys_cnf)
    return opts.sys_cnf

def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        dic = load_yaml(open(fpath), Loader=Loader)
    except:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic

def add_project_files_to_jbrowse(cnf, bcbio_structure):
    genome = cnf.genome.name
    jbrowse_data_path, _, _ = set_folders(genome)

    jbrowse_dirpath = join(jbrowse_data_path, 'tracks')
    jbrowse_project_dirpath = join(jbrowse_dirpath, bcbio_structure.project_name)
    safe_mkdir(jbrowse_project_dirpath)
    jbrowse_tracks_fpath = join(jbrowse_data_path, 'tracks.conf')

    vcf_fpath_by_sample = None
    caller = bcbio_structure.variant_callers.get('vardict') or \
             bcbio_structure.variant_callers.get('vardict-java')
    if caller:
        vcf_fpath_by_sample = caller.get_filt_vcf_by_sample()

    for sample in bcbio_structure.samples:
        if sample.bam:
            index_bam(cnf, sample.bam, use_grid=True)

    for sample in bcbio_structure.samples:
        if all(isfile(join(jbrowse_project_dirpath, sample.name + ext))
               for ext in ['.bam', '.bam.bai', '.vcf.gz', '.vcf.gz.tbi', '.bigwig']) \
                and check_tracks_in_configs(sample.name, bcbio_structure.project_name,
                                            jbrowse_tracks_fpath, vcf_fpath_by_sample):
            info(sample.name + ' was exported to jBrowse previously.')
            continue

        vcf_link = None
        if vcf_fpath_by_sample:
            vcf_fpath = vcf_fpath_by_sample[sample.name] if sample.name in vcf_fpath_by_sample else None
            if vcf_fpath and verify_file(vcf_fpath):
                vcf_link = create_jbrowse_symlink(genome, bcbio_structure.project_name, sample.name, vcf_fpath)
                if not verify_file(vcf_fpath + '.tbi'):
                    cmdline = '{tabix} {vcf_fpath}'.format(**locals())
                    call(cnf, cmdline, exit_on_error=False)
                create_jbrowse_symlink(genome, bcbio_structure.project_name, sample.name, vcf_fpath + '.tbi')

        if sample.bam:
            bam_link = create_jbrowse_symlink(genome, bcbio_structure.project_name, sample.name, sample.bam)
            create_jbrowse_symlink(genome, bcbio_structure.project_name, sample.name, sample.bam + '.bai')
            bigwig_link = create_jbrowse_symlink(genome, bcbio_structure.project_name, sample.name,
                                                 splitext(sample.bam)[0] + '.bigwig')
            print_sample_tracks_info(sample.name, bcbio_structure.project_name,
                                     trunc_symlink(bam_link), trunc_symlink(bigwig_link),
                                     trunc_symlink(vcf_link), jbrowse_tracks_fpath)

def save_regions_to_bed(cnf, regions, bed_fpath, save_original_fields=False):
    if isfile(bed_fpath):
        if cnf.reuse_intermediate:
            verify_file(bed_fpath, is_critical=True)
            return bed_fpath
        else:
            os.remove(bed_fpath)

    with file_transaction(cnf.work_dir, bed_fpath) as tx_fpath:
        save_regions_to_bed_nocnf(regions, tx_fpath, save_original_fields)
    return bed_fpath

def main(args):
    if len(args) < 2:
        critical('Usage: ' + __file__ + ' InputRootDirectory OutputRootDirectory [Build=hg38]')
        sys.exit(1)

    inp_root = adjust_path(args[0])
    out_root = adjust_path(args[1])

    build = 'hg38'
    if len(args) >= 3:
        build = args[2]
    chain_fpath = chains[build.lower()]

    for inp_dirpath, subdirs, files in os.walk(inp_root):
        for fname in files:
            if fname == 'sample1-cn_mops.bed':
                pass
            if fname.endswith('.bed'):
                inp_fpath = adjust_path(join(inp_dirpath, fname))
                print inp_fpath + ': ' + str(count_bed_cols(inp_fpath)) + ' columns'

                out_dirpath = adjust_path(join(out_root, relpath(inp_dirpath, inp_root)))
                safe_mkdir(out_dirpath)
                out_fpath = adjust_path(join(out_dirpath, fname))
                unlifted_fpath = adjust_path(join(out_dirpath, fname + '.unlifted'))

                cmdline = ''
                with open(inp_fpath) as f:
                    fs = f.readline().split('\t')
                try:
                    int(fs[6])
                    int(fs[7])
                except:
                    info('Cutting ' + inp_fpath)
                    cmdline += 'cut -f1,2,3,4 "{inp_fpath}" > __cut; '

                cmdline += liftover_fpath + ' __cut {chain_fpath} "{out_fpath}" "{unlifted_fpath}"'
                cmdline = cmdline.format(**locals())
                info(cmdline)
                os.system(cmdline)

                verify_file(out_fpath)
                if isfile(unlifted_fpath):
                    if getsize(unlifted_fpath) <= 0:
                        os.remove(unlifted_fpath)
                    else:
                        err('Some records were unlifted and saved to ' + unlifted_fpath)

def determine_run_cnf(opts, is_wgs=False, is_targetseq=False):
    if opts.run_cnf:
        opts.run_cnf = adjust_path(opts.run_cnf)
    elif is_wgs:
        opts.run_cnf = defaults['run_cnf_wgs']
    elif is_targetseq:
        opts.run_cnf = defaults['run_cnf_deep_seq']
    else:
        opts.run_cnf = defaults['run_cnf_exome_seq']

    verify_file(opts.run_cnf, is_critical=True)
    debug('Using run configuration ' + opts.run_cnf)
    return opts.run_cnf

def process_one(cnf, output_dir, bam_fpath, features_bed, features_no_genes_bed):
    sample = TargQC_Sample(cnf.sample, output_dir, bed=cnf.bed, bam=cnf.bam)
    sample.l_fpath = cnf.l_fpath
    sample.r_fpath = cnf.r_fpath

    # if not sample.bam and sample.l_fpath and sample.r_fpath:
    #     sample.bam = proc_fastq(cnf, sample, verify_file(cnf.l_fpath), verify_file(cnf.r_fpath))

    info('Using alignment ' + sample.bam)

    if not bam_fpath:
        critical(sample.name + ': BAM file is required.')

    target_bed = verify_file(cnf.bed, is_critical=True) if cnf.bed else None
    bam_fpath = verify_file(sample.bam, is_critical=True)
    index_bam(cnf, bam_fpath)

    gene_keys_list = None
    if cnf.prep_bed is not False:
        info('Preparing the BED file.')
        features_bed, features_no_genes_bed, target_bed, seq2c_bed = prepare_beds(cnf, features_bed, target_bed)
        gene_keys_set, gene_keys_list, target_bed, features_bed, features_no_genes_bed = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)
    else:
        info('The BED file is ready, skipping preparation.')
        gene_keys_set, gene_keys_list, _, _, _ = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)

    picard_ins_size_hist(cnf, sample, bam_fpath, output_dir)

    avg_depth, gene_by_name_and_chrom, reports = make_targqc_reports(
        cnf, output_dir, sample, bam_fpath, features_bed, features_no_genes_bed, target_bed, gene_keys_list)

    # if cnf.extended:
    #     try:
    #         info('Generating flagged regions report...')
    #         flagged_report = generate_flagged_regions_report(cnf, cnf.output_dir, sample, avg_depth, gene_by_name_and_chrom)
    #         if not flagged_report:
    #             err('Flagged regions report was not generated')
    #             err()
    #     except:
    #         err(format_exc())

    return reports

def _make_tarqc_html_report(cnf, output_dir, samples, bed_fpath=None, tag_by_sample=None):
    header_storage = get_header_metric_storage(cnf.coverage_reports.depth_thresholds,
                                               is_wgs=bed_fpath is not None,
                                               padding=cnf.coverage_reports.padding)

    jsons_by_sample = {s.name: s.targetcov_json_fpath for s in samples if verify_file(s.targetcov_json_fpath)}
    htmls_by_sample = dict()  # {s.name: s.targetcov_html_fpath for s in samples if verify_file(s.targetcov_html_fpath)}

    if not jsons_by_sample:
        return None, None, None

    targqc_full_report = FullReport.construct_from_sample_report_jsons(
        samples, output_dir, jsons_by_sample, htmls_by_sample)

    for sample_report in targqc_full_report.sample_reports:
        if tag_by_sample:
            sample_report.set_project_tag(tag_by_sample[sample_report.sample.name])
        if verify_file(sample_report.sample.qualimap_html_fpath):
            url = relpath(sample_report.sample.qualimap_html_fpath, output_dir)
            r = sample_report.find_record(sample_report.records, 'Qualimap')
            if r:
                r.url = url
            else:
                sample_report.add_record(metric_name='Qualimap', value='Qualimap', url=url, silent=True)

    _run_multisample_qualimap(cnf, output_dir, samples, targqc_full_report)

    txt_fpath = targqc_full_report.save_txt(join(output_dir, BCBioStructure.targqc_name + '.txt'))
    tsv_fpath = targqc_full_report.save_tsv(join(output_dir, BCBioStructure.targqc_name + '.tsv'))
    html_fpath = targqc_full_report.save_html(cnf, join(output_dir, BCBioStructure.targqc_name + '.html'), 'TargQC')

    return txt_fpath, tsv_fpath, html_fpath

def _intersect_with_tricky_regions(cnf, selected_bed_fpath, sample):
    info()
    info('Detecting problematic regions for ' + sample)

    bed_filenames = [fn + '.bed.gz' for fn in tricky_regions_fnames_d.keys()]
    merged_bed_fpaths = [join(cnf.genome.tricky_regions, 'merged', bed_filename)
                         for bed_filename in bed_filenames]

    info('Intersecting BED ' + selected_bed_fpath + ' using BED files with tricky regions')
    intersection_fpath = join(cnf.work_dir,
                              splitext_plus(basename(selected_bed_fpath))[0] + '_tricky_vcf_bed.intersect')
    if not cnf.reuse_intermediate or not verify_file(intersection_fpath, silent=True, is_critical=False):
        bedtools = get_system_path(cnf, 'bedtools')
        cmdline = bedtools + ' intersect -header -a ' + selected_bed_fpath + ' -b ' + \
                  ' '.join(merged_bed_fpaths) + ' -wo -filenames'
        call(cnf, cmdline, output_fpath=intersection_fpath, exit_on_error=False)
    return intersection_fpath

def _get_gene_transcripts_id(cnf):
    genes_dict = dict()
    transcripts_dict = dict()

    if not cnf.genome.all_transcripts:
        critical('File with transcript and gene IDs for ' + cnf.genome.name +
                 ' was not found! Heatmaps cannot be created.')
    if not verify_file(cnf.genome.all_transcripts):
        critical('File with transcript and gene IDs for ' + cnf.genome.name + ' at ' +
                 cnf.genome.all_transcripts + ' was not found! Heatmaps cannot be created.')

    info('Getting transcript IDs and gene IDs from ' + cnf.genome.all_transcripts)

    with open_gzipsafe(cnf.genome.all_transcripts) as f:
        for i, l in enumerate(f):
            if l.startswith('#'):
                continue
            chrom, _, feature, start, end, _, strand, _, props_line = l.replace('\n', '').split('\t')
            if feature != 'transcript':
                continue
            try:
                _prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                                  for t in props_line.split(';') if t.strip())
            except ValueError:
                sys.stderr.write(format_exc())
                sys.stderr.write(l)

            gene_symbol = _rm_quotes(_prop_dict['gene_name'])
            gene_id = _rm_quotes(_prop_dict['gene_id'])
            transcript_id = _rm_quotes(_prop_dict['transcript_id'])
            # gene = Gene(gene_symbol, chrom=chrom, gene_id=gene_id, transcript_id=transcript_id)
            genes_dict[gene_id] = gene_symbol
            transcripts_dict[transcript_id] = gene_symbol

    return genes_dict, transcripts_dict

def leave_main_sample(cnf, vcf_fpath, samplename):
    index = get_sample_column_index(vcf_fpath, samplename)
    if index is None:
        return vcf_fpath

    # def _f1(rec):
    #     rec.samples = [sample_name]
    #     return rec
    #
    # info('Keeping SAMPLE only for the first sample (' + samplename + ')')
    # vcf_fpath = iterate_vcf(cnf, vcf_fpath, _f1, suffix=sample_name)
    # out_fpath = extract_sample(cnf, vcf_fpath, sample_name)
    # info()

    def _f(line, i):
        if line and (line.startswith('#CHROM') or line[0] != '#'):
            ts = line.split('\t')
            return '\t'.join(ts[:9] + [ts[9 + index]])
        return line

    vcf_fpath = iterate_file(cnf, vcf_fpath, _f, suffix='1sm')
    if not verify_file(vcf_fpath):
        err('Error: leave_first_sample didn\'t generate an output file.')
        return None
    return vcf_fpath

def join_vcf2txt_results(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath):
    info('WGS; running vcf2txt separately for each sample to save memory.')
    vcf2txt_outputs_by_vcf_fpath = OrderedDict()
    for vcf_fpath in vcf_fpath_by_sample.values():
        sample_output_fpath = add_suffix(vcf2txt_out_fpath, splitext(basename(vcf_fpath))[0])
        vcf2txt_outputs_by_vcf_fpath[vcf_fpath] = sample_output_fpath
    info()

    info('Joining vcf2txt outputs... (' + str(len(vcf2txt_outputs_by_vcf_fpath)) +
         ' out of ' + str(len(vcf_fpath_by_sample)) + ' successful), ' +
         'writing to ' + vcf2txt_out_fpath)
    with file_transaction(cnf.work_dir, vcf2txt_out_fpath) as tx:
        with open(tx, 'w') as out:
            for i, (vcf_fpath, sample_output_fpath) in enumerate(vcf2txt_outputs_by_vcf_fpath.items()):
                info('  Reading ' + sample_output_fpath)
                with open(sample_output_fpath) as inp:
                    for j, l in enumerate(inp):
                        if j == 0 and i != 0:
                            continue
                        out.write(l)

    if verify_file(vcf2txt_out_fpath):
        info('Saved ' + vcf2txt_out_fpath)
        return vcf2txt_out_fpath
    else:
        return None

def verify_vcf(vcf_fpath, silent=False, is_critical=False):
    if not verify_file(vcf_fpath, silent=silent, is_critical=is_critical):
        return None
    debug('File ' + vcf_fpath + ' exists and not empty')
    vcf = open_gzipsafe(vcf_fpath)
    debug('File ' + vcf_fpath + ' opened')
    l = next(vcf, None)
    if l is None:
        (critical if is_critical else err)('Error: cannot read the VCF file ' + vcf_fpath)
        return None
    if not l.startswith('##fileformat=VCF'):
        (critical if is_critical else err)('Error: VCF must start with ##fileformat=VCF ' + vcf_fpath)
        return None

    try:
        reader = vcf_parser.Reader(vcf)
    except:
        err('Error: cannot open the VCF file ' + vcf_fpath)
        if is_critical:
            raise
    else:
        debug('File ' + vcf_fpath + ' opened as VCF')
        try:
            rec = next(reader)
        except IndexError:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('IndexError parsing VCF file ' + vcf_fpath)
            if is_critical:
                raise
        except ValueError:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('ValueError parsing VCF file ' + vcf_fpath)
            if is_critical:
                raise
        except StopIteration:
            debug('No records in the VCF file ' + vcf_fpath)
            if not silent:
                warn('VCF file ' + vcf_fpath + ' has no records.')
            return vcf_fpath
        except:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('Other error parsing VCF file ' + vcf_fpath)
            if is_critical:
                raise
        else:
            debug('A record was read from the VCF file ' + vcf_fpath)
            return vcf_fpath

        # f = open_gzipsafe(output_fpath)
        # l = f.readline()
        # if 'Cannot allocate memory' in l:
        #     f.close()
        #     f = open_gzipsafe(output_fpath)
        #     contents = f.read()
        #     if not silent:
        #         if is_critical:
        #             critical('SnpSift failed with memory issue:\n' + contents)
        #         else:
        #             err('SnpSift failed with memory issue:\n' + contents)
        #             return None
        #     f.close()
        #     return None
        # return output_fpath
    finally:
        vcf.close()

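# A minimal sketch of how verify_vcf is typically used (hypothetical path): it returns the path
# if the file exists, starts with ##fileformat=VCF and is parseable (possibly with zero records),
# and None otherwise.
#
#     vcf_fpath = verify_vcf('/data/sample-vardict.vcf.gz', silent=True)
#     if vcf_fpath is None:
#         err('VCF is missing or malformed')
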
def _correct_qualimap_genome_results(cnf, samples):
    """ Fix the java.lang.Double.parseDouble error on entries like "6,082.49". """
    for s in samples:
        if verify_file(s.qualimap_genome_results_fpath):
            correction_is_needed = False
            with open(s.qualimap_genome_results_fpath, 'r') as f:
                content = f.readlines()
                metrics_started = False
                for line in content:
                    if ">> Reference" in line:
                        metrics_started = True
                    if metrics_started:
                        if line.find(',') != -1:
                            correction_is_needed = True
                            break
            if correction_is_needed:
                with open(s.qualimap_genome_results_fpath, 'w') as f:
                    metrics_started = False
                    for line in content:
                        if ">> Reference" in line:
                            metrics_started = True
                        if metrics_started:
                            if line.find(',') != -1:
                                line = line.replace(',', '')
                        f.write(line)

def split_bams(cnf, samples, vcf_fpath):
    variants_by_chrom = parse_variants(vcf_fpath)
    temp_output_dirpath = join(cnf.work_dir, 'temp')
    safe_mkdir(temp_output_dirpath)
    info('Splitting BAM files...')
    for chrom, variants in variants_by_chrom.iteritems():
        chr_lengths = get_chr_lengths_from_seq(cnf.genome.seq)
        chr_lengths_dict = dict((c, l) for (c, l) in chr_lengths)
        chr_length = chr_lengths_dict[chrom]
        transcripts = get_transcipts_with_exons_from_features(
            verify_file(cnf.features, is_critical=True), cur_chrom=chrom)
        bams_created_before = []
        bams_by_sample = defaultdict(list)
        info('Extracting variant coverage for all samples for ' + chrom + ', ' +
             str(len(variants)) + ' variants')
        for variant in variants:
            variant_bams_by_sample = extract_variant_from_bams(
                cnf, temp_output_dirpath, transcripts, chr_length, samples, chrom, variant, bams_created_before)
            bams_created_before.extend(variant_bams_by_sample.values())
            for sample_name, bam_fpath in variant_bams_by_sample.iteritems():
                bams_by_sample[sample_name].append(bam_fpath)
        chrom = chrom.replace('chr', '')
        info()
        for sample_name, bam_fpaths in bams_by_sample.iteritems():
            info('Making combined BAMs for chr' + chrom + ' for sample ' + sample_name)
            bam_fname = '{chrom}-{sample_name}.bam'.format(**locals())
            temp_combined_bam_fpath = join(temp_output_dirpath, bam_fname)
            combined_bam_fpath = join(cnf.output_dir, bam_fname)
            generate_combined_bam(cnf, bam_fpaths, temp_combined_bam_fpath, combined_bam_fpath)
        info()
    info('Removing BAM files...')
    shutil.rmtree(temp_output_dirpath, ignore_errors=True)

def make_vcf2txt_cmdl_params(cnf, vcf_fpath_by_sample):
    c = cnf.variant_filtering
    min_freq = c.act_min_freq

    cmdline = \
        '-r 1.0 -R 1.0 -P {c.filt_p_mean} -Q {c.filt_q_mean} -D {c.filt_depth} -V {c.min_vd} ' \
        '-f {min_freq} -p {c.min_p_mean} -q {c.min_q_mean} ' \
        '-M {c.min_mq} -o {c.signal_noise} -L'.format(**locals())

    if c.bias:
        cmdline += ' -b '

    dbsnp_multi_mafs = cnf.genome.dbsnp_multi_mafs
    if dbsnp_multi_mafs and verify_file(dbsnp_multi_mafs):
        cmdline += ' -A ' + dbsnp_multi_mafs
    else:
        cmdline += ' -A ""'

    if c.amplicon_based:
        cmdline += ' -a '

    # corr_vcf_fpath_by_sample = dict()
    # for sn, vcf_fpath in vcf_fpath_by_sample.items():
    #     ungz = vcf_fpath
    #     if vcf_fpath.endswith('.gz'):
    #         ungz = splitext(vcf_fpath)[0]
    #         call(cnf, 'gunzip ' + vcf_fpath, output_fpath=ungz)
    #     corr_vcf_fpath_by_sample[sn] = ungz

    cmdline += ' ' + ' '.join(vcf_fpath_by_sample.values())
    return cmdline

def _get_depth_for_each_variant(cnf, var_by_site, clipped_gz_vcf_fpath, bed_fpath, bam_fpath):
    # http://www.1000genomes.org/faq/what-depth-coverage-your-phase1-variants
    # bedtools intersect -a oncomine.vcf -b Exons.az_key.bed -header > oncomine.az_key.vcf
    # /opt/az/local/tabix/tabix-0.2.6/bgzip oncomine.az_key.vcf
    # /opt/az/local/tabix/tabix-0.2.6/tabix -h -p vcf oncomine.az_key.vcf.gz
    # samtools view -b TRF004223.sorted.bam -L Exons.az_key.bed | bedtools genomecov -ibam stdin -bg > coverage.bg
    # bedtools intersect -a oncomine.az_key.vcf.gz -b coverage.bg -wa | cut -f1,2,4,5,8,11,12,13,14 > oncomine.az_key.depth_numbers.vcf

    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    bedtools = get_system_path(cnf, 'bedtools')

    info()
    info('Depth of coverage for regions in BED ' + bed_fpath)
    cov_bg = join(cnf.work_dir, 'coverage.bg')
    cmdline = '{sambamba} view -f bam -t {cnf.threads} -L {bed_fpath} {bam_fpath} | ' \
              '{bedtools} genomecov -ibam stdin -bg'.format(**locals())
    call(cnf, cmdline, output_fpath=cov_bg, exit_on_error=False)

    info()
    info('Intersecting depth regions with VCF ' + clipped_gz_vcf_fpath)
    vcf_depth_numbers_fpath = join(cnf.work_dir, 'vcf_bg.intersect')
    if not cnf.reuse_intermediate or not verify_file(vcf_depth_numbers_fpath, silent=True, is_critical=False):
        cmdline = '{bedtools} intersect -a {clipped_gz_vcf_fpath} -b {cov_bg} -wao'.format(**locals())
        res = call(cnf, cmdline, output_fpath=vcf_depth_numbers_fpath, exit_on_error=False)
        # if res != oncomine_depth_numbers_fpath:
        #     info()
        #     info('Trying with uncompressed VCF')
        #     cmdline = 'gunzip {vcf_fpath} -c | {bedtools} intersect -a - -b {cov_bg} -wao | cut -f1,2,4,5,8,11,12,13,14,15'.format(**locals())
        #     call(cnf, cmdline, output_fpath=oncomine_depth_numbers_fpath)

    depths_per_var = defaultdict(list)
    with open(vcf_depth_numbers_fpath) as f:
        for l in f:
            # 1,2,4,5,8,11,12,13,14,15,16,17,18,19,20
            # c,p,r,a,f,ch,st,en,ge,ex,st,ft,bt,de,ov
            fs = l.replace('\n', '').split('\t')
            chrom, pos, _, ref, alt = fs[:5]
            depth, overlap = fs[-2:]
            var = var_by_site.get((chrom, pos, ref, alt))
            if var and depth != '.':
                depth, overlap = int(depth), int(overlap)
                for i in range(overlap):
                    depths_per_var[(chrom, pos, ref, alt)].append(depth)

    # Get the average depth of coverage for each variant (exactly for those parts that were in the BED)
    depth_by_var = {var: (sum(depths) / len(depths)) if len(depths) != 0 else None
                    for var, depths in depths_per_var.iteritems()}

    return depth_by_var