def main():
    """Command-line entry point: settle on a BED file and launch Seq2C.

    Unless BED preparation is switched off (``cnf.prep_bed`` is False), the
    BED falls back to ``cnf.genome.cds`` when none was given, is passed
    through ``prepare_beds`` when ``count_bed_cols`` reports fewer than 4,
    and is copied into the output directory under its original file name.
    The final BED is then verified and handed to ``run_seq2c``.
    """
    cnf, samples, bed_fpath, output_dir = proc_args(sys.argv)
    info('Processing ' + str(len(samples)) + ' samples')

    if cnf.prep_bed is not False:
        if not bed_fpath:
            info('No input BED is specified, using CDS instead from ' + str(cnf.genome.cds))
            bed_fpath = verify_bed(cnf.genome.cds, 'CDS bed file for ' + cnf.genome.name)

        # The copy keeps the file name the BED had *before* any preparation,
        # because prepare_beds may hand back a different path.
        saved_bed_fpath = join(output_dir, basename(bed_fpath))

        if count_bed_cols(bed_fpath) < 4:
            check_genome_resources(cnf)
            prepared_beds = prepare_beds(cnf, None, None, bed_fpath)
            _, _, _, bed_fpath = prepared_beds

        try:
            copyfile(bed_fpath, saved_bed_fpath)
        except OSError:
            # A failed copy is logged, but the run carries on.
            err(format_exc())
            info()
        else:
            info('Seq2C bed file is saved in ' + saved_bed_fpath)

    bed_fpath = verify_bed(bed_fpath, is_critical=True, description='Input BED file')
    info('Using target ' + bed_fpath)

    run_seq2c(cnf, output_dir, samples, bed_fpath, cnf.is_wgs)
def proc_args(argv):
    """Parse the Seq2C command line and build the run configuration.

    Positional arguments are either a single non-``.bam`` file (read with
    ``read_samples`` as a sample-to-BAM table) or anything else, which is
    passed to ``find_bams``.

    Args:
        argv: Full argument vector including the program name (normally
            ``sys.argv``). ``None`` falls back to ``sys.argv``.

    Returns:
        Tuple ``(cnf, samples, target_bed, output_dir)``. ``target_bed`` is
        the result of ``verify_bed`` on ``cnf.bed``, or None when no BED is
        configured. ``samples`` is sorted by ``key_to_sort()``.
    """
    if argv is None:
        argv = sys.argv
    info(' '.join(argv))
    info()

    # NOTE(review): the first sentence talks about "target QC reports" although
    # this parser configures Seq2C - looks copied from another script; confirm.
    description = (
        'This script generates target QC reports for each BAM provided as an input. '
        'Usage: ' + basename(__file__) +
        ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir')
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls', help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    # Parse the vector we were given rather than silently re-reading sys.argv.
    opts, args = parser.parse_args(argv[1:])
    logger.is_debug = opts.debug

    if not args:
        parser.print_usage()
        sys.exit(1)

    if len(args) == 1 and not args[0].endswith('.bam'):
        samples_fpath = verify_file(args[0], is_critical=True, description='Input sample2bam.tsv')
        sample_names, bam_fpaths = read_samples(samples_fpath)
        bam_by_sample = OrderedDict(zip(sample_names, bam_fpaths))
    else:
        bam_by_sample = find_bams(args)

    # Without --bed the run is configured as WGS.
    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
        for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info(' ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided in sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir
def _verify_input_file(_key):
    """Normalise the path stored under ``_key`` in ``cnf`` and validate it.

    The path is replaced in ``cnf`` by its ``adjust_path`` result, then
    checked with ``verify_file``; keys containing 'bam' are additionally
    checked with ``verify_bam`` and keys containing 'bed' with ``verify_bed``.
    Checks stop at the first failure.

    Returns:
        True when every applicable check passes, False otherwise.
    """
    cnf[_key] = adjust_path(cnf[_key])
    fpath = cnf[_key]

    # Short-circuit evaluation keeps the checks in order: file, then BAM, then BED.
    all_checks_pass = (
        verify_file(fpath, _key)
        and ('bam' not in _key or verify_bam(fpath))
        and ('bed' not in _key or verify_bed(fpath))
    )
    return bool(all_checks_pass)
def run_seq2c_bcbio_structure(cnf, bcbio_structure):
    """Pick the Seq2C BED for a bcbio project and run Seq2C on its samples.

    BED selection:
      * ``cnf.prep_bed`` is False: ``cnf.bed`` is verified and used as is;
      * otherwise, if ``cnf.bed`` or ``bcbio_structure.bed`` is set, the 4th
        item returned by ``prepare_beds`` is used;
      * otherwise ``cnf.genome.cds`` is verified and used.

    Returns:
        A one-element list holding the value returned by ``run_seq2c``
        (the list is returned even when that value is falsy).
    """
    step_greetings('Coverage statistics for each gene for all samples')

    if cnf.prep_bed is False:
        seq2c_bed = verify_bed(cnf.bed)
    else:
        info('Preparing BED files')
        features_bed_fpath = cnf.features or cnf.genome.features  # only for annotation
        if not (cnf.bed or bcbio_structure.bed):
            seq2c_bed = verify_bed(cnf.genome.cds)
        else:
            prepared_beds = prepare_beds(
                cnf,
                features_bed=features_bed_fpath,
                target_bed=bcbio_structure.bed,
                seq2c_bed=bcbio_structure.sv_bed)
            _, _, _, seq2c_bed = prepared_beds

    info('Calculating normalized coverages for CNV...')
    cnv_dirpath = join(bcbio_structure.date_dirpath, BCBioStructure.cnv_dir)
    cnv_report_fpath = run_seq2c(
        cnf, cnv_dirpath, bcbio_structure.samples, seq2c_bed, is_wgs=cnf.is_wgs)

    # NOTE: per-sample plotting of the Seq2C report (draw_seq2c_plot) is
    # currently disabled in this function.

    info()
    info('*' * 70)
    if cnv_report_fpath:
        info('Seq2C:')
        info(' ' + cnv_report_fpath)

    return [cnv_report_fpath]
def get_bed_targqc_inputs(cnf, bed_fpath=None):
    """Collect the BED, features and custom-genes paths and log each one.

    Args:
        cnf: Configuration; ``features``, ``genome.features`` and ``genes``
            are read from it.
        bed_fpath: Optional BED path. When truthy it is replaced by the
            result of a critical ``verify_bed`` check.

    Returns:
        Tuple ``(bed_fpath, features_bed_fpath, genes_fpath)``;
        ``genes_fpath`` is None when ``cnf.genes`` is not set.
    """
    if bed_fpath:
        bed_fpath = verify_bed(bed_fpath, description='Input BED file', is_critical=True)
        info('Using amplicons/capture panel ' + bed_fpath)

    # An explicitly configured features file wins over the genome default.
    features_source = cnf.features or cnf.genome.features
    features_bed_fpath = adjust_path(features_source)
    if features_bed_fpath:
        info('Features: ' + features_bed_fpath)

    custom_genes = cnf.genes
    if custom_genes:
        genes_fpath = adjust_path(custom_genes)
        info('Custom genes list: ' + genes_fpath)
    else:
        genes_fpath = None

    return bed_fpath, features_bed_fpath, genes_fpath
def main():
    """Command-line entry point: pass the input BED to ``sort_bed``.

    Expects one positional argument (the input BED) plus ``-o`` for the
    output path and ``-g`` for the genome. Prints help to stderr and exits
    with status 1 when the positional argument is missing; reports the usage
    through ``critical`` when no output path was given.
    """
    usage_text = 'Usage: ' + basename(__file__) + ' -o Output_BED_file -g hg19 Input_BED_file'
    parser = OptionParser(usage=usage_text)
    parser.add_option('-o', '--output-bed', dest='output_fpath')
    parser.add_option('-g', '--genome', dest='genome')
    opts, args = parser.parse_args(sys.argv[1:])

    if not args:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})
    check_genome_resources(cnf)

    if not cnf.output_fpath:
        critical(parser.usage)

    input_bed_fpath = verify_bed(args[0], is_critical=True)
    output_bed_fpath = adjust_path(cnf.output_fpath)
    sort_bed(cnf, input_bed_fpath, output_bed_fpath)
def main():
    """Command-line entry point: run TargQC on the BAM files given as arguments.

    Positional arguments are BAM paths. They are made absolute, de-duplicated
    (keeping the command-line order) and checked with ``verify_bam``; the run
    is configured, the BED / features / genes inputs are resolved, and
    ``run_targqc`` is called. When it returns a report path, that path is
    sent with ``send_email``.
    """
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input.'
    parser = OptionParser(description=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    parser.add_option('--work-dir', dest='work_dir', metavar='DIR')
    parser.add_option('--log-dir', dest='log_dir')
    parser.add_option('--only-summary', dest='only_summary', action='store_true')
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'targetqc'))
    parser.add_option('--reannotate', dest='reannotate', action='store_true', default=False, help='re-annotate BED file with gene names')
    parser.add_option('--dedup', dest='dedup', action='store_true', default=False, help='count duplicates in coverage metrics')
    parser.add_option('--bed', dest='bed', help='BED file to run targetSeq and Seq2C analysis on.')
    parser.add_option(
        '--exons', '--exome', '--features', dest='features',
        help='Annotated CDS/Exon/Gene/Transcripts BED file to make targetSeq exon/amplicon regions reports.')

    opts, args = parser.parse_args()
    logger.is_debug = opts.debug

    if not args:
        critical('No BAMs provided to input.')

    # Drop duplicate paths but keep the order from the command line;
    # list(set(...)) would hand the BAMs over in arbitrary order.
    bam_fpaths = []
    seen_fpaths = set()
    for arg in args:
        fpath = abspath(arg)
        if fpath not in seen_fpaths:
            seen_fpaths.add(fpath)
            bam_fpaths.append(fpath)

    bad_bam_fpaths = [fpath for fpath in bam_fpaths if not verify_bam(fpath)]
    if bad_bam_fpaths:
        critical('BAM files cannot be found, empty or not BAMs: ' + ', '.join(bad_bam_fpaths))

    # Without --bed the run is configured as WGS.
    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'TargQC'
    set_up_dirs(cnf)

    check_genome_resources(cnf)

    # NOTE(review): the BED is verified as critical here even though is_wgs is
    # derived above from the absence of --bed - confirm how a BED-less run is
    # expected to get past this point.
    verify_bed(cnf.bed, is_critical=True)
    bed_fpath = adjust_path(cnf.bed)
    info('Using amplicons/capture panel ' + bed_fpath)

    # An explicitly given features file wins over the genome default.
    features_bed_fpath = adjust_path(cnf.features or cnf.genome.features)
    info('Features: ' + features_bed_fpath)

    genes_fpath = None
    if cnf.genes:
        genes_fpath = adjust_path(cnf.genes)
        info('Custom genes list: ' + genes_fpath)

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided in sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    info('*' * 70)
    info()

    targqc_html_fpath = run_targqc(
        cnf, cnf.output_dir, bam_fpaths, bed_fpath, features_bed_fpath, genes_fpath)
    if targqc_html_fpath:
        send_email(
            cnf,
            'TargQC report for ' + cnf.project_name + ':\n ' + targqc_html_fpath)
def main():
    """Command-line entry point: annotate a BED file against reference features.

    The input BED is taken from ``sys.argv[1]``. Its regions (first three
    columns) are annotated with ``_annotate`` against the reference features
    one feature type at a time, in the order CDS, Exon, Transcript, Gene.
    After each pass, the regions reported as off-target become the input of
    the next pass. Whatever is still off-target at the end is appended, and
    the result is written to ``cnf.output_file`` sorted by ``get_key()``.
    """
    # The input BED is read positionally before any option parsing, so make
    # sure it is there. (The previous check, len(sys.argv[1]) < 0, could never
    # be true and raised IndexError when the argument was missing.)
    if len(sys.argv) < 2:
        critical('Usage: ' + __file__ + ' Input_BED_file -g hg19 -o Annotated_BED_file')
    input_bed_fpath = verify_bed(sys.argv[1], is_critical=True, description='Input BED file for ' + __file__)

    cnf = read_opts_and_cnfs(
        description='Annotating BED file based on reference features annotations.',
        extra_opts=[
            (['--reference'], dict(dest='reference')),
        ],
        required_keys=['output_file'],
        file_keys=['reference'],
        key_for_sample_name=None,
        fpath_for_sample_name=input_bed_fpath,
        main_output_is_file=True)

    check_system_resources(cnf)
    check_genome_resources(cnf)

    chr_order = get_chrom_order(cnf)

    features_fpath = adjust_path(cnf.genome.bed_annotation_features)
    if not verify_bed(features_fpath, 'Annotated reference BED file'):
        critical('Annotated reference is required')

    # Only chrom/start/end of the input are kept for annotation.
    bed = BedTool(input_bed_fpath).cut([0, 1, 2])

    info()

    annotated = None
    off_targets = None

    for feature in ['CDS', 'Exon', 'Transcript', 'Gene']:
        if bed:
            info('Extracting ' + feature + ' features from ' + features_fpath)
            features_bed = BedTool(features_fpath).filter(
                lambda x: x[6] == feature)

            info('Annotating based on ' + feature)
            new_annotated, off_targets = _annotate(cnf, bed, features_bed, chr_order)
            if not annotated:
                annotated = new_annotated
                for a in annotated:
                    a.feature = feature
            else:
                annotated.extend(new_annotated)

            # NOTE(review): when off_targets is empty, `bed` is left unchanged
            # and the next feature type is run on the same regions again -
            # confirm whether the loop should stop here instead.
            if off_targets:
                bed = BedTool([(r.chrom, r.start, r.end) for r in off_targets])
                info()

    if annotated is None:
        # No annotation pass ran (`bed` was falsy from the start); write an
        # empty output instead of failing in sorted(None).
        annotated = []
    elif off_targets is not None:
        annotated.extend(off_targets)

    info()
    info('Saving annotated regions to ' + str(cnf.output_file))
    with open(cnf.output_file, 'w') as out:
        for region in sorted(annotated, key=lambda r: r.get_key()):
            out.write(str(region))

    info('Done.')