def main():
    """Command-line entry point: validate the options and call ``split_bams``.

    ``-o``, ``--vcf``, ``--chr``, ``--samples`` and ``--bams`` are all
    required.  ``--samples`` and ``--bams`` are comma-separated lists that are
    paired up positionally, so both lists must have the same length.
    ``--features`` falls back to ``cnf.genome.features`` when it is omitted.
    """
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(usage='Usage: ' + basename(__file__) + ' --chr chr --vcf VCF_file --samples Sample1,Sample2 '
                                                                 '--bams BAM_file1,BAM_file2 -o Output_directory '
                                                                 '--features BED_file')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir')
    parser.add_option('--samples', dest='sample_names')
    parser.add_option('--bams', dest='bams')
    parser.add_option('--vcf', dest='vcf_fpath')
    parser.add_option('--chr', dest='chrom')
    parser.add_option('--features', dest='features', help='BED file with real CDS/Exon/Gene/Transcript regions with '
                                                          'annotations (default "features" is in system_config)')
    (opts, args) = parser.parse_args(sys.argv[1:])

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})
    cnf.verbose = False

    # --samples and --bams are split below, so they must be validated together
    # with the other mandatory options; otherwise a missing one surfaces as an
    # AttributeError on None instead of the usage message.
    # NOTE(review): critical() is relied on to abort the script here.
    required = (cnf.output_dir, cnf.vcf_fpath, cnf.chrom, cnf.sample_names, cnf.bams)
    if not all(required):
        critical(parser.usage)

    sample_names = cnf.sample_names.split(',')
    bam_fpaths = cnf.bams.split(',')
    # zip() would silently drop the unpaired tail, so reject mismatched lists.
    if len(sample_names) != len(bam_fpaths):
        critical('The number of --samples (' + str(len(sample_names)) + ') does not match '
                 'the number of --bams (' + str(len(bam_fpaths)) + ')')

    cnf.features = cnf.features or cnf.genome.features
    samples = [BaseSample(sample_name, None, bam=bam)
               for sample_name, bam in zip(sample_names, bam_fpaths)]
    split_bams(cnf, samples, cnf.vcf_fpath)
    info('Done.')
Exemplo n.º 2
0
def proc_args(argv):
    """Parse the Seq2C command line and prepare the run configuration.

    Positional arguments are either a single sample-to-BAM table (any single
    argument not ending in ``.bam``) or a list of inputs handed to
    ``find_bams``.

    NOTE(review): ``argv`` is accepted for interface compatibility but is not
    read; both the logging and ``parse_args()`` use ``sys.argv``.

    Returns:
        A tuple ``(cnf, samples, target_bed, output_dir)`` where ``samples``
        is the list of ``TargQC_Sample`` objects sorted by ``key_to_sort()``
        and ``target_bed`` is ``None`` when ``--bed`` was not given.
    """
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls', help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if not args:
        parser.print_usage()
        sys.exit(1)
    if len(args) == 1 and not args[0].endswith('.bam'):
        # A single non-BAM argument is treated as the sample2bam table.
        sample_names, bam_fpaths = read_samples(verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict(zip(sample_names, bam_fpaths))
    else:
        bam_by_sample = find_bams(args)

    # Without a BED file the run configuration is resolved as WGS.
    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
        for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info('  ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided in sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir
Exemplo n.º 3
0
def main():
    """Command-line entry point: run ``run_fastq`` on one pair of read files.

    Reads ``-1``/``-2`` (read files), ``-o`` (output directory), ``--sample``
    and ``--downsample-to``, runs ``run_fastq`` inside the ``workdir(cnf)``
    context and logs the resulting directory.
    """
    info(' '.join(sys.argv))
    info()

    downsample_help = ('Downsample reads to avoid excessive processing times with large files. '
                       'Default is 1 million. Set to 0 to turn off downsampling.')

    parser = OptionParser(description='This script runs preprocessing.')
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    parser.add_option('--downsample-to', dest='downsample_to', default=None, type='int', help=downsample_help)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    opts, _ = parser.parse_args()
    logger.is_debug = opts.debug

    sys_cnf = determine_sys_cnf(opts)
    cnf = Config(opts.__dict__, sys_cnf, determine_run_cnf(opts))

    left_reads_fpath = verify_file(opts.left_reads_fpath, is_critical=True)
    right_reads_fpath = verify_file(opts.right_reads_fpath, is_critical=True)

    if opts.output_dir:
        output_dirpath = adjust_path(opts.output_dir)
    else:
        output_dirpath = critical('Please, specify output directory with -o')
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)

    with workdir(cnf):
        # Fall back to a name derived from the two read files.
        sample_name = cnf.sample_name or _get_sample_name(left_reads_fpath, right_reads_fpath)
        results_dirpath = run_fastq(
            cnf, sample_name, left_reads_fpath, right_reads_fpath, output_dirpath,
            downsample_to=cnf.downsample_to)

    verify_dir(results_dirpath, is_critical=True)
    info()
    info('*' * 70)
    info('Fastqc results:')
    info('  ' + results_dirpath)
def main():
    """Command-line entry point: run ``scripts/pre/fastqc.py`` on a remote host over SSH.

    Validates ``-1``/``-2``/``-o``, maps the paths with ``_proc_path``, builds
    the remote command line (rewriting the Clarity paths to the Waltham ones)
    and forwards the remote output line by line through ``err``.

    Exits with status 1 after printing the usage when any of ``-1``, ``-2`` or
    ``-o`` is missing.
    """
    info(' '.join(sys.argv))
    info()

    description = 'This script runs preprocessing.'

    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    parser.add_option('--downsample-to', dest='downsample_to', default=None, type='int',
        help='Downsample reads to avoid excessive processing times with large files. '
            'Default is 1 million. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()

    # Stop here when a mandatory option is missing; previously the usage was
    # printed but execution continued with None paths.
    if not opts.left_reads_fpath or not opts.right_reads_fpath or not opts.output_dir:
        parser.print_usage()
        sys.exit(1)

    # Non-critical checks: the files are only reported, not required locally.
    verify_file(opts.left_reads_fpath, is_critical=False)
    left_reads_fpath = adjust_path(opts.left_reads_fpath)
    verify_file(opts.right_reads_fpath, is_critical=False)
    right_reads_fpath = adjust_path(opts.right_reads_fpath)
    output_dirpath = adjust_path(opts.output_dir)
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)

    left_reads_fpath, right_reads_fpath, output_dirpath =\
        map(_proc_path, [left_reads_fpath, right_reads_fpath, output_dirpath])

    fastqc_py = get_script_cmdline(None, 'python', 'scripts/pre/fastqc.py')
    fastqc_py = fastqc_py.replace(REPORTING_SUITE_PATH_CLARITY, REPORTING_SUITE_PATH_WALTHAM)
    fastqc_py = fastqc_py.replace(PYTHON_PATH_CLARITY, PYTHON_PATH_WALTHAM)

    # NOTE(review): paths and the sample name are interpolated unquoted into a
    # remote shell command; consider quoting them if they can contain spaces
    # or shell metacharacters.
    cmdl = '{fastqc_py} -1 {left_reads_fpath} -2 {right_reads_fpath} -o {output_dirpath}'.format(
        fastqc_py=fastqc_py,
        left_reads_fpath=left_reads_fpath,
        right_reads_fpath=right_reads_fpath,
        output_dirpath=output_dirpath)
    if opts.sample_name:
        cmdl += ' --sample ' + str(opts.sample_name)
    if opts.downsample_to:
        cmdl += ' --downsample-to ' + str(int(opts.downsample_to))
    cmdl += ' 2>&1'
    info(cmdl)

    ssh = connect_to_server(server_url='blue.usbod.astrazeneca.net', username='******', password='******')
    try:
        stdin, stdout, stderr = ssh.exec_command(cmdl)
        # stderr is merged into stdout by the trailing "2>&1".
        for line in stdout:
            err(line, ending='')
        info()
    finally:
        # Always release the connection, even if the remote run fails.
        ssh.close()
Exemplo n.º 5
0
def get_args():
    """Parse the Circos-plot command line and return the resulting ``Config``.

    The options are declared in a table and registered in order after the
    shared options added by
    ``add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug``.
    """
    parser = OptionParser(description=(
        'Plots a Circos plot given vardict variant file (with all dbSNP SNPs, not the PASS one), '
        'Seq2C CNV calls and Manta SVs.'))
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)

    option_specs = [
        (('--bed',), dict(dest='bed_fpath', help='Path to BED file')),
        (('-v', '--mutations'), dict(dest='mutations_fpath', help='Path to VarDict.txt file')),
        (('-c', '--seq2c'), dict(dest='seq2c_tsv_fpath', help='Path to seq2c copy number file')),
        (('--sv',), dict(dest='sv_fpath', help='Path to Manta SV call vcf.gz file')),
        (('-s', '--sample'), dict(dest='sample', help='Identifier of sample in VarDict and Seq2c files')),
        (('-o', '--output-dir'), dict(dest='output_dir', default="./",
                                      help='Output directory. Defaults to ./')),
    ]
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)

    opts, _ = parser.parse_args()
    # The run config is resolved before the system config, as in the original order.
    run_cnf = determine_run_cnf(opts)
    sys_cnf = determine_sys_cnf(opts)
    return Config(opts.__dict__, sys_cnf, run_cnf)
def main():
    """Command-line entry point: convert VarDict TXT output to VCF per sample.

    Builds a ``BCBioStructure`` for every project returned by
    ``process_post_bcbio_args`` and calls
    ``convert_vardict_txts_to_bcbio_vcfs`` for each sample whose phenotype is
    not ``'normal'``.  Prints the help to stderr and exits with status 1 when
    no project directory was given.
    """
    info(' '.join(sys.argv))
    info()

    usage = ('Usage: ' + basename(__file__) +
             ' [-o Output_directory -c Var_caller_name] Project_directory')
    parser = OptionParser(description='This script converts Vardict TXT file to VCF.', usage=usage)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('--log-dir', dest='log_dir', default='-')
    parser.add_option('-c', '--caller', dest='caller_name', default='vardict')
    parser.add_option('-o', dest='output_dir', help='Output directory.')

    (cnf, bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths,
     tags, is_wgs_in_bcbio, is_rnaseq) = process_post_bcbio_args(parser)

    if not bcbio_project_dirpaths:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    bcbio_structures = [
        BCBioStructure(cnf, project_dirpath, project_cnf, project_final_dirpath)
        for project_dirpath, project_cnf, project_final_dirpath
        in zip(bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths)]

    cnf.work_dir = cnf.work_dir or adjust_path(join(cnf.output_dir, 'work'))
    safe_mkdir(cnf.work_dir)

    info('')
    info('*' * 70)
    for structure in bcbio_structures:
        for sample in structure.samples:
            # Normal samples are skipped.
            if sample.phenotype == 'normal':
                continue
            convert_vardict_txts_to_bcbio_vcfs(cnf, structure, sample)
Exemplo n.º 7
0
def proc_opts():
    """Parse the preprocessing command line and set up work/log directories.

    Positional arguments are fastq file paths; those for which
    ``verify_file`` returns a falsy value are dropped.  When no work
    directory is configured, a timestamped one is created under
    ``<output_dir>/work`` and a ``latest`` symlink is pointed at it (or
    ``latest`` itself is reused when ``cnf.reuse_intermediate`` is set).

    Returns:
        A tuple ``(cnf, output_dir, fastq_fpaths)``.
    """
    parser = OptionParser()
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('--expose-only', dest='expose_to_ngs_server_only', action='store_true',
                      default=False, help='Only add project to the webserver')
    parser.add_option('--no-expose', dest='expose', action='store_false',
                      default=True, help='Do not expose the reports')
    parser.add_option('-o', dest='output_dir')
    parser.add_option('--bed', dest='bed',
                      help='BED file to run targetSeq and Seq2C analysis on.')
    parser.add_option('--downsample-to', dest='downsample_to', type='int')

    opts, args = parser.parse_args()
    logger.is_debug = opts.debug

    if not args:
        critical('Usage: ' + __file__ + ' *.fq.gz -o output_dir')

    # Keep only the inputs that pass verification.
    fastq_fpaths = []
    for arg in args:
        verified_fpath = verify_file(arg)
        if verified_fpath:
            fastq_fpaths.append(verified_fpath)
    info(str(len(fastq_fpaths)) + ' fastq files')

    run_cnf = determine_run_cnf(opts)
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    info('Writing to ' + str(cnf.output_dir))

    cnf.project_name = cnf.project_name or 'preproc'

    if cnf.work_dir:
        # An explicitly given work directory switches debug mode on.
        cnf.debug = True
    else:
        all_work_dir = join(cnf.output_dir, 'work')
        safe_mkdir(all_work_dir)
        latest_fpath = join(all_work_dir, 'latest')

        if cnf.reuse_intermediate:
            cnf.work_dir = latest_fpath
        else:
            timestamp = datetime.datetime.now().strftime("%Y-%b-%d_%H-%M")
            cnf.work_dir = join(all_work_dir, timestamp)
            # Replace whatever currently sits at "latest" with a fresh symlink.
            if islink(latest_fpath):
                os.remove(latest_fpath)
            if isdir(latest_fpath):
                shutil.rmtree(latest_fpath)
            if not exists(latest_fpath):
                os.symlink(basename(cnf.work_dir), latest_fpath)

    cnf.work_dir = adjust_path(cnf.work_dir)
    safe_mkdir(cnf.work_dir)
    cnf.log_dir = join(cnf.work_dir, 'log')
    safe_mkdir(cnf.log_dir)
    set_up_log(cnf)
    try:
        subprocess.call(['chmod', '-R', 'g+w', cnf.work_dir])
    except OSError:
        # Best effort: a failed chmod is logged but does not stop the run.
        err(traceback.format_exc())

    if cnf.samplesheet:
        cnf.samplesheet = verify_file(cnf.samplesheet, is_critical=True)

    info(' '.join(sys.argv))
    info()
    info('Created a temporary working directory: ' + cnf.work_dir)

    if cnf.project_name:
        info('Project name: ' + cnf.project_name)

    if cnf.samplesheet:
        info('Using custom sample sheet ' + cnf.samplesheet)

    check_genome_resources(cnf)
    check_system_resources(cnf, optional=['fastq'])

    return cnf, cnf.output_dir, fastq_fpaths
def main():
    """Command-line entry point: evaluate capture targets over bcbio projects.

    Parses the options, prepares the output/work/log directories, builds a
    ``BCBioStructure`` per project and calls ``evaluate_capture``.  With
    ``--exac`` (and only when ``is_us()`` is true) the resulting regions file
    and its ``.tbi`` index are copied to the ExAC coverage directory, coverage
    is calculated for all samples and the project is added to ExAC.
    """
    info(' '.join(sys.argv))
    info()
    description = 'This script evaluate capture target.'

    parser = OptionParser(description=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)

    parser.add_option('--log-dir', dest='log_dir')
    parser.add_option('--exac-only-filtering',
                      dest='prepare_for_exac',
                      action='store_true',
                      default=False,
                      help='Export filtered regions to ExAC browser.')
    parser.add_option('--exac',
                      dest='add_to_exac',
                      action='store_true',
                      default=False,
                      help='Export coverage data to ExAC browser.')
    parser.add_option('--bed',
                      '--capture',
                      '--amplicons',
                      dest='bed',
                      help='BED file to overlap.')
    parser.add_option(
        '--tricky-regions',
        dest='tricky_regions',
        action='store_true',
        default=False,
        help='Use high GC, low GC, low complexity regions to overlap.')
    parser.add_option('--min-percent',
                      dest='min_percent',
                      default='0.5',
                      help='Minimal percent of region which has low coverage.')
    parser.add_option(
        '--min-ratio',
        dest='min_ratio',
        default='0.5',
        help='Minimal percent of samples which share the same feature.')
    parser.add_option('--min-depth',
                      dest='min_depth',
                      help='Coverage threshold.')
    parser.add_option('--metadata',
                      dest='metadata',
                      help='Samples type for each project '
                      '(plasma, cell_line, ffpe, deepseq, exome, wgs).')
    parser.add_option('-o', dest='output_dir', help='Output directory.')

    cnf, bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths, tags, is_wgs_in_bcbio, is_rnaseq \
        = process_post_bcbio_args(parser)

    if not cnf.project_name:
        # Without an explicit project name nothing is exported to ExAC.
        cnf.add_to_exac = False
        cnf.project_name = 'CaptureTargetEvaluation'

    if cnf.prepare_for_exac:
        cnf.output_dir = join(get_exac_dir(cnf), 'coverage', cnf.project_name)
    elif cnf.output_dir is None:
        cnf.output_dir = join(os.getcwd(), cnf.project_name)

    cnf.output_dir = safe_mkdir(adjust_path(cnf.output_dir))
    cnf.work_dir = safe_mkdir(join(cnf.output_dir, 'work'))
    # The log directory lives inside the work directory.  'log' must be a
    # join() component; it used to be passed as safe_mkdir()'s second
    # argument, which made log_dir identical to work_dir.
    cnf.log_dir = safe_mkdir(join(cnf.work_dir, 'log'))

    # The option is the fraction with low coverage; the stored value is its complement.
    cnf.min_percent = 1 - float(cnf.min_percent)
    cnf.min_ratio = float(cnf.min_ratio)
    if cnf.min_depth:
        cnf.min_depth = int(cnf.min_depth)
    if cnf.metadata:
        cov_thresholds = {
            'deepseq': 250,
            'plasma': 100,
            'exome': 20,
            'ffpe': 10,
            'cell_line': 10,
            'wgs': 10
        }
        sample_types = cnf.metadata.split(',')
        # Report unsupported sample types explicitly instead of a bare KeyError.
        unknown_types = [t for t in sample_types if t not in cov_thresholds]
        if unknown_types:
            critical('Unknown sample type(s) in --metadata: ' + ', '.join(unknown_types) +
                     '. Supported types: ' + ', '.join(sorted(cov_thresholds)))
        cnf.min_depths = [cov_thresholds[t] for t in sample_types]

    if len(bcbio_project_dirpaths) < 1:
        critical('Usage: ' + __file__ +
                 ' project_bcbio_path [project_bcbio_path] [-o output_dir]')

    info()
    info('*' * 70)

    safe_mkdir(cnf.output_dir)

    if cnf.log_dir:
        info('log_dirpath: ' + cnf.log_dir)
        safe_mkdir(cnf.log_dir)
        set_up_log(cnf, 'evaluate_capture_target', cnf.project_name,
                   cnf.output_dir)

    bcbio_structures = []
    for bcbio_project_dirpath, bcbio_cnf, final_dirpath in zip(
            bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths):
        bs = BCBioStructure(cnf, bcbio_project_dirpath, bcbio_cnf,
                            final_dirpath)
        bcbio_structures.append(bs)

    cnf.work_dir = cnf.work_dir or adjust_path(join(cnf.output_dir, 'work'))
    safe_mkdir(cnf.work_dir)

    info('')
    info('*' * 70)
    regions_fpath = evaluate_capture(cnf, bcbio_structures)
    if cnf.add_to_exac:
        if not is_us():
            err('Exposing to ExAC browser is available only on US server')
            return
        output_dirpath = join(get_exac_dir(cnf), 'coverage', cnf.project_name)
        safe_mkdir(output_dirpath)
        # Copy the regions file and its index unless it is already in place.
        if regions_fpath and regions_fpath != join(output_dirpath,
                                                   basename(regions_fpath)):
            shutil.copy(regions_fpath,
                        join(output_dirpath, basename(regions_fpath)))
            shutil.copy(regions_fpath + '.tbi',
                        join(output_dirpath, basename(regions_fpath + '.tbi')))
        samples = []
        # All names are collected before any sample is renamed below.
        sample_names = [s.name for bs in bcbio_structures for s in bs.samples]
        for bs in bcbio_structures:
            for sample in bs.samples:
                sample.name = get_uniq_sample_key(bs.project_name, sample,
                                                  sample_names)
                samples.append(sample)
        calculate_coverage_use_grid(cnf, samples, output_dirpath)
        add_project_to_exac(cnf)
    else:
        info('Use --exac if you want to export data to ExAC browser')
    info('Done.')
Exemplo n.º 9
0
def main():
    """Command-line entry point: run the reporting suite on bcbio final dirs.

    With exactly one project, a ``BCBioRunner`` is created and its jobs are
    posted.  With several projects, no per-project post-processing is done;
    their summary reports are combined with ``combine_projects`` into a
    common output directory.
    """
    parser = OptionParser(description='This script runs reporting suite on the bcbio final directory.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)

    def add_flag(*names, **settings):
        # Register a boolean switch (action='store_true').
        parser.add_option(*names, action='store_true', **settings)

    add_flag('--load-mongo', '--mongo-loader', dest='load_mongo',
             default=defaults['load_mongo'], help='Load to Mongo DB')
    parser.add_option('--datahub-path', dest='datahub_path',
                      help='DataHub directory path to upload final MAFs and CNV (can be remote).')
    parser.add_option('--email', dest='email',
                      help='E-mail address to send notifications on errors and finished jobs.')
    add_flag('--reannotate', dest='reannotate', default=False,
             help='Re-annotate BED file with gene names')
    add_flag('--extended', dest='extended', default=False,
             help='Count flagged regions and missed variants')
    add_flag('--dedup', dest='dedup', default=False,
             help='Count duplicates in coverage metrics')
    parser.add_option('--seq2c-opts', dest='seq2c_opts',
                      help='Options for the final lr2gene.pl script.')
    parser.add_option('--seq2c-controls', dest='seq2c_controls',
                      help='Additional controls for Seq2C.')
    add_flag('--deep-seq', dest='deep_seq', default=False,
             help='Use run_info_DeepSeq.yaml')
    add_flag('--wgs', dest='is_wgs', default=None,
             help='Ignore sv_regions and run as WGS')
    add_flag('--only-summary', dest='only_summary', default=False,
             help='Only generate project-level report')
    parser.add_option('--jira', dest='jira', help='JIRA case path')
    parser.add_option('--bed', '--capture', '--amplicons', dest='bed',
                      help='BED file to run targetSeq and Seq2C analysis on.')
    parser.add_option('--exons', '--exome', dest='exons',
                      help='Exons BED file to make targetSeq exon/amplicon regions reports.')
    parser.add_option('--no-prep-bed', dest='prep_bed', action='store_false', default=True,
                      help='do not fix input beds and exons')
    add_flag('--no-dedup', dest='no_dedup', help=SUPPRESS_HELP)
    parser.add_option('-f', '--freq', '--min-freq', dest='min_freq', type='float',
                      help='Minimum allele frequency for the filtering.')
    parser.add_option('-o', dest='output_dir',
                      help='Output directory for report combining.')
    parser.add_option('--transcripts', dest='transcripts_fpath',
                      help='Transcripts for annotation.')
    add_flag('--no-bam2bigwig', dest='no_bam2bigwig', default=False, help=SUPPRESS_HELP)

    (cnf, bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths,
     tags, is_wgs_in_bcbio, is_rnaseq) = process_post_bcbio_args(parser)
    is_wgs = cnf.is_wgs or is_wgs_in_bcbio
    cnf.is_wgs = is_wgs

    cnf.run_date = time.localtime()
    # Remember the requested name: it is cleared while the individual
    # structures of a multi-project run are being built.
    cnf_project_name = cnf.project_name
    if len(bcbio_project_dirpaths) > 1:
        cnf.project_name = None

    info()
    info('*' * 70)
    bcbio_structures = [
        BCBioStructure(cnf, project_dirpath, project_cnf, project_final_dirpath,
                       is_wgs=is_wgs, is_rnaseq=is_rnaseq)
        for project_dirpath, project_cnf, project_final_dirpath
        in zip(bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths)]

    projects_count = len(bcbio_structures)

    # Post-processing one bcbio project as usual.
    if projects_count == 1:
        if cnf.min_freq is not None:
            info('Min freq for filtering is %f' % cnf.min_freq)

        if cnf.steps and cnf.load_mongo and 'MongoLoader' not in cnf.steps:
            cnf.steps.append('MongoLoader')

        check_system_resources(cnf, required=['qsub'])

        bcbio_runner = BCBioRunner(cnf, bcbio_structures[0], cnf.bcbio_cnf)
        bcbio_runner.post_jobs()

    # Special case: multiple projects in input. They are not post-processed;
    # their summary reports are combined instead.
    elif projects_count > 1:
        cnf.project_name = cnf_project_name or '_'.join(
            structure.project_name for structure in bcbio_structures)

        if not cnf.output_dir:
            cnf.output_dir = join(os.getcwd(), cnf.project_name)
        safe_mkdir(cnf.output_dir)

        cnf.log_dir = join(cnf.output_dir, 'log')
        info('log_dirpath: ' + cnf.log_dir)
        safe_mkdir(cnf.log_dir)
        set_up_log(cnf, 'miltiple_projects', cnf.project_name, cnf.output_dir)

        cnf.work_dir = adjust_path(join(cnf.output_dir, 'work'))
        safe_mkdir(cnf.work_dir)
        safe_mkdir(adjust_path(join(cnf.output_dir, 'config')))

        combine_projects(cnf, bcbio_structures, tags)
def main():
    """Command-line entry point: paired WGS/target clinical reports.

    Expects exactly two bcbio projects: the one whose ``BCBioStructure`` has a
    ``bed`` is treated as the targeted project, the one without as WGS.  The
    samples present in both (matched by name) are passed to
    ``run_clinical_target2wgs``.
    """
    info(' '.join(sys.argv))
    info()
    description = 'This script makes paired WGS-target clinical reports based on 2 bcbio projects.'

    parser = OptionParser(description=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)

    parser.add_option(
        '--email',
        dest='email',
        help='E-mail address to send notifications on errors and finished jobs.'
    )
    parser.add_option('--jira', dest='jira', help='JIRA case path')
    parser.add_option('--bed',
                      '--capture',
                      '--amplicons',
                      dest='bed',
                      help='BED file to run targetSeq and Seq2C analysis on.')
    parser.add_option(
        '--exons',
        '--exome',
        dest='exons',
        help='Exons BED file to make targetSeq exon/amplicon regions reports.')
    parser.add_option('-o',
                      dest='output_dir',
                      help='Output directory for report combining.')

    cnf, bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths, tags, is_wgs_in_bcbio, is_rnaseq \
        = process_post_bcbio_args(parser)
    is_wgs = cnf.is_wgs = cnf.is_wgs or is_wgs_in_bcbio

    if len(bcbio_project_dirpaths) != 2:
        critical('Usage: ' + __file__ + ' wgs_project_project_bcbio_path '
                 'targetseq_project_bcbio_path [-o output_dir]')

    info()
    info('*' * 70)
    bcbio_structures = []
    for bcbio_project_dirpath, bcbio_cnf, final_dirpath in zip(
            bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths):
        bs = BCBioStructure(cnf,
                            bcbio_project_dirpath,
                            bcbio_cnf,
                            final_dirpath,
                            is_wgs=is_wgs,
                            is_rnaseq=is_rnaseq)
        bcbio_structures.append(bs)

    # Every structure either has a BED or not, so with two projects at least
    # one of these is always found; only the "both of one kind" cases remain.
    trg_bs = next((bs for bs in bcbio_structures if bs.bed), None)
    wgs_bs = next((bs for bs in bcbio_structures if not bs.bed), None)
    if not trg_bs:
        critical('One of the projects must be targeted.')
    if not wgs_bs:
        critical('One of the projects must be WGS.')

    if not cnf.project_name:
        cnf.project_name = wgs_bs.project_name.replace('_WGS',
                                                       '').replace('WGS', '')

    if cnf.output_dir is None:
        cnf.output_dir = join(os.getcwd(), cnf.project_name)
    safe_mkdir(cnf.output_dir)

    cnf.log_dir = join(cnf.output_dir, 'log')
    info('log_dirpath: ' + cnf.log_dir)
    safe_mkdir(cnf.log_dir)
    set_up_log(cnf, 'clinical_target2wgs', cnf.project_name, cnf.output_dir)

    cnf.work_dir = cnf.work_dir or adjust_path(join(cnf.output_dir, 'work'))
    safe_mkdir(cnf.work_dir)

    shared_sample_names = set(s.name for s in wgs_bs.samples) & set(
        s.name for s in trg_bs.samples)
    if not shared_sample_names:
        # Each list goes on its own line; previously the last target name ran
        # straight into "WGS:".
        critical('No shared samples in target and WGS projects.\n'
                 'Target: ' + ', '.join(s.name for s in trg_bs.samples) + '\n'
                 'WGS: ' + ', '.join(s.name for s in wgs_bs.samples))
    # Sorted only for a reproducible log line; the set itself is passed on.
    info('Shared samples: ' + ', '.join(sorted(shared_sample_names)))

    info('')
    info('*' * 70)
    run_clinical_target2wgs(cnf.genome, wgs_bs, trg_bs, shared_sample_names,
                            cnf.output_dir)
Exemplo n.º 11
0
def get_args():
    """Parse the VarDict filtering command line.

    Requires one positional argument (the vcf2txt.pl output), a configured
    genome and ``-o``; each missing piece is reported through ``critical``.

    Returns:
        A tuple ``(cnf, vcf2txt_res_fpath)``.
    """
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(description=(
        'The program will filter the VarDict output after vcf2txt.pl to '
        'candidate interpretable mutations, somatic or germline.'))
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)

    # Output files.
    parser.add_option('-o', dest='output_file')
    parser.add_option('--o-all-transcripts', dest='all_transcripts_output_file')
    parser.add_option('--o-fm', dest='fm_output_file')
    parser.add_option('--o-reject', dest='rejected_output_file')

    # Auxiliary inputs.
    parser.add_option('--cohort-freqs', dest='cohort_freqs_fpath')
    parser.add_option('--transcripts', dest='transcripts_fpath')

    # Filtering thresholds.
    parser.add_option('-D', '--min-depth', dest='filt_depth', type='int',
                      help='The minimum total depth')
    parser.add_option('-V', '--min-vd', dest='min_vd', type='int',
                      help='The minimum reads supporting variant')
    parser.add_option('--gmaf', dest='min_gmaf', type='float',
                      help="When the GMAF is greater than specified, it's considered common SNP and filtered out.")
    parser.add_option('-f', '--min-freq', dest='min_freq', type='float',
                      help='The minimum allele frequency for regular variants.')
    parser.add_option('-F', '--min-freq-hs', '--act-min-freq', dest='act_min_freq', type='float',
                      help='The minimum allele frequency hotspot somatic mutations, typically lower then -f. '
                           'Default: 0.01 or half -f, whichever is less')
    parser.add_option('-N', '--keep-utr-intronic', dest='keep_utr_intronic', action='store_true',
                      help='Keep all intronic and UTR in the output, but will be set as "unknown".')

    parser.add_option('-p', '--platform', dest='platform',
                      help='The platform, such as WXS, WGS, RNA-Seq, VALIDATION, etc. No Default. '
                           "Used for output in FM's format")

    usage_text = 'Usage: ' + __file__ + ' vcf2txt_res_fpath [opts] -o output_fpath'
    parser.set_usage(usage_text)

    opts, args = parser.parse_args()
    if len(args) < 1:
        critical('Provide the first argument - output from vcf2txt.pl')
    logger.is_debug = opts.debug

    vcf2txt_res_fpath = verify_file(args[0], is_critical=True)

    run_cnf = determine_run_cnf(opts)
    sys_cnf = determine_sys_cnf(opts)
    cnf = Config(opts.__dict__, sys_cnf, run_cnf)
    if not cnf.genome:
        critical('Please, specify the --genome option (e.g. --genome hg19)')

    check_genome_resources(cnf)

    if not cnf.output_file:
        critical('Please, specify the output fpath with -o')

    info()

    return cnf, vcf2txt_res_fpath
Exemplo n.º 12
0
def main():
    """Combine clinical reports of several bcbio projects into one report.

    Registers the shared config options plus ``--log-dir``, ``--email``,
    ``--metadata`` and ``-o``, lets ``process_post_bcbio_args`` parse the
    command line, builds one ``BCBioStructure`` per bcbio project, prepares
    the output, log and work directories and finally calls
    ``run_combine_clinical_reports``.
    """
    info(' '.join(sys.argv))
    info()
    description = 'This script makes clinical reports based on multiple bcbio projects.'

    parser = OptionParser(description=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)

    parser.add_option('--log-dir', dest='log_dir')
    parser.add_option(
        '--email',
        dest='email',
        help='E-mail address to send notifications on errors and finished jobs.'
    )
    parser.add_option('--metadata',
                      dest='metadata_csv',
                      help='CSV file with parameters of each sample.')
    parser.add_option('-o',
                      dest='output_dir',
                      help='Output directory for report combining.')

    cnf, bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths, tags, is_wgs_in_bcbio, is_rnaseq \
        = process_post_bcbio_args(parser)
    # WGS mode is on if either the config or the bcbio projects say so.
    is_wgs = cnf.is_wgs = cnf.is_wgs or is_wgs_in_bcbio

    if not cnf.metadata_csv:
        # Emit the problem and the usage in ONE call: critical() presumably
        # aborts the script, so a second critical() would never be reached
        # and the usage line would be lost.
        critical(
            'Provide the path to CSV file with information of each sample\n'
            'Usage: ' + __file__ +
            '  project_bcbio_path [project_bcbio_path] --metadata metadata_path [-o output_dir]'
        )

    cnf.sample_names = []
    parameters_info, samples_data = parse_samples_metadata(
        cnf, cnf.metadata_csv)

    info()
    info('*' * 70)
    bcbio_structures = []
    for bcbio_project_dirpath, bcbio_cnf, final_dirpath in zip(
            bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths):
        bs = BCBioStructure(cnf,
                            bcbio_project_dirpath,
                            bcbio_cnf,
                            final_dirpath,
                            is_wgs=is_wgs,
                            is_rnaseq=is_rnaseq)
        bcbio_structures.append(bs)
        # Every sample must carry a target coverage JSON path.
        for s in bs.samples:
            assert s.targetcov_json_fpath, str(s.dirpath) + ' ' + str(
                s.targqc_dirpath)

    # Truthiness (not ``is None``) so that an empty output dir combined with a
    # missing project name is rejected here instead of failing in join() below.
    if not cnf.output_dir and not cnf.project_name:
        critical(
            'Either -o (output dir) or --project (project name) has to be specified'
        )

    if not cnf.output_dir:
        cnf.output_dir = join(os.getcwd(), cnf.project_name)
    if not cnf.project_name:
        cnf.project_name = 'Combined_project'

    cnf.output_dir = safe_mkdir(adjust_path(cnf.output_dir))

    # The log directory always lives inside the output directory; any value
    # given via --log-dir is overwritten here.
    cnf.log_dir = join(cnf.output_dir, 'log')
    info('log_dirpath: ' + cnf.log_dir)
    safe_mkdir(cnf.log_dir)
    set_up_log(cnf, 'combine_clin_reports', cnf.project_name, cnf.output_dir)

    cnf.work_dir = cnf.work_dir or adjust_path(join(cnf.output_dir, 'work'))
    safe_mkdir(cnf.work_dir)

    # shared_sample_names = set(s.name for bs in bcbio_structures for s in bs.samples)
    # if not shared_sample_names:
    #    sample_names = [bs.project_name + ': ' + ', '.join(s.name for s in bs.samples) for bs in bcbio_structures]
    #    critical('Not shared samples in projects.\n' + '\n'.join(sample_names))

    # info('Shared samples: ' + ', '.join(shared_sample_names))

    info('')
    info('*' * 70)
    run_combine_clinical_reports(cnf, bcbio_structures, parameters_info,
                                 samples_data)
Exemplo n.º 13
0
def main():
    """Run TargQC on the BAM files given as positional arguments.

    Parses the command line, validates the BAMs, builds the run config,
    resolves the target BED / features BED / optional genes list, checks the
    qsub runner (unless only the summary is requested), calls ``run_targqc``
    and e-mails the resulting HTML report path when one is returned.
    """
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(
        description='This script generates target QC reports for each BAM provided as an input.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)

    # (flags, keyword arguments) of every script-specific option, in
    # registration order.
    option_specs = (
        (('--work-dir',), dict(dest='work_dir', metavar='DIR')),
        (('--log-dir',), dict(dest='log_dir')),
        (('--only-summary',), dict(dest='only_summary', action='store_true')),
        (('-o',), dict(dest='output_dir', metavar='DIR',
                       default=join(os.getcwd(), 'targetqc'))),
        (('--reannotate',), dict(dest='reannotate', action='store_true',
                                 default=False,
                                 help='re-annotate BED file with gene names')),
        (('--dedup',), dict(dest='dedup', action='store_true',
                            default=False,
                            help='count duplicates in coverage metrics')),
        (('--bed',), dict(dest='bed',
                          help='BED file to run targetSeq and Seq2C analysis on.')),
        (('--exons', '--exome', '--features'), dict(
            dest='features',
            help='Annotated CDS/Exon/Gene/Transcripts BED file to make targetSeq exon/amplicon regions reports.')),
    )
    for flags, kwargs in option_specs:
        parser.add_option(*flags, **kwargs)

    opts, positional = parser.parse_args()
    logger.is_debug = opts.debug

    if len(positional) == 0:
        critical('No BAMs provided to input.')
    # De-duplicate the absolute BAM paths.
    bam_fpaths = list(set([abspath(arg) for arg in positional]))

    bad_bam_fpaths = [bam for bam in bam_fpaths if not verify_bam(bam)]
    if bad_bam_fpaths:
        critical('BAM files cannot be found, empty or not BAMs:' +
                 ', '.join(bad_bam_fpaths))

    # No --bed on the command line means the run is treated as WGS.
    bed_opt = opts.__dict__.get('bed')
    run_cnf = determine_run_cnf(opts, is_wgs=not bed_opt)
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)

    if not cnf.project_name:
        # Fall back to the name of the output directory.
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'TargQC'
    set_up_dirs(cnf)

    check_genome_resources(cnf)

    verify_bed(cnf.bed, is_critical=True)
    bed_fpath = adjust_path(cnf.bed)
    info('Using amplicons/capture panel ' + bed_fpath)

    # An explicit features BED wins over the genome default.
    if cnf.features:
        features_bed_fpath = adjust_path(cnf.features)
    else:
        features_bed_fpath = adjust_path(cnf.genome.features)
    info('Features: ' + features_bed_fpath)

    genes_fpath = None
    if cnf.genes:
        genes_fpath = adjust_path(cnf.genes)
        info('Custom genes list: ' + genes_fpath)

    if not cnf.only_summary:
        # A full run needs a usable qsub runner.
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided is sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    info('*' * 70)
    info()

    targqc_html_fpath = run_targqc(cnf, cnf.output_dir, bam_fpaths, bed_fpath,
                                   features_bed_fpath, genes_fpath)
    if targqc_html_fpath:
        message = ('TargQC report for ' + cnf.project_name + ':\n  ' +
                   targqc_html_fpath)
        send_email(cnf, message)
Exemplo n.º 14
0
def proc_args(argv):
    """Parse the command line and build the config and the VCF samples.

    ``argv`` is accepted but not read here; ``parser.parse_args()`` is called
    without arguments, so options come from ``sys.argv``.

    Returns a ``(cnf, samples)`` tuple: the ``Config`` object and the list of
    ``source.VarSample`` objects ordered by their ``key_to_sort()``.
    """
    info(' '.join(sys.argv))
    info()

    # NOTE(review): the description talks about BAMs / target QC although this
    # parser takes VCF files - looks copy-pasted; confirm before changing.
    parser = OptionParser(
        description='This script generates target QC reports for each BAM provided as an input.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)

    # (flags, keyword arguments) of every script-specific option, in
    # registration order.
    option_specs = (
        (('--log-dir',), dict(dest='log_dir')),
        (('--is-wgs',), dict(dest='is_wgs', action='store_true',
                             default=False, help='whole genome sequencing')),
        (('--is-deep-seq',), dict(dest='is_deep_seq', action='store_true',
                                  default=False,
                                  help='deep targeted sequencing')),
        (('--only-summary',), dict(dest='only_summary', action='store_true')),
        (('-o',), dict(dest='output_dir', metavar='DIR',
                       default=join(os.getcwd(), 'targetqc'))),
        (('-c', '--caller'), dict(dest='caller')),
        (('--qc',), dict(dest='qc', action='store_true', default=False)),
        (('--no-qc',), dict(dest='qc', action='store_false', default=False)),
        (('--qc-caption',), dict(dest='qc_caption', help=SUPPRESS_HELP)),
        (('--no-tsv',), dict(dest='tsv', action='store_false', default=True,
                             help=SUPPRESS_HELP)),
    )
    for flags, kwargs in option_specs:
        parser.add_option(*flags, **kwargs)

    opts, positional = parser.parse_args()
    logger.is_debug = opts.debug

    if len(positional) == 0:
        critical('No vcf files provided to input.')

    run_cnf = determine_run_cnf(
        opts, is_targetseq=opts.is_deep_seq, is_wgs=opts.is_wgs)
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)

    vcf_fpath_by_sample = read_samples(positional, cnf.caller)
    info()

    if not cnf.project_name:
        # Fall back to the name of the output directory.
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Variants'
    set_up_dirs(cnf)
    # The command line is printed a second time after the directories are set
    # up (presumably so that it also lands in the log file - confirm).
    info(' '.join(sys.argv))

    # One VarSample per input VCF, each with its own sub-directory of the
    # output directory.
    samples = []
    for s_name, vcf_fpath in vcf_fpath_by_sample.items():
        sample_dirpath = join(cnf.output_dir, s_name)
        samples.append(source.VarSample(s_name, sample_dirpath, vcf=vcf_fpath))
    samples.sort(key=lambda smp: smp.key_to_sort())

    check_genome_resources(cnf)

    if not cnf.only_summary:
        # A full run needs a usable qsub runner.
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided is sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples
Exemplo n.º 15
0
def main():
    """Prepare the data of one or more bcbio projects for the ExAC browser.

    Steps, in order:
      1. parse the command line and build one ``BCBioStructure`` per project;
      2. set up the ExAC output, log and work directories;
      3. convert per-sample VarDict results to VCFs and merge them into one
         combined VCF (skipped when a reusable combined VCF already exists);
      4. compute coverage and optionally evaluate the capture target;
      5. split BAM files for IGV when a combined VCF is available;
      6. register the project via ``add_project_to_exac``.
    """
    info(' '.join(sys.argv))
    info()

    description = 'This script prepare data for ExAC browser'
    parser = OptionParser(description=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)

    parser.add_option('--log-dir', dest='log_dir', default='-')
    parser.add_option('--bed', dest='bed', help='BED file.')
    parser.add_option('--evaluate-capture-target',
                      dest='do_evaluate_capture',
                      action='store_true',
                      help='Evaluate capture target.')
    parser.add_option('-o',
                      dest='output_dir',
                      help='Output directory with ExAC data.')

    cnf, bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths, tags, is_wgs_in_bcbio, is_rnaseq \
        = process_post_bcbio_args(parser)

    # Shared by both argument checks below.
    usage = (
        'Usage: ' + __file__ +
        ' -g hg19 project_bcbio_path [project_bcbio_path] [--bed bed_fpath] [-o output_dir] [--evaluate-capture-target]'
    )

    if not cnf.genome:
        critical(usage)
    cnf.output_dir = get_exac_dir(cnf)

    if not bcbio_project_dirpaths:
        critical(usage)

    info()
    info('*' * 70)
    bcbio_structures = []
    # The project name is cleared while the structures are built and restored
    # afterwards (presumably so that each BCBioStructure derives its own
    # project name - confirm against BCBioStructure).
    project_name = cnf.project_name
    cnf.project_name = None
    for bcbio_project_dirpath, bcbio_cnf, final_dirpath in zip(
            bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths):
        bs = BCBioStructure(cnf, bcbio_project_dirpath, bcbio_cnf,
                            final_dirpath)
        bcbio_structures.append(bs)

    cnf.project_name = project_name
    if not cnf.project_name:
        if len(bcbio_structures) == 1:
            cnf.project_name = bcbio_structures[0].project_name
        else:
            critical(
                'If you combine multiple BCBIO projects you should specify new project name'
            )
    cnf.caller_name = 'vardict'

    if cnf.output_dir is None:
        critical('Please specify path to ExAC data directory.')
    safe_mkdir(cnf.output_dir)

    cnf.log_dir = join(cnf.output_dir, cnf.project_name + '_log')
    info('log_dirpath: ' + cnf.log_dir)
    safe_mkdir(cnf.log_dir)
    set_up_log(cnf, 'prepare_for_exac', cnf.project_name, cnf.output_dir)

    cnf.work_dir = cnf.work_dir or adjust_path(
        join(cnf.output_dir, 'work', cnf.project_name))
    safe_mkdir(cnf.work_dir)

    # Collect all samples, renamed to keys built from project name + sample.
    samples = []
    for bs in bcbio_structures:
        for sample in bs.samples:
            sample.name = get_uniq_sample_key(bs.project_name, sample)
            samples.append(sample)

    info()
    info('Preparing variants data')
    variants_dirpath = join(cnf.output_dir, 'vardict')
    safe_mkdir(variants_dirpath)
    combined_vcf_raw_fpath = join(variants_dirpath, cnf.project_name + '.vcf')
    combined_vcf_fpath = combined_vcf_raw_fpath + '.gz'
    if not cnf.reuse_intermediate or not verify_file(combined_vcf_fpath):
        vcf_fpath_by_sname = dict()
        for bs in bcbio_structures:
            pass_mut_fpaths = get_mutations_fpaths(bs)
            vcf_fpaths, pass_vcf_fpaths = convert_vardict_txts_to_bcbio_vcfs(
                cnf.work_dir,
                cnf.genome.name,
                pass_mut_fpaths,
                bs.samples,
                cnf.caller_name,
                output_dirpath=cnf.work_dir,
                pass_only=False,
                bed_fpath=bs.sv_bed,
                min_freq=bs.cnf.variant_filtering['min_freq'],
                act_min_freq=bs.cnf.variant_filtering['act_min_freq'])
            if not vcf_fpaths and not pass_vcf_fpaths:
                continue
            # Prefer the full VCF of a sample; fall back to the PASS-only one.
            for sample, vcf_fpath, pass_vcf_fpath in zip(
                    bs.samples, vcf_fpaths, pass_vcf_fpaths):
                if vcf_fpath and verify_file(vcf_fpath):
                    vcf_fpath_by_sname[sample.name] = vcf_fpath
                elif pass_vcf_fpath and verify_file(pass_vcf_fpath):
                    vcf_fpath_by_sname[sample.name] = pass_vcf_fpath

        if not vcf_fpath_by_sname:
            info('No VCFs found, skipping preparing variants')
            # Nothing was merged, so there is no combined VCF for this run.
            # Without this reset the variable keeps the expected '.vcf.gz'
            # path and the BAM splitting below would run against a missing
            # (or stale) file instead of emitting the warning.
            combined_vcf_fpath = None
        else:
            info()
            combined_vcf_fpath = merge_vcfs(cnf, vcf_fpath_by_sname,
                                            combined_vcf_raw_fpath)
            project_vcf_dirpath = join(variants_dirpath, cnf.project_name)
            safe_mkdir(project_vcf_dirpath)
            # Move the per-sample VCFs (and their tabix indexes) next to the
            # combined VCF unless a file of that name is already there.
            for vcf_fpath in vcf_fpath_by_sname.values():
                moved_fpath = join(project_vcf_dirpath, basename(vcf_fpath))
                if verify_file(vcf_fpath) and not verify_file(moved_fpath,
                                                              silent=True):
                    shutil.move(vcf_fpath, project_vcf_dirpath)
                    shutil.move(vcf_fpath + '.tbi', project_vcf_dirpath)

    info()
    info('Saving coverage')
    project_cov_dirpath = join(cnf.output_dir, 'coverage', cnf.project_name)
    safe_mkdir(project_cov_dirpath)
    calculate_coverage_use_grid(cnf, samples, project_cov_dirpath)
    if cnf.do_evaluate_capture:
        evaluate_capture(cnf, bcbio_project_dirpaths)

    if combined_vcf_fpath:
        info()
        info('Creating BAM files for IGV')
        # NOTE(review): exac_data_dir is not defined in this function; it is
        # presumably a module-level name - confirm.
        exac_features_fpath = os.path.join(exac_data_dir, cnf.genome.name,
                                           'all_features.bed.gz')
        split_bam_files_use_grid(cnf, samples, combined_vcf_fpath,
                                 exac_features_fpath)
    else:
        warn(
            'Combined VCF file does not exist. BAM files for IGV cannot be created'
        )

    info()
    add_project_to_exac(cnf)
    info('Done.')