Example #1
def markdup_bam(cnf, in_bam_fpath, bammarkduplicates=None):
    """Perform non-stream based deduplication of BAM input files using biobambam.
    """
    if not bammarkduplicates:
        bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
        if not bammarkduplicates:
            warn('No biobambam bammarkduplicates, can\'t mark duplicates.')
            return None

    out_bam_fpath = add_suffix(in_bam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir,
                     splitext_plus(basename(in_bam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = (
        '{bammarkduplicates} tmpfile={tmp_fpath} I={in_bam_fpath} O={out_bam_fpath}'
    ).format(**locals())
    res = call(cnf,
               cmdline,
               output_fpath=out_bam_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)
    if res:
        return out_bam_fpath
    else:
        return None
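
# A minimal usage sketch (added for illustration, not part of the original source):
# it assumes `cnf` was built via Config(...) with work_dir set and that biobambam's
# bammarkduplicates is resolvable via get_system_path(); the BAM path is hypothetical.
dedup_bam = markdup_bam(cnf, '/path/to/sample.bam')
if dedup_bam:
    info('Duplicates marked: ' + dedup_bam)
else:
    warn('Could not mark duplicates, continuing with the original BAM.')
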
Example #2
def main():
    info(' '.join(sys.argv))
    info()
    parser = OptionParser(
        usage='Usage: ' + basename(__file__) +
        ' --bed BED_file --bam BAM_file -g hg19 -o Output_BEDGRAPH_file '
        '--work-dir work_directory --chr chromosome')
    parser.add_option('-o', dest='output_dir')
    parser.add_option('--samples', dest='sample_names')
    parser.add_option('--bams', dest='bams')
    parser.add_option('--vcf', dest='vcf_fpath')
    parser.add_option('--chr', dest='chrom')
    parser.add_option('--bed', dest='bed', help='BED file.')
    parser.add_option('-g',
                      '--genome',
                      dest='chr_len_fpath',
                      help='File with chromosomes lengths.')
    parser.add_option('--work-dir', dest='work_dir', help='Work directory.')
    (opts, args) = parser.parse_args(sys.argv[1:])

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    if not cnf.output_dir or not cnf.bams:
        critical(parser.usage)

    samples = [
        BaseSample(sample_name, None, bam=bam)
        for (sample_name,
             bam) in zip(cnf.sample_names.split(','), cnf.bams.split(','))
    ]

    safe_mkdir(cnf.output_dir)
    safe_mkdir(cnf.work_dir)
    get_regions_coverage(cnf, samples)
    info('Done.')
def run_vcf2txt_vardict2mut_for_samples(cnf,
                                        var_samples,
                                        output_dirpath,
                                        vcf2txt_out_fpath,
                                        caller_name=None,
                                        threads_num=1):

    threads_num = min(len(var_samples), cnf.threads)
    info('Number of threads for filtering: ' + str(threads_num))

    safe_mkdir(output_dirpath)

    vcf_fpath_by_sample = {s.name: s.anno_vcf_fpath for s in var_samples}
    res = run_vcf2txt(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath)
    if not res:
        err('vcf2txt run returned non-0')
        return None

    # vardict2mut_py = get_script_cmdline(cnf, 'python', join('scripts', 'post', 'vardict2mut.py'))
    # if not vardict2mut_py:
    #     critical('vardict2mut_py not found')

    info('Running vardict2mut')
    res = run_vardict2mut(
        cnf, vcf2txt_out_fpath,
        add_suffix(vcf2txt_out_fpath, variant_filtering.mut_pass_suffix))
    if not res:
        critical('vardict2mut.py run returned non-0')
    mut_fpath = res
    mut_fpath = convert_gpfs_path_to_url(mut_fpath)
    info()

    info('Done filtering with vcf2txt/vardict2mut, saved to ' + str(mut_fpath))
    return mut_fpath
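
# Illustrative, hedged usage sketch (not from the original source): `cnf` and
# `var_samples` (objects exposing .name and .anno_vcf_fpath) are assumed to exist;
# the output location is hypothetical.
vcf2txt_out = join(cnf.output_dir, 'vardict.txt')
mut_fpath = run_vcf2txt_vardict2mut_for_samples(
    cnf, var_samples, cnf.output_dir, vcf2txt_out, caller_name='vardict')
if mut_fpath:
    info('PASSed mutations saved to ' + mut_fpath)
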
def split_bams(cnf, samples, vcf_fpath):
    variants_by_chrom = parse_variants(vcf_fpath)
    temp_output_dirpath = join(cnf.work_dir, 'temp')
    safe_mkdir(temp_output_dirpath)
    info('Splitting BAM files...')
    for chrom, variants in variants_by_chrom.iteritems():
        chr_lengths = get_chr_lengths_from_seq(cnf.genome.seq)
        chr_lengths_dict = dict((c, l) for (c, l) in chr_lengths)
        chr_length = chr_lengths_dict[chrom]
        transcripts = get_transcipts_with_exons_from_features(verify_file(cnf.features, is_critical=True), cur_chrom=chrom)
        bams_created_before = []
        bams_by_sample = defaultdict(list)
        info('Extracting variant coverage for all samples for ' + chrom + ', ' + str(len(variants)) + ' variants')
        for variant in variants:
            variant_bams_by_sample = extract_variant_from_bams(cnf, temp_output_dirpath,
                 transcripts, chr_length, samples, chrom, variant, bams_created_before)
            bams_created_before.extend(variant_bams_by_sample.values())
            for sample_name, bam_fpath in variant_bams_by_sample.iteritems():
                bams_by_sample[sample_name].append(bam_fpath)
        chrom = chrom.replace('chr', '')
        info()
        for sample_name, bam_fpaths in bams_by_sample.iteritems():
            info('Making combined BAMs for chr' + chrom + ' for sample ' + sample_name)
            bam_fname = '{chrom}-{sample_name}.bam'.format(**locals())
            temp_combined_bam_fpath = join(temp_output_dirpath, bam_fname)
            combined_bam_fpath = join(cnf.output_dir, bam_fname)
            generate_combined_bam(cnf, bam_fpaths, temp_combined_bam_fpath, combined_bam_fpath)
            info()
    info('Removing BAM files...')
    shutil.rmtree(temp_output_dirpath, ignore_errors=True)
Example #5
def proc_args(argv):
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls', help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        parser.print_usage()
        sys.exit(1)
    if len(args) == 1 and not args[0].endswith('.bam'):
        sample_names, bam_fpaths = read_samples(verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict()
        for s, b in zip(sample_names, bam_fpaths):
            bam_by_sample[s] = b
    else:
        bam_by_sample = find_bams(args)

    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
            for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info('  ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner: critical('Error: qsub-runner is not provided in sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir
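
# Hedged usage sketch (illustrative only): a typical entry point would unpack the
# tuple returned above before running the actual analysis.
def main():
    cnf, samples, target_bed, output_dir = proc_args(sys.argv)
    info('Running Seq2C on ' + str(len(samples)) + ' samples, output dir: ' + output_dir)
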
Example #6
def calculate_coverage_use_grid(cnf, samples, output_dirpath):
    assert len(samples) > 0

    sambamba = get_system_path(cnf,
                               join(get_ext_tools_dirname(), 'sambamba'),
                               is_critical=True)

    chr_len_fpath = get_chr_len_fpath(cnf)
    jobs_to_wait = []

    for sample in samples:
        sample_output_dirpath = join(output_dirpath, sample.name)
        safe_mkdir(sample_output_dirpath)

    for chrom in chromosomes:
        info('Processing chromosome ' + chrom)
        avg_cov_output_fpath = join(output_dirpath, chrom + '.txt.gz')
        sample_output_fpaths = [
            join(output_dirpath, sample.name, chrom + '.txt.gz')
            for sample in samples
        ]

        sample_names = ','.join(sample.name for sample in samples)
        chrom_bams = []

        for sample in samples:
            if not verify_file(sample.bam):
                err('BAM for ' + sample.name + ' does not exist!')
                continue
            output_bam_fpath = join(
                cnf.work_dir,
                basename(sample.name) + '_' + str(chrom) + '.bam')
            cmdline = '{sambamba} slice {sample.bam} {chrom}'.format(
                **locals())
            call(cnf, cmdline, output_fpath=output_bam_fpath)
            if verify_file(output_bam_fpath):
                chrom_bams.append(output_bam_fpath)

        bam_fpaths = ','.join(chrom_bams)

        if cnf.reuse_intermediate and verify_file(avg_cov_output_fpath, silent=True) and \
                all(verify_file(output_fpath, silent=True) for output_fpath in sample_output_fpaths):
            info(avg_cov_output_fpath + ' exists, reusing')
        else:
            j = _submit_region_cov(cnf, cnf.work_dir, chrom, bam_fpaths,
                                   sample_names, output_dirpath, chr_len_fpath)
            if j and not j.is_done:
                jobs_to_wait.append(j)
            info()

        if len(jobs_to_wait) >= cnf.threads:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
            jobs_to_wait = []
        elif not jobs_to_wait:
            info('No jobs to submit.')
    if jobs_to_wait:
        wait_for_jobs(cnf, jobs_to_wait)
Example #7
def finialize_annotate_file(cnf, vcf_fpath, sample, callername=None):
    # vcf_fpath = leave_first_sample(cnf, vcf_fpath)

    # if not cnf.no_check:
    #     vcf_fpath = _filter_malformed_fields(cnf, vcf_fpath)

    if not cnf.no_check and callername and 'vardict' not in callername:
        info()
        info('Adding SAMPLE=' + sample.name + ' annotation...')
        vcf_fpath = add_annotation(cnf,
                                   vcf_fpath,
                                   'SAMPLE',
                                   sample.name,
                                   number='1',
                                   type_='String',
                                   description='Sample name')

    final_vcf_fpath = join(
        cnf.output_dir,
        sample.name + (('-' + callername) if callername else '') + '.anno.vcf')
    if cnf.output_file:
        final_vcf_fpath = cnf.output_file
    if not vcf_fpath.endswith('.gz') and final_vcf_fpath.endswith('.gz'):
        final_vcf_fpath = splitext(final_vcf_fpath)[0]
    if vcf_fpath.endswith('.gz') and not final_vcf_fpath.endswith('.gz'):
        final_vcf_fpath = final_vcf_fpath + '.gz'

    info('Moving final VCF ' + vcf_fpath + ' to ' + final_vcf_fpath)
    if isfile(final_vcf_fpath):
        os.remove(final_vcf_fpath)
    shutil.copy(vcf_fpath, final_vcf_fpath)

    if cnf.qc:
        report = qc.make_report(cnf, final_vcf_fpath, sample)
        qc_dirpath = join(cnf.output_dir, 'qc')
        safe_mkdir(qc_dirpath)
        report = qc.save_report(cnf, report, sample, callername, qc_dirpath,
                                source.varqc_name)
        info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
        info('-' * 70)
        info()

    if final_vcf_fpath.endswith('.gz'):
        if not is_gz(final_vcf_fpath):
            err(final_vcf_fpath + ' is in incorrect gzip format')
            anno_vcf_fpath_ungz = splitext(final_vcf_fpath)[0]
            anno_vcf_fpath_gz = final_vcf_fpath
            os.rename(anno_vcf_fpath_gz, anno_vcf_fpath_ungz)
            # the file is not valid gzip, so return the renamed (uncompressed) path
            # instead of the .gz path that no longer exists
            final_vcf_fpath = anno_vcf_fpath_ungz
        else:
            info(final_vcf_fpath + ' is a good gzipped file.')
            return [final_vcf_fpath]
    else:
        info('Compressing and indexing with bgzip+tabix ' + final_vcf_fpath)
        final_vcf_fpath = bgzip_and_tabix(cnf, final_vcf_fpath)
        info('Saved VCF again to ' + final_vcf_fpath)

    return [final_vcf_fpath]
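
# Hedged usage sketch (illustrative only): `sample` is assumed to be an object with
# a .name attribute and `cnf` to carry output_dir and qc settings; the input VCF
# path is hypothetical.
final_fpaths = finialize_annotate_file(cnf, '/path/to/sample.anno.vcf.gz', sample,
                                       callername='vardict')
info('Final annotated VCF(s): ' + ', '.join(final_fpaths))
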
Example #8
def combine_targqc(cnf, bcbio_structures, tag_by_sample):
    samples = [s for bs in bcbio_structures for s in bs.samples]
    output_dir = join(cnf.output_dir, BCBioStructure.targqc_summary_dir)
    safe_mkdir(output_dir)

    summarize_targetcov.summarize_targqc(cnf,
                                         cnf.threads or len(samples),
                                         output_dir,
                                         samples,
                                         tag_by_sample=tag_by_sample)
Example #9
def add_project_files_to_jbrowse(cnf, bcbio_structure):
    genome = cnf.genome.name
    jbrowse_data_path, _, _ = set_folders(genome)

    jbrowse_dirpath = join(jbrowse_data_path, 'tracks')
    jbrowse_project_dirpath = join(jbrowse_dirpath,
                                   bcbio_structure.project_name)

    safe_mkdir(jbrowse_project_dirpath)
    jbrowse_tracks_fpath = join(jbrowse_data_path, 'tracks.conf')

    vcf_fpath_by_sample = None
    caller = bcbio_structure.variant_callers.get('vardict') or \
             bcbio_structure.variant_callers.get('vardict-java')
    if caller:
        vcf_fpath_by_sample = caller.get_filt_vcf_by_sample()

    for sample in bcbio_structure.samples:
        if sample.bam:
            index_bam(cnf, sample.bam, use_grid=True)

    for sample in bcbio_structure.samples:
        if all(isfile(join(jbrowse_project_dirpath, sample.name + ext)) for ext in ['.bam', '.bam.bai', '.vcf.gz', '.vcf.gz.tbi', '.bigwig'])\
                and check_tracks_in_configs(sample.name, bcbio_structure.project_name, jbrowse_tracks_fpath, vcf_fpath_by_sample):
            info(sample.name + ' was exported to jBrowse previously.')
            continue
        vcf_link = None
        if vcf_fpath_by_sample:
            vcf_fpath = vcf_fpath_by_sample.get(sample.name)
            if vcf_fpath and verify_file(vcf_fpath):
                vcf_link = create_jbrowse_symlink(genome,
                                                  bcbio_structure.project_name,
                                                  sample.name, vcf_fpath)
                if not verify_file(vcf_fpath + '.tbi'):
                    # tabix binary was not resolved above; look it up before building the command
                    tabix = get_system_path(cnf, 'tabix')
                    cmdline = '{tabix} {vcf_fpath}'.format(**locals())
                    call(cnf, cmdline, exit_on_error=False)
                create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                       sample.name, vcf_fpath + '.tbi')

        if sample.bam:
            bam_link = create_jbrowse_symlink(genome,
                                              bcbio_structure.project_name,
                                              sample.name, sample.bam)
            create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                   sample.name, sample.bam + '.bai')
            bigwig_link = create_jbrowse_symlink(
                genome, bcbio_structure.project_name, sample.name,
                splitext(sample.bam)[0] + '.bigwig')
            print_sample_tracks_info(sample.name, bcbio_structure.project_name,
                                     trunc_symlink(bam_link),
                                     trunc_symlink(bigwig_link),
                                     trunc_symlink(vcf_link),
                                     jbrowse_tracks_fpath)
Example #10
def main(args):
    if len(args) < 2:
        critical('Usage: ' + __file__ +
                 ' InputRootDirectory OutputRootDirectory [Build=hg38]')
        sys.exit(1)

    inp_root = adjust_path(args[0])
    out_root = adjust_path(args[1])

    build = 'hg38'
    if len(args) >= 3:
        build = args[2]

    chain_fpath = chains[build.lower()]

    for inp_dirpath, subdirs, files in os.walk(inp_root):
        for fname in files:
            if fname == 'sample1-cn_mops.bed':
                pass
            if fname.endswith('.bed'):
                inp_fpath = adjust_path(join(inp_dirpath, fname))
                print inp_fpath + ': ' + str(
                    count_bed_cols(inp_fpath)) + ' columns'

                out_dirpath = adjust_path(
                    join(out_root, relpath(inp_dirpath, inp_root)))
                safe_mkdir(out_dirpath)
                out_fpath = adjust_path(join(out_dirpath, fname))
                unlifted_fpath = adjust_path(
                    join(out_dirpath, fname + '.unlifted'))

                cmdline = ''
                lift_input = inp_fpath

                with open(inp_fpath) as f:
                    fs = f.readline().split('\t')
                try:
                    int(fs[6])
                    int(fs[7])
                except (IndexError, ValueError):
                    info('Cutting ' + inp_fpath)
                    cmdline += 'cut -f1,2,3,4 "{inp_fpath}" > __cut; '
                    lift_input = '__cut'

                cmdline += liftover_fpath + ' "{lift_input}" {chain_fpath} "{out_fpath}" "{unlifted_fpath}"'
                cmdline = cmdline.format(**locals())
                info(cmdline)
                os.system(cmdline)
                verify_file(out_fpath)
                if isfile(unlifted_fpath):
                    if getsize(unlifted_fpath) <= 0:
                        os.remove(unlifted_fpath)
                    else:
                        err('Some records were unlifted and saved to ' +
                            unlifted_fpath)
def main():
    info(' '.join(sys.argv))
    info()

    description = 'This script runs preprocessing.'

    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('--suffix',
                      dest='suffix',
                      default='subset',
                      help='Output files suffix')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    parser.add_option(
        '--downsample-to',
        dest='downsample_to',
        default=int(5e5),
        type='int',
        help=
        'Downsample reads to avoid excessive processing times with large files. '
        'Default is 500,000 reads. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    cnf = Config(opts.__dict__, determine_sys_cnf(opts),
                 determine_run_cnf(opts))
    left_reads_fpath = verify_file(opts.left_reads_fpath, is_critical=True)
    right_reads_fpath = verify_file(
        opts.right_reads_fpath,
        is_critical=True) if opts.right_reads_fpath else None
    output_dirpath = adjust_path(
        opts.output_dir) if opts.output_dir else critical(
            'Please specify output directory with -o')
    verify_dir(dirname(output_dirpath),
               description='output_dir',
               is_critical=True)
    safe_mkdir(output_dirpath)

    with workdir(cnf):
        info('Downsampling to ' + str(cnf.downsample_to))
        downsample(cnf,
                   cnf.sample_name,
                   left_reads_fpath,
                   right_reads_fpath,
                   cnf.downsample_to,
                   output_dir=cnf.output_dir,
                   suffix=cnf.suffix)
def main():
    info(' '.join(sys.argv))
    info()
    cnf, bcbio_structure = bcbio_summary_script_proc_params(
            'expression', BCBioStructure.expression_dir)

    step_greetings('Gene expression heatmaps summary for all samples')
    report_caption_names = ['Gene counts', 'Exon counts', 'Gene TPM', 'Isoform TPM']
    genes_dict, transcripts_dict = _get_gene_transcripts_id(cnf)
    for counts_fname, report_caption_name in zip(bcbio_structure.counts_names, report_caption_names):
        counts_fpath = join(bcbio_structure.expression_dirpath, counts_fname)
        if not verify_file(counts_fpath, silent=True):
            raw_counts_fpath = join(bcbio_structure.expression_dirpath, 'raw', 'combined.' + counts_fname.replace('.tsv', ''))
            info('Annotating ' + report_caption_name + ' from ' + raw_counts_fpath)
            annotate_gene_counts(cnf, raw_counts_fpath, counts_fpath, genes_dict)
        verify_file(counts_fpath, is_critical=True, description=counts_fname)

        isoforms_found = counts_fname == 'isoform.sf.tpm' and counts_fpath
        used_dict = transcripts_dict if isoforms_found else genes_dict
        report_fpath = join(safe_mkdir(join(bcbio_structure.expression_dirpath, 'html')),
                            counts_fname.replace('.tsv', '') + '.html')

        make_gene_expression_heatmaps(cnf, bcbio_structure, counts_fpath, used_dict, report_fpath,
                                      report_caption_name, keep_gene_names=isoforms_found)
    info('Done')
Example #13
def concat_fastqs(self, get_fastq_regexp, cnf):
    info('Preparing fastq files for the project named ' +
         (self.name or self.az_project_name))
    if self.mergred_dir_found:
        info('  found already merged fastq dir, skipping.')
        return
    if not self.sample_by_name:
        err('  no samples found.')
        return
    safe_mkdir(self.fastq_dirpath)
    for s in self.sample_by_name.values():
        _concat_fastq(cnf, s.find_raw_fastq(get_fastq_regexp, 'R1'),
                      s.l_fpath)
        _concat_fastq(cnf, s.find_raw_fastq(get_fastq_regexp, 'R2'),
                      s.r_fpath)
    info()
Example #14
def _summarize_varqc(cnf, output_dir, samples, caption, post_filter=False):
    name = source.varqc_name
    if post_filter:
        name = source.varqc_after_name
    varqc_dir = join(output_dir, name)
    safe_mkdir(varqc_dir)

    info('VarQC ' + ('(post-filtering) ' if post_filter else '') +
         'summary, saving to ' + varqc_dir)

    jsons_by_sample = dict()
    for s in samples:
        fpath = join((s.varannotate_dirpath
                      if not post_filter else s.varfilter_dirpath), 'qc',
                     s.name + (('-' + cnf.caller) if cnf.caller else '') +
                     '.' + name + '.json')
        if verify_file(fpath):
            jsons_by_sample[s.name] = fpath

    htmls_by_sample = dict()
    for s in samples:
        fpath = join((s.varannotate_dirpath
                      if not post_filter else s.varfilter_dirpath), 'qc',
                     s.name + (('-' + cnf.caller) if cnf.caller else '') +
                     '.' + name + '.html')
        if verify_file(fpath):
            htmls_by_sample[s.name] = fpath

    report = FullReport.construct_from_sample_report_jsons(
        samples,
        output_dir,
        jsons_by_sample=jsons_by_sample,
        htmls_by_sample=htmls_by_sample)
    full_summary_fpaths = report.save_into_files(
        cnf,
        join(varqc_dir, name),
        caption='Variant QC' + (' post-varfilter' if post_filter else '') +
        ((', ' + caption) if caption else ''))

    info()
    info('*' * 70)
    for fpath in full_summary_fpaths:
        if fpath:
            info(fpath)

    return full_summary_fpaths
Example #15
def picard_ins_size_hist(cnf, sample, bam_fpath, output_dir):
    picard = get_system_path(cnf, 'java', 'picard')
    if picard:
        safe_mkdir(dirname(sample.picard_ins_size_hist_txt_fpath))
        safe_mkdir(dirname(sample.picard_ins_size_hist_pdf_fpath))
        info('Picard ins size hist for "' + basename(bam_fpath) + '"')
        cmdline = '{picard} CollectInsertSizeMetrics' \
                  ' I={bam_fpath}' \
                  ' O={sample.picard_ins_size_hist_txt_fpath}' \
                  ' H={sample.picard_ins_size_hist_pdf_fpath}' \
                  ' VALIDATION_STRINGENCY=LENIENT'

        cmdline = cmdline.format(**locals())
        call(cnf,
             cmdline,
             output_fpath=sample.picard_ins_size_hist_txt_fpath,
             stdout_to_outputfile=False,
             exit_on_error=False)
Example #16
def _symlink_to_dir(fpath, dirpath):
    if not isdir(dirpath):
        safe_mkdir(dirpath)

    dst_path = join(dirpath, basename(fpath))

    if islink(dst_path) or isfile(dst_path):
        try:
            os.remove(dst_path)
        except OSError:
            err('Cannot symlink ' + fpath + ' -> ' + dst_path +
                ': cannot remove ' + dst_path)
            return

    try:
        symlink_plus(fpath, dst_path)
    except OSError:
        err('Cannot symlink ' + fpath + ' -> ' + dst_path)
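
# Hedged usage sketch with hypothetical paths: link a report file into a shared
# directory, replacing any stale link or file of the same name.
_symlink_to_dir('/path/to/project/report.html', '/path/to/shared/reports')
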
Example #17
def run_fastqc(cnf,
               fastq_fpath,
               output_basename,
               fastqc_dirpath,
               need_downsample=True):
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + output_basename + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline_l = '{fastqc} --dir {tmp_dirpath} --extract -o {fastqc_dirpath} -f fastq -j {java} {fastq_fpath}'.format(
        **locals())
    j = submit_job(cnf,
                   cmdline_l,
                   'FastQC_' + output_basename,
                   run_on_chara=True,
                   stdout_to_outputfile=False)
    # output_fpath=join(fastqc_dirpath, output_basename + '_fastqc', 'fastqc_report.html'))
    return j
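
# Hedged usage sketch (illustrative only): the returned job is assumed to be
# compatible with wait_for_jobs() used elsewhere in these examples; the fastq path,
# output basename, and output directory are hypothetical.
j = run_fastqc(cnf, '/path/to/sample_R1.fastq.gz', 'sample_R1', '/path/to/fastqc_out')
wait_for_jobs(cnf, [j])
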
Example #18
def create_jbrowse_symlink(genome, project_name, sample, file_fpath):
    jbrowse_data_path, _, _ = set_folders(genome)
    jbrowse_dirpath = join(jbrowse_data_path, 'tracks')
    jbrowse_project_dirpath = join(jbrowse_dirpath, project_name)
    base, ext = splitext_plus(file_fpath)
    if ext in ['.tbi', '.bai']:
        base, ext2 = splitext_plus(base)
        ext = ext2 + ext
    sym_link = join(jbrowse_project_dirpath, sample + ext)
    if not verify_dir(jbrowse_project_dirpath):
        safe_mkdir(jbrowse_project_dirpath)
    if isfile(file_fpath) and not isfile(sym_link):
        try:
            os.symlink(file_fpath, sym_link)
        except OSError:
            warn(traceback.format_exc())
    if isfile(sym_link):
        change_permissions(sym_link)
    return sym_link
def set_up_dirs(cnf, log_dir_name='log'):
    """ Creates output_dir, work_dir; sets up log
    """
    if cnf.output_dir:
        cnf.output_dir = adjust_path(cnf.output_dir)
        safe_mkdir(cnf.output_dir, 'output_dir')
        info('Saving into ' + cnf.output_dir)

    set_up_work_dir(cnf)

    if cnf.log_dir == '-':
        cnf.log_dir = None
    else:
        if not cnf.log_dir:
            cnf.log_dir = join(cnf.work_dir, log_dir_name)
        safe_mkdir(cnf.log_dir)
        info('Created log dir ' + cnf.log_dir)

    set_up_log(cnf)
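
# Hedged usage sketch (illustrative only): a typical call right after the Config
# object is built from parsed options, assuming `opts` comes from an OptionParser
# as in the examples above; the process name is hypothetical.
cnf = Config(opts.__dict__, determine_sys_cnf(opts), determine_run_cnf(opts))
cnf.proc_name = 'MyTool'  # hypothetical, mirroring the Seq2C example above
set_up_dirs(cnf)
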
Example #20
def run_targqc(cnf, bam_by_sample, bed_fpath, output_dirpath):
    info('Running TargQC for downsampled BAMs')

    targqc = get_script_cmdline(cnf, 'python', 'targqc.py', is_critical=True)
    targqc_work_dir = join(cnf.work_dir, 'TargQC')
    targqc_log_dir = join(cnf.log_dir, 'TargQC')
    safe_mkdir(targqc_work_dir)
    safe_mkdir(targqc_log_dir)
    bed_cmdl = ''
    if bed_fpath:
        bed_cmdl = '--bed ' + bed_fpath
    bam_cmdl = ' '.join(bam_fpath + ',' + sname
                        for sname, bam_fpath in bam_by_sample.items())
    cmdl = '{targqc} --sys-cnf {cnf.sys_cnf} {bam_cmdl} {bed_cmdl} ' \
           '--work-dir {targqc_work_dir} --log-dir {targqc_log_dir} --project-name {cnf.project_name} ' \
           '-o {output_dirpath} --genome {cnf.genome.name}'.format(**locals())
    if cnf.reuse_intermediate:
        cmdl += ' --reuse'
    call(cnf, cmdl)
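
# Hedged usage sketch (illustrative only): bam_by_sample maps sample names to BAM
# paths (all hypothetical); `cnf` is assumed to provide sys_cnf, work_dir, log_dir,
# project_name and genome.name, as referenced in the command line above.
bam_by_sample = {'sample1': '/path/to/sample1.bam',
                 'sample2': '/path/to/sample2.bam'}
run_targqc(cnf, bam_by_sample, '/path/to/target.bed', '/path/to/targqc_results')
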
Example #21
def merge_bcbio_yamls(cnf, bcbio_structures):
    today_date = datetime.datetime.now()
    today_bcbio_date = today_date.strftime("%Y-%m-%d")

    bcbio_cnfs = [bs.bcbio_cnf for bs in bcbio_structures]
    merged_yaml_fpath = join(cnf.output_dir, 'config', 'bcbio.yaml')
    # the merged YAML goes into <output_dir>/config, so make sure that directory exists
    safe_mkdir(dirname(merged_yaml_fpath))
    merged_bcbio_cnf = dict()
    merged_bcbio_cnf['fc_date'] = today_bcbio_date
    merged_bcbio_cnf['fc_name'] = 'bcbio'
    merged_bcbio_cnf['upload'] = bcbio_cnfs[0]['upload']
    merged_bcbio_cnf['details'] = []
    for bs_cnf in bcbio_cnfs:
        bs_cnf['fc_date'] = today_bcbio_date
        bs_cnf['fc_name'] = 'bcbio'
        merged_bcbio_cnf['details'].extend(bs_cnf['details'])
    with open(merged_yaml_fpath, 'w') as yaml_file:
        yaml_file.write(save_yaml(merged_bcbio_cnf))
    return merged_bcbio_cnf
Example #22
def combine_varqc(cnf, bcbio_structures, tag_by_sample, varqc_dirname,
                  varqc_name, caption):
    callers = []
    samples = []

    for bc in bcbio_structures:
        for vc in bc.variant_callers.values():
            if vc.name not in [c.name for c in callers]:
                callers.append(vc)

    jsons_by_sample_by_caller = defaultdict(dict)
    htmls_by_sample_by_caller = defaultdict(dict)
    for bc in bcbio_structures:
        for vc in bc.variant_callers.values():
            fpath_by_sample = vc.find_fpaths_by_sample(varqc_dirname,
                                                       varqc_name, 'json',
                                                       bc.final_dirpath)
            for sname, fpath in fpath_by_sample.items():
                jsons_by_sample_by_caller[vc.name][sname] = fpath
            fpath_by_sample = vc.find_fpaths_by_sample(varqc_dirname,
                                                       varqc_name, 'html',
                                                       bc.final_dirpath)
            for sname, fpath in fpath_by_sample.items():
                htmls_by_sample_by_caller[vc.name][sname] = fpath
            samples.extend(vc.samples)

    output_dir = join(cnf.output_dir, varqc_dirname)
    safe_mkdir(output_dir)

    if jsons_by_sample_by_caller and htmls_by_sample_by_caller:
        summarize_qc.make_summary_reports(cnf,
                                          1,
                                          output_dir,
                                          callers,
                                          samples,
                                          jsons_by_sample_by_caller,
                                          htmls_by_sample_by_caller,
                                          tag_by_sample,
                                          varqc_name=varqc_name,
                                          caption=caption)
    else:
        err('No JSON or HTML found, cannot generate summary reports.')
Example #23
def run_fastq(cnf,
              sample_name,
              l_r_fpath,
              r_r_fpath,
              output_dirpath,
              downsample_to=1e7):
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)

    l_fpath, r_fpath = l_r_fpath, r_r_fpath
    if downsample_to:
        info('Downsampling to ' + str(downsample_to))
        l_fpath, r_fpath = downsample(cnf,
                                      sample_name,
                                      l_r_fpath,
                                      r_r_fpath,
                                      downsample_to,
                                      output_dir=cnf.work_dir)

    # Joining fastq files to run on a combination
    fastqc_fpath = join(cnf.work_dir, sample_name + '.fq')
    info('Combining fastqs, writing to ' + fastqc_fpath)
    with open(fastqc_fpath, 'w') as out:
        out.write(open_gzipsafe(l_fpath).read())
        out.write(open_gzipsafe(r_fpath).read())

    # Running FastQC
    info('Running FastQC')
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + sample_name + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline = '{fastqc} --dir {tmp_dirpath} --extract -o {output_dirpath} -f fastq -j {java} {fastqc_fpath}'.format(
        **locals())
    call(cnf, cmdline)

    # Cleaning and getting report
    sample_fastqc_dirpath = join(output_dirpath, sample_name + '.fq_fastqc')
    if isfile(sample_fastqc_dirpath + '.zip'):
        os.remove(sample_fastqc_dirpath + '.zip')
    fastqc_html_fpath = join(sample_fastqc_dirpath, 'fastqc_report.html')
    verify_file(fastqc_html_fpath, is_critical=True)

    return sample_fastqc_dirpath
def set_up_work_dir(cnf):
    # timestamp = str(datetime.datetime.now())
    # user_prid = getpass.getuser()
    # hasher = hashlib.sha1( + timestamp)
    # path_hash = base64.urlsafe_b64encode(hasher.digest()[0:4])[:-1]

    if not cnf.work_dir:
        if cnf.output_dir:
            work_dir_name = 'work' + ('_' + cnf.sample if cnf.sample else '')
            cnf.work_dir = join(cnf.output_dir, work_dir_name)
            info('Work dir: ' + cnf.work_dir)
            # if not cnf.reuse_intermediate and isdir(cnf.work_dir):
            #     rmtree(cnf.work_dir)
        else:
            cnf.work_dir = tempfile.mkdtemp()
            info('Creating temporary directory for work dir: ' + cnf.work_dir)
    else:
        cnf.work_dir = adjust_path(cnf.work_dir)
        info('Work dir: ' + cnf.work_dir)

    safe_mkdir(cnf.work_dir, 'working directory')
def main():
    info(' '.join(sys.argv))
    info()
    description = 'This script converts Vardict TXT file to VCF.'

    parser = OptionParser(
        description=description,
        usage='Usage: ' + basename(__file__) +
        ' [-o Output_directory -c Var_caller_name] Project_directory')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('--log-dir', dest='log_dir', default='-')
    parser.add_option('-c', '--caller', dest='caller_name', default='vardict')
    parser.add_option('-o', dest='output_dir', help='Output directory.')

    cnf, bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths, tags, is_wgs_in_bcbio, is_rnaseq \
        = process_post_bcbio_args(parser)

    if not bcbio_project_dirpaths:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    bcbio_structures = []
    for bcbio_project_dirpath, bcbio_cnf, final_dirpath in zip(
            bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths):
        bs = BCBioStructure(cnf, bcbio_project_dirpath, bcbio_cnf,
                            final_dirpath)
        bcbio_structures.append(bs)

    cnf.work_dir = cnf.work_dir or adjust_path(join(cnf.output_dir, 'work'))
    safe_mkdir(cnf.work_dir)

    info('')
    info('*' * 70)
    for bs in bcbio_structures:
        for sample in bs.samples:
            if sample.phenotype != 'normal':
                convert_vardict_txts_to_bcbio_vcfs(cnf, bs, sample)
Example #26
def markdup_sam(cnf, in_sam_fpath, samblaster=None):
    """Perform non-stream based deduplication of SAM input files using samblaster.
    """
    if not samblaster:
        samblaster = get_system_path(cnf, 'samblaster')
        if not samblaster:
            warn('No samblaster, can\'t mark duplicates.')
            return None

    out_sam_fpath = add_suffix(in_sam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir,
                     splitext_plus(basename(in_sam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = '{samblaster} -i {in_sam_fpath} -o {out_sam_fpath}'.format(
        **locals())
    res = call(cnf,
               cmdline,
               output_fpath=out_sam_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)
    if res:
        return out_sam_fpath
    else:
        return None
Example #27
def generate_flagged_regions_report(cnf, output_dir, sample, ave_depth,
                                    gene_by_key):
    depth_threshs = cnf.coverage_reports.depth_thresholds
    report = PerRegionSampleReport(
        sample=sample,
        metric_storage=get_detailed_metric_storage(depth_threshs))
    report.add_record('Sample', sample.name)
    safe_mkdir(sample.flagged_regions_dirpath)
    ''' 1. Detect the depth threshold (average sample coverage * DEPTH_THRESH_FROM_AVE_COV)
        2. Select regions covered at less than MIN_DEPTH_PERCENT_AT_THRESH at that threshold
        3. Sort by % at threshold
        4. Select those parts of those regions where % = 0, save to BED
        5. Find HotSpots in those regions
        6. Intersect HotSpots with tracks

        For each gene that has regions with parts where % = 0:
            sort them by the part where % = 0
    '''
    #vcf_dbs = ['oncomine', 'dbsnp', 'cosmic']
    vcf_dbs = ['oncomine']

    from source._deprecated_clinical_reporting.clinical_parser import get_key_or_target_bed_genes
    key_genes, _ = get_key_or_target_bed_genes(
        cnf.bed, verify_file(adjust_system_path(cnf.key_genes), 'key genes'))
    depth_cutoff = get_depth_cutoff(ave_depth, depth_threshs)
    genes_sorted = sorted(gene_by_key.values())
    min_cov, max_cov = min_and_max_based_on_outliers(genes_sorted)

    for coverage_type in ['low', 'high']:
        info('Selecting and saving ' + coverage_type + ' covered genes')
        selected_genes = []

        if coverage_type == 'low':
            selected_genes = [
                g for g in genes_sorted if g.gene_name in key_genes and (any(
                    e.rates_within_threshs[depth_cutoff] <
                    MIN_DEPTH_PERCENT_AT_THRESH for e in g.get_exons()) or any(
                        a.rates_within_threshs[depth_cutoff] <
                        MIN_DEPTH_PERCENT_AT_THRESH
                        for a in g.get_amplicons()))
            ]
        else:
            if max_cov:
                selected_genes = [
                    g for g in genes_sorted
                    if g.gene_name in key_genes and (any(
                        e.avg_depth > max_cov for e in g.get_exons()) or any(
                            a.avg_depth > max_cov for a in g.get_amplicons()))
                ]
        for region_type in ['exons', 'target']:
            selected_regions = []
            for gene in selected_genes:
                if coverage_type == 'low':
                    cur_regions = [
                        a for a in (gene.get_amplicons() if region_type ==
                                    'target' else gene.get_exons())
                        if a.rates_within_threshs[depth_cutoff] <
                        MIN_DEPTH_PERCENT_AT_THRESH
                        and 'Multi' not in a.feature
                    ]
                else:
                    cur_regions = [
                        a for a in (gene.get_amplicons() if region_type ==
                                    'target' else gene.get_exons())
                        if a.avg_depth > max_cov and 'Multi' not in a.feature
                    ]
                selected_regions.extend(cur_regions)

            if selected_regions:
                selected_regions_bed_fpath = join(
                    sample.flagged_regions_dirpath,
                    coverage_type + '_cov_' + region_type + '.bed')
                save_regions_to_bed(cnf, selected_regions,
                                    selected_regions_bed_fpath)

                # Report cov for Hotspots
                for db in vcf_dbs:
                    res = _report_normalize_coverage_for_variant_sites(
                        cnf, sample, ave_depth, db, selected_regions_bed_fpath,
                        selected_regions, depth_cutoff, region_type,
                        coverage_type)
                    if not res:
                        return None

            report = make_flat_region_report(sample, selected_regions,
                                             depth_threshs)
            flagged_txt_fpath = add_suffix(
                add_suffix(sample.flagged_txt, region_type), coverage_type)
            flagged_tsv_fpath = add_suffix(
                add_suffix(sample.flagged_tsv, region_type), coverage_type)
            report.save_txt(flagged_txt_fpath)
            report.save_tsv(flagged_tsv_fpath)

            info('')
            info(coverage_type + ' covered ' + region_type + '(total ' +
                 str(len(selected_regions)) + ') for sample ' + sample.name +
                 ' saved into:')
            info('  ' + flagged_txt_fpath + ', ' + flagged_tsv_fpath)

    return report
Example #28
def __get_mapped_reads(cnf, samples, bam_by_sample, output_fpath):
    if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
        info(output_fpath + ' exists, reusing')
        return output_fpath, samples

    mapped_reads_by_sample = OrderedDict()

    job_by_sample = dict()
    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0
    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []
        for s in not_submitted_samples:
            with with_cnf(cnf, work_dir=join(cnf.work_dir, s.name)) as cnf:
                safe_mkdir(cnf.work_dir)
                # if verify_file(s.targetcov_json_fpath, silent=True):
                #     info('Parsing targetSeq output ' + s.targetcov_json_fpath)
                #     with open(s.targetcov_json_fpath) as f:
                #         data = load(f, object_pairs_hook=OrderedDict)
                #     cov_report = SampleReport.load(data, s)
                #     mapped_reads = next(rec.value for rec in cov_report.records if rec.metric.name == 'Mapped reads')
                #     info(s.name + ': ')
                #     info('  Mapped reads: ' + str(mapped_reads))
                #     mapped_reads_by_sample[s.name] = mapped_reads
                #     reused_samples.append(s)
                #     continue
                #
                # else:
                if s.name not in bam_by_sample:
                    err('No BAM for ' + s.name + ', not running Seq2C')
                    return None, None

                info('Submitting a sambamba job to get mapped read numbers')
                bam_fpath = bam_by_sample[s.name]
                j = number_of_mapped_reads(cnf, bam_fpath, dedup=True, use_grid=True, sample_name=s.name)
                job_by_sample[s.name] = j
                submitted_samples.append(s)
                if not j.is_done:
                    jobs_to_wait.append(j)
                if len(jobs_to_wait) >= cnf.threads:
                    not_submitted_samples = [_s for _s in not_submitted_samples if
                                             _s not in submitted_samples and
                                             _s not in reused_samples]

                    if not_submitted_samples:
                        info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting for them to finish before '
                             'submitting more ' + str(len(not_submitted_samples)))
                    else:
                        info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                    info()
                    break
                info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished waiting for ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_file(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if 'work_dir' in j.__dict__ and isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples if
                                 s not in submitted_samples and
                                 s not in reused_samples]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()

    # wait_for_jobs(cnf, job_by_sample.values())
    for s_name, j in job_by_sample.items():
        if j and j.is_done and not j.is_failed:
            with open(j.output_fpath) as f:
                mapped_reads = int(f.read().strip())
                info(s_name + ': ')
                info('  Mapped reads: ' + str(mapped_reads))
                mapped_reads_by_sample[s_name] = mapped_reads
        else:
            err('ERROR: ' + s_name + ' could not get mapped reads, log saved to ' + j.log_fpath)

    with open(output_fpath, 'w') as f:
        for sample_name, mapped_reads in mapped_reads_by_sample.items():
            f.write(sample_name + '\t' + str(mapped_reads) + '\n')

    verify_file(output_fpath, is_critical=True)
    successful_samples = [s for s in samples if s.name in mapped_reads_by_sample]
    info('Samples processed: ' + str(len(samples)) + ', successfully: ' + str(len(successful_samples)))
    return output_fpath, successful_samples
Example #29
def __seq2c_coverage(cnf, samples, bams_by_sample, bed_fpath, is_wgs, output_fpath):
    if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    jobs_by_sample = dict()
    depth_output_by_sample = dict()
    seq2cov_output_by_sample = dict()
    seq2c_work_dirpath = join(cnf.work_dir, source.seq2c_name)
    safe_mkdir(seq2c_work_dirpath)
    info()

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0
    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []

        for s in not_submitted_samples:
            info('*' * 50)
            info(s.name + ':')
            with with_cnf(cnf, work_dir=join(cnf.work_dir, s.name)) as cnf:
                safe_mkdir(cnf.work_dir)
                seq2cov_output_by_sample[s.name] = join(seq2c_work_dirpath, s.name + '.seq2cov.txt')

                if not cnf.reuse_intermediate and isfile(seq2cov_output_by_sample[s.name]):
                    os.remove(seq2cov_output_by_sample[s.name])

                if cnf.reuse_intermediate and verify_file(seq2cov_output_by_sample[s.name], silent=True):
                    info(seq2cov_output_by_sample[s.name] + ' exists, reusing')
                    reused_samples.append(s)
                    continue

                elif verify_file(s.targetcov_detailed_tsv, silent=True):
                    info('Using targetcov detailed output for Seq2C coverage.')
                    info(s.name + ': using targetseq output')
                    targetcov_details_to_seq2cov(cnf, s.targetcov_detailed_tsv, seq2cov_output_by_sample[s.name], s.name, is_wgs=is_wgs)
                    reused_samples.append(s)
                    continue

                else:
                    info(s.name + ': ' + s.targetcov_detailed_tsv + ' does not exist: submitting sambamba depth')
                    bam_fpath = bams_by_sample[s.name]
                    depth_output = join(seq2c_work_dirpath, s.name + '_depth' + '.txt')
                    depth_output_by_sample[s.name] = depth_output
                    if cnf.reuse_intermediate and verify_file(depth_output, silent=True):
                        info(depth_output + ' exists, reusing')
                        reused_samples.append(s)
                        continue
                    else:
                        j = sambamba_depth(cnf, bed_fpath, bam_fpath, depth_output, use_grid=True, sample_name=s.name)
                        jobs_by_sample[s.name] = j
                        submitted_samples.append(s)

                        if not j.is_done:
                            jobs_to_wait.append(j)

                        if len(jobs_to_wait) >= cnf.threads:
                            not_submitted_samples = [_s for _s in not_submitted_samples if
                                                     _s not in submitted_samples and
                                                     _s not in reused_samples]

                            if not_submitted_samples:
                                info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting for them to finish before '
                                     'submitting more ' + str(len(not_submitted_samples)))
                            else:
                                info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                            info()
                            break
                        info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished waiting for ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_file(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if 'work_dir' in j.__dict__ and isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples if
                                 s not in submitted_samples and
                                 s not in reused_samples]
        info()
        info('*' * 50)

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))

    # wait_for_jobs(cnf, jobs_by_sample.values())
    for s_name, seq2cov_output_fpath in seq2cov_output_by_sample.items():
        if not isfile(seq2cov_output_fpath):
            if verify_file(depth_output_by_sample[s_name], is_critical=True, description='depth_output_by_sample for ' + s_name):
                info(s_name + ': summarizing bedcoverage output ' + depth_output_by_sample[s_name])
                bed_col_num = count_bed_cols(bed_fpath)
                sambamba_depth_to_seq2cov(cnf, depth_output_by_sample[s_name], seq2cov_output_by_sample[s_name], s_name, bed_col_num)

            # script = get_script_cmdline(cnf, 'python', join('tools', 'bed_processing', 'find_ave_cov_for_regions.py'),
            #                             is_critical=True)
            # bedcov_hist_fpath = depth_output_by_sample[s_name]
            # cmdline = '{script} {bedcov_hist_fpath} {s_name} {bed_col_num}'.format(**locals())
            # j = submit_job(cnf, cmdline, s_name + '_bedcov_2_seq2cov', output_fpath=seq2cov_output_by_sample[s_name])
            # sum_jobs_by_sample[s_name] = j

    # sum_jobs_by_sample = dict()
    # info('* Submitting seq2cov output *')
    # for s_name, j in jobs_by_sample.items():
    #     if not verify_file(seq2cov_output_by_sample[s_name], silent=True):
    #         info(s_name + ': summarizing bedcoverage output ' + depth_output_by_sample[s_name])
    #
    #         script = get_script_cmdline(cnf, 'python', join('tools', 'bed_processing', 'find_ave_cov_for_regions.py'),
    #                                     is_critical=True)
    #         bedcov_hist_fpath = depth_output_by_sample[s_name]
    #         bed_col_num = count_bed_cols(seq2c_bed)
    #         cmdline = '{script} {bedcov_hist_fpath} {s_name} {bed_col_num}'.format(**locals())
    #         j = submit_job(cnf, cmdline, s_name + '_bedcov_2_seq2cov', output_fpath=seq2cov_output_by_sample[s_name])
    #         sum_jobs_by_sample[s_name] = j
    #
    # wait_for_jobs(cnf, sum_jobs_by_sample.values())

    info()
    info('Done')
    info('*' * 50)
    info()
    info('Combining seq2cov output')
    with open(output_fpath, 'w') as out:
        for i, s in enumerate(samples):
            verify_file(seq2cov_output_by_sample[s.name], description='seq2cov_output for ' + s.name, is_critical=True)
            with open(seq2cov_output_by_sample[s.name]) as inp:
                for l in inp:
                    out.write(l)

    verify_file(output_fpath, description='combined seq2cov output_fpath', is_critical=True)
    info('Saved combined seq2cov output to ' + output_fpath)
    info()
    return output_fpath
Example #30
def postprocess_vcf(cnf, work_dir, var_sample, caller_name, variants,
                    mutations, vcf2txt_res_fpath):
    if cnf is None:
        global glob_cnf
        cnf = glob_cnf

    info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
         ': writing filtered VCFs')

    filter_values = set(variants.values())

    # Saving .anno.filt.vcf.gz and .anno.filt.pass.vcf
    ungz, gz = None, None
    if var_sample.filt_vcf_fpath.endswith('.gz'):
        ungz = splitext(var_sample.filt_vcf_fpath)[0]
        gz = var_sample.filt_vcf_fpath
    else:
        ungz = var_sample.filt_vcf_fpath
        gz = var_sample.filt_vcf_fpath + '.gz'
    if not var_sample.filt_tsv_fpath:
        var_sample.filt_tsv_fpath = splitext(ungz)[0] + '.tsv'

    if cnf.reuse_intermediate \
            and verify_file(var_sample.filt_vcf_fpath, silent=True) \
            and verify_file(var_sample.pass_filt_vcf_fpath, silent=True) \
            and verify_file(var_sample.filt_tsv_fpath, silent=True):
        info(var_sample.filt_vcf_fpath + ' and ' +
             var_sample.pass_filt_vcf_fpath + ' exist; reusing.')

    else:
        safe_mkdir(dirname(var_sample.filt_vcf_fpath))
        safe_mkdir(dirname(var_sample.pass_filt_vcf_fpath))

        with open_gzipsafe(var_sample.anno_vcf_fpath) as vcf_f, \
             file_transaction(work_dir, ungz) as filt_tx, \
             file_transaction(work_dir, var_sample.pass_filt_vcf_fpath) as pass_tx:
            with open(filt_tx, 'w') as filt_f, open(pass_tx, 'w') as pass_f:
                info(var_sample.name +
                     ((', ' + caller_name) if caller_name else '') +
                     ': opened ' + var_sample.anno_vcf_fpath +
                     ', writing to ' + ungz + ' and ' +
                     var_sample.pass_filt_vcf_fpath)

                for l in vcf_f:
                    if l.startswith('#'):
                        if l.startswith('#CHROM'):
                            filt_f.write(
                                '##FILTER=<ID=vcf2txt,Description="Hard-filtered by vcf2txt.pl">\n'
                            )
                            filt_f.write(
                                '##FILTER=<ID=vardict2mut,Description="Hard-filtered by vardict2mut.pl">\n'
                            )
                            for filt_val in filter_values:
                                if filt_val != 'PASS':
                                    filt_f.write('##FILTER=<ID=' + filt_val +
                                                 ',Description="">\n')
                        filt_f.write(l)
                        pass_f.write(l)
                    else:
                        ts = l.split('\t')
                        chrom, pos, alt = ts[0], ts[1], ts[4]
                        if (chrom, pos, alt) in mutations:
                            ts[6] = 'PASS'
                            filt_f.write('\t'.join(ts))
                            pass_f.write('\t'.join(ts))
                        else:
                            if ts[6] in ['', '.', 'PASS']:
                                ts[6] = ''
                                filter_value = variants.get((chrom, pos, alt))
                                if filter_value is None:
                                    ts[6] += 'vcf2txt'
                                elif filter_value == 'TRUE':
                                    ts[6] += 'vardict2mut'
                                else:
                                    ts[6] += filter_value
                            filt_f.write('\t'.join(ts))

        info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
             ': saved filtered VCFs to ' + ungz + ' and ' +
             var_sample.pass_filt_vcf_fpath)

        if False:
            info()
            info(var_sample.name +
                 ((', ' + caller_name) if caller_name else '') +
                 ': writing filtered TSVs')
            # Converting to TSV - saving .anno.filt.tsv
            if 'tsv_fields' in cnf.annotation and cnf.tsv:
                tmp_tsv_fpath = make_tsv(cnf, ungz, var_sample.name)
                if not tmp_tsv_fpath:
                    err('TSV conversion didn\'t work')
                else:
                    if isfile(var_sample.filt_tsv_fpath):
                        os.remove(var_sample.filt_tsv_fpath)
                    shutil.copy(tmp_tsv_fpath, var_sample.filt_tsv_fpath)

                info(var_sample.name +
                     ((', ' + caller_name) if caller_name else '') +
                     ': saved filtered TSV to ' + var_sample.filt_tsv_fpath)

    info('Done postprocessing filtered VCF.')
    return ungz