Example #1
def markdup_bam(cnf, in_bam_fpath, bammarkduplicates=None):
    """Perform non-stream based deduplication of BAM input files using biobambam.
    """
    if not bammarkduplicates:
        bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
        if not bammarkduplicates:
            warn('No biobambam bammarkduplicates, can\'t mark duplicates.')
            return None

    out_bam_fpath = add_suffix(in_bam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir,
                     splitext_plus(basename(in_bam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = (
        '{bammarkduplicates} tmpfile={tmp_fpath} I={in_bam_fpath} O={out_bam_fpath}'
    ).format(**locals())
    res = call(cnf,
               cmdline,
               output_fpath=out_bam_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)
    if res:
        return out_bam_fpath
    else:
        return None
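
A minimal standalone sketch of the same biobambam call, for use outside this pipeline's cnf/call helpers (tool availability and all paths here are assumptions):

import os
import subprocess


def markdup_bam_standalone(in_bam, out_bam, tmp_prefix, tool='bammarkduplicates'):
    # Same key=value options as in the example above: tmpfile=, I=, O=.
    tmp_dir = os.path.dirname(tmp_prefix)
    if tmp_dir and not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)
    cmd = [tool, 'tmpfile=' + tmp_prefix, 'I=' + in_bam, 'O=' + out_bam]
    return out_bam if subprocess.call(cmd) == 0 else None

# Example (illustrative paths):
# markdup_bam_standalone('sample.sorted.bam', 'sample.markdup.bam', 'work/sample_markdup')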
Example #2
def join_vcf2txt_results(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath):
    info('WGS; running vcf2txt separately for each sample to save memory.')
    vcf2txt_outputs_by_vcf_fpath = OrderedDict()
    for vcf_fpath in vcf_fpath_by_sample.values():
        sample_output_fpath = add_suffix(vcf2txt_out_fpath,
                                         splitext(basename(vcf_fpath))[0])
        vcf2txt_outputs_by_vcf_fpath[vcf_fpath] = sample_output_fpath
        info()

    info('Joining vcf2txt outputs... (' +
         str(len(vcf2txt_outputs_by_vcf_fpath)) + ' out of ' +
         str(len(vcf_fpath_by_sample)) + ' successful), ' + 'writing to ' +
         vcf2txt_out_fpath)
    with file_transaction(cnf.work_dir, vcf2txt_out_fpath) as tx:
        with open(tx, 'w') as out:
            for i, (vcf_fpath, sample_output_fpath) in enumerate(
                    vcf2txt_outputs_by_vcf_fpath.items()):
                info('   Reading ' + sample_output_fpath)
                with open(sample_output_fpath) as inp:
                    for j, l in enumerate(inp):
                        if j == 0 and i != 0:
                            continue
                        out.write(l)
    if verify_file(vcf2txt_out_fpath):
        info('Saved ' + vcf2txt_out_fpath)
        return vcf2txt_out_fpath
    else:
        return None
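
The joining step above is plain TSV concatenation that keeps the header line only from the first file; a self-contained sketch of that pattern (file names are illustrative):

def concat_tsv_keep_first_header(input_fpaths, out_fpath):
    # Header comes from the first file only; data rows come from all files.
    with open(out_fpath, 'w') as out:
        for file_i, fpath in enumerate(input_fpaths):
            with open(fpath) as inp:
                for line_i, line in enumerate(inp):
                    if line_i == 0 and file_i != 0:
                        continue
                    out.write(line)
    return out_fpath

# e.g. concat_tsv_keep_first_header(['s1.vcf2txt.txt', 's2.vcf2txt.txt'], 'all.vcf2txt.txt')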
Example #3
def run_vcf2txt_vardict2mut_for_samples(cnf,
                                        var_samples,
                                        output_dirpath,
                                        vcf2txt_out_fpath,
                                        caller_name=None,
                                        threads_num=1):

    threads_num = min(len(var_samples), cnf.threads)
    info('Number of threads for filtering: ' + str(threads_num))

    safe_mkdir(output_dirpath)

    vcf_fpath_by_sample = {s.name: s.anno_vcf_fpath for s in var_samples}
    res = run_vcf2txt(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath)
    if not res:
        err('vcf2txt run returned non-0')
        return None

    # vardict2mut_py = get_script_cmdline(cnf, 'python', join('scripts', 'post', 'vardict2mut.py'))
    # if not vardict2mut_py:
    #     critical('vardict2mut_py not found')

    info('Running vardict2mut')
    res = run_vardict2mut(
        cnf, vcf2txt_out_fpath,
        add_suffix(vcf2txt_out_fpath, variant_filtering.mut_pass_suffix))
    if not res:
        critical('vardict2mut.py run returned non-0')
    mut_fpath = res
    mut_fpath = convert_gpfs_path_to_url(mut_fpath)
    info()

    info('Done filtering with vcf2txt/vardict2mut, saved to ' + str(mut_fpath))
    return mut_fpath
Example #4
def _mutations_records(general_section, bcbio_structure, base_dirpath):
    records = []

    caller = bcbio_structure.variant_callers.get('vardict') or \
             bcbio_structure.variant_callers.get('vardict-java')

    _base_mut_fname = variant_filtering.mut_fname_template.format(
        caller_name=caller.name)
    _base_mut_fpath = join(bcbio_structure.date_dirpath, _base_mut_fname)
    mut_fpath = add_suffix(_base_mut_fpath, variant_filtering.mut_pass_suffix)
    single_mut_fpath = add_suffix(
        add_suffix(_base_mut_fpath, variant_filtering.mut_single_suffix),
        variant_filtering.mut_pass_suffix)
    paired_mut_fpath = add_suffix(
        add_suffix(_base_mut_fpath, variant_filtering.mut_paired_suffix),
        variant_filtering.mut_pass_suffix)
    mut_fpath = verify_file(mut_fpath, silent=True)
    single_mut_fpath = verify_file(single_mut_fpath, silent=True)
    paired_mut_fpath = verify_file(paired_mut_fpath, silent=True)

    for fpath, metric_name in ((mut_fpath, MUTATIONS_NAME),
                               (single_mut_fpath, MUTATIONS_SINGLE_NAME),
                               (paired_mut_fpath, MUTATIONS_PAIRED_NAME)):
        if fpath:
            metric = Metric(metric_name, common=True)
            rec = Record(metric=metric,
                         value=basename(fpath),
                         url=relpath(fpath, base_dirpath))
            general_section.add_metric(metric)
            records.append(rec)

    if bcbio_structure.seq2c_fpath and isfile(bcbio_structure.seq2c_fpath):
        metric = Metric(CNV_NAME, common=True)
        fpath = bcbio_structure.seq2c_fpath
        rec = Record(metric=metric,
                     value=basename(fpath),
                     url=relpath(fpath, base_dirpath))
        general_section.add_metric(metric)
        records.append(rec)

    return records
Example #5
def count_mutations_freq(cnf,
                         samples,
                         vcf2txt_fpaths,
                         suffix=variant_filtering.mut_pass_suffix):
    count_in_cohort_by_vark = defaultdict(int)
    total_varks = 0
    total_duplicated_count = 0
    total_records_count = 0
    for sample_i, (sample,
                   vcf2txt_fpath) in enumerate(zip(samples, vcf2txt_fpaths)):
        met_in_this_sample = set()
        processed_fpath = add_suffix(vcf2txt_fpath, suffix)
        if not isfile(processed_fpath):
            critical(processed_fpath +
                     ' does not exist; please rerun VarFilter.')
        with open(processed_fpath) as f:
            for line_i, l in enumerate(f):
                if line_i > 0:
                    fs = l.replace('\n', '').split()
                    if not fs:
                        continue
                    chrom, pos, db_id, ref, alt = fs[1:6]
                    vark = ':'.join([chrom, pos, ref, alt])
                    if vark in met_in_this_sample:
                        if suffix == variant_filtering.mut_pass_suffix:
                            total_duplicated_count += 1
                    else:
                        count_in_cohort_by_vark[vark] += 1
                        if suffix == variant_filtering.mut_pass_suffix:
                            met_in_this_sample.add(vark)
                            total_varks += 1
                    total_records_count += 1

    if suffix == variant_filtering.mut_pass_suffix:
        info('Counted ' + str(len(count_in_cohort_by_vark)) +
             ' different variants ' + 'in ' + str(len(samples)) +
             ' samples with total ' + str(total_varks) + ' records')
        info('Duplicated varks within samples: ' +
             str(total_duplicated_count) + ' out of total ' +
             str(total_records_count) +
             ' records. Duplicates were not counted into cohort frequencies.')

    freq_in_cohort_by_vark = dict()
    max_freq = 0
    for vark, count in count_in_cohort_by_vark.items():
        f = float(count) / len(samples)
        freq_in_cohort_by_vark[vark] = f
        if f > max_freq:
            max_freq = f

    if suffix == variant_filtering.mut_pass_suffix:
        info('Maximum frequency in cohort is ' + str(max_freq))
    return freq_in_cohort_by_vark, count_in_cohort_by_vark
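
The cohort frequency computed above is simply the number of samples carrying a variant divided by the cohort size, counting each variant at most once per sample; a minimal self-contained sketch (the input structure is illustrative):

from collections import defaultdict


def cohort_freqs(varks_by_sample):
    # varks_by_sample: {sample_name: set of 'chrom:pos:ref:alt' keys}
    count_by_vark = defaultdict(int)
    for varks in varks_by_sample.values():
        for vark in varks:  # sets, so each variant is counted once per sample
            count_by_vark[vark] += 1
    n_samples = len(varks_by_sample)
    freq_by_vark = dict((vark, float(cnt) / n_samples)
                        for vark, cnt in count_by_vark.items())
    return freq_by_vark, count_by_vark

# e.g. cohort_freqs({'s1': {'chr1:100:A:T'}, 's2': {'chr1:100:A:T', 'chr2:5:G:C'}})
# -> chr1:100:A:T has frequency 1.0, chr2:5:G:C has frequency 0.5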
Example #6
def get_mutations_fpaths(bcbio_structure):
    caller = bcbio_structure.variant_callers.get('vardict') or \
             bcbio_structure.variant_callers.get('vardict-java')

    _base_mut_fname = variant_filtering.mut_fname_template.format(
        caller_name=caller.name)
    _base_mut_fpath = join(bcbio_structure.date_dirpath, _base_mut_fname)
    mut_fpath = add_suffix(_base_mut_fpath, variant_filtering.mut_pass_suffix)
    single_mut_fpath = add_suffix(
        add_suffix(_base_mut_fpath, variant_filtering.mut_single_suffix),
        variant_filtering.mut_pass_suffix)
    paired_mut_fpath = add_suffix(
        add_suffix(_base_mut_fpath, variant_filtering.mut_paired_suffix),
        variant_filtering.mut_pass_suffix)
    mut_fpath = verify_file(mut_fpath, silent=True)
    single_mut_fpath = verify_file(single_mut_fpath, silent=True)
    paired_mut_fpath = verify_file(paired_mut_fpath, silent=True)
    mutations_fpaths = [
        f for f in [mut_fpath, single_mut_fpath, paired_mut_fpath] if f
    ]
    return mutations_fpaths
Example #7
def evaluate_capture(cnf, bcbio_structures):
    samples = [s for bs in bcbio_structures for s in bs.samples]
    min_samples = math.ceil(cnf.min_ratio * len(samples))

    info('Filtering regions by depth')
    regions = check_regions_depth(cnf, bcbio_structures, min_samples)
    if not regions:
        err('No regions were filtered.')
        return None
    if cnf.bed or cnf.tricky_regions:
        regions = intersect_regions(cnf, bcbio_structures, regions,
                                    min_samples)

    regions_fname = 'filtered_regions.txt'
    regions_fpath = join(
        cnf.output_dir,
        add_suffix(regions_fname, str(cnf.min_depth))
        if cnf.min_depth else regions_fname)
    with open(regions_fpath, 'w') as out:
        out.write('## Minimal percent of region with low coverage: ' +
                  str((1 - cnf.min_percent) * 100) + '%\n')
        out.write(
            '## Minimal percent of samples that share the same feature: ' +
            str(cnf.min_ratio * 100) + '%\n')
        if not cnf.min_depth:
            out.write(
                '## Coverage threshold Nx is 10x for cell line and 100x for plasma\n'
            )
        else:
            out.write('## Coverage threshold Nx is ' + str(cnf.min_depth) +
                      'x\n')
        out.write('\t'.join([
            '#Chr', 'Start', 'End', 'Size', 'Gene', 'Depth<Nx',
            'SamplesSharingSameFeature', 'Annotation'
        ]) + '\n')
        for region in sorted(regions, key=lambda x: (x[0], int(x[1]))):
            out.write('\t'.join([str(val) for val in region]) + '\n')

    info()
    info(str(len(regions)) + ' regions were saved into ' + regions_fpath)
    bgzip_and_tabix(cnf, regions_fpath, tabix_parameters='-p bed')
    return regions_fpath
Example #8
def align(cnf,
          sample,
          l_fpath,
          r_fpath,
          sambamba,
          bwa,
          bammarkduplicates,
          bwa_prefix,
          is_pcr=False):
    sam_fpath = join(cnf.work_dir, sample.name + '_downsampled.sam')
    bam_fpath = splitext(sam_fpath)[0] + '.bam'
    sorted_bam_fpath = add_suffix(bam_fpath, 'sorted')

    bwa_cmdline = '{bwa} mem {bwa_prefix} {l_fpath} {r_fpath} '.format(
        **locals())
    res = call(cnf, bwa_cmdline, output_fpath=sam_fpath, exit_on_error=False)
    if not res:
        return None

    cmdline = '{sambamba} view -t {cnf.threads} -S -f bam {sam_fpath}'.format(
        **locals())
    call(cnf, cmdline, output_fpath=bam_fpath)

    prefix = splitext(sorted_bam_fpath)[0]
    cmdline = '{sambamba} sort -t {cnf.threads} {bam_fpath} -o {sorted_bam_fpath}'.format(
        **locals())
    call(cnf,
         cmdline,
         output_fpath=sorted_bam_fpath,
         stdout_to_outputfile=False)

    if not is_pcr:
        markdup_bam_fpath = markdup_bam(cnf, sorted_bam_fpath,
                                        bammarkduplicates)
        if markdup_bam_fpath:
            sorted_bam_fpath = markdup_bam_fpath

    index_bam(cnf, sorted_bam_fpath, sambamba=sambamba)
    return sorted_bam_fpath
Example #9
def sort_bed_by_alphabet(cnf,
                         input_bed_fpath,
                         output_bed_fpath=None,
                         chr_len_fpath=None):
    chr_lengths = get_chr_lengths(cnf, chr_len_fpath)
    chromosomes = set([c for (c, l) in chr_lengths])
    output_bed_fpath = adjust_path(
        output_bed_fpath) if output_bed_fpath else add_suffix(
            input_bed_fpath, 'sorted')

    regions = defaultdict(list)

    info('Sorting regions...')
    chunk_size = 10
    chunk_counter = 0
    with open(input_bed_fpath) as f:
        with file_transaction(cnf.work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    if chrom not in chromosomes:
                        continue
                    if chunk_counter == chunk_size or not regions[chrom]:
                        chunk_counter = 0
                        regions[chrom].append('')
                    regions[chrom][-1] += l
                    chunk_counter += 1
                for chr in sorted(regions.keys()):
                    for region in regions[chr]:
                        out.write(region)

    return output_bed_fpath
Example #10
def markdup_sam(cnf, in_sam_fpath, samblaster=None):
    """Perform non-stream based deduplication of SAM input files using samblaster.
    """
    if not samblaster:
        samblaster = get_system_path(cnf, 'samblaster')
        if not samblaster:
            warn('No samblaster, can\'t mark duplicates.')
            return None

    out_sam_fpath = add_suffix(in_sam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir,
                     splitext_plus(basename(in_sam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = '{samblaster} -i {in_sam_fpath} -o {out_sam_fpath}'.format(
        **locals())
    res = call(cnf,
               cmdline,
               output_fpath=out_sam_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)
    if res:
        return out_sam_fpath
    else:
        return None
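
samblaster is also commonly used in streaming mode, marking duplicates on the SAM stream coming straight from the aligner; a hedged sketch of that pattern via a shell pipeline (tool names on PATH and file paths are assumptions):

import subprocess


def bwa_samblaster_stream(bwa_prefix, fq1, fq2, out_sam, bwa='bwa', samblaster='samblaster'):
    # bwa mem writes SAM to stdout; samblaster reads it from stdin and writes marked SAM.
    cmd = '{bwa} mem {prefix} {fq1} {fq2} | {sb} > {out}'.format(
        bwa=bwa, prefix=bwa_prefix, fq1=fq1, fq2=fq2, sb=samblaster, out=out_sam)
    return subprocess.call(cmd, shell=True) == 0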
Example #11
def downsample(cnf,
               sample_name,
               fastq_L_fpath,
               fastq_R_fpath,
               N,
               output_dir,
               suffix=None,
               quick=False):
    """ get N random headers from a fastq file without reading the
    whole thing into memory
    modified from: http://www.biostars.org/p/6544/
    quick=True will just grab the first N reads rather than do a true
    downsampling
    """
    sample_name = sample_name or splitext(''.join(
        lc if lc == rc else ''
        for lc, rc in izip(fastq_L_fpath, fastq_R_fpath)))[0]

    l_out_fpath = join(output_dir,
                       add_suffix(basename(fastq_L_fpath), suffix or 'subset'))
    r_out_fpath = join(output_dir,
                       add_suffix(basename(fastq_R_fpath), suffix or 'subset'))
    if cnf.reuse_intermediate and verify_file(
            l_out_fpath, silent=True) and verify_file(r_out_fpath,
                                                      silent=True):
        info(l_out_fpath + ' and ' + r_out_fpath + ' exist, reusing.')
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    N = int(N)
    records_num = N
    if quick:
        rand_records = range(N)
    else:
        info(sample_name + ': getting number of reads in fastq...')
        records_num = sum(1 for _ in open_gzipsafe(fastq_L_fpath)) / 4
        if records_num > LIMIT:
            info(sample_name + ': the number of reads is higher than ' +
                 str(LIMIT) + ', sampling from only first ' + str(LIMIT))
            records_num = LIMIT
        info(sample_name + ': ' + str(records_num) + ' reads')
        if records_num < N:
            info(sample_name + ': and it is less than ' + str(N) +
                 ', so no downsampling.')
            return fastq_L_fpath, fastq_R_fpath
        else:
            info(sample_name + ': downsampling to ' + str(N))
            rand_records = sorted(random.sample(xrange(records_num), N))

    info('Opening ' + fastq_L_fpath)
    fh1 = open_gzipsafe(fastq_L_fpath)
    info('Opening ' + fastq_R_fpath)
    fh2 = open_gzipsafe(fastq_R_fpath) if fastq_R_fpath else None

    out_files = (l_out_fpath, r_out_fpath) if r_out_fpath else (l_out_fpath)

    written_records = 0
    with file_transaction(cnf.work_dir, out_files) as tx_out_files:
        if isinstance(tx_out_files, basestring):
            tx_out_f1 = tx_out_files
        else:
            tx_out_f1, tx_out_f2 = tx_out_files
        info('Opening ' + str(tx_out_f1) + ' to write')
        sub1 = open_gzipsafe(tx_out_f1, "w")
        info('Opening ' + str(tx_out_f2) + ' to write')
        sub2 = open_gzipsafe(tx_out_f2, "w") if r_out_fpath else None
        rec_no = -1
        for rr in rand_records:
            while rec_no < rr:
                rec_no += 1
                for i in range(4):
                    fh1.readline()
                if fh2:
                    for i in range(4):
                        fh2.readline()
            for i in range(4):
                sub1.write(fh1.readline())
                if sub2:
                    sub2.write(fh2.readline())
            written_records += 1
            rec_no += 1
            if written_records % 10000 == 0:
                info(sample_name + ': written ' + str(written_records) +
                     ', rec_no ' + str(rec_no))
            if rec_no > records_num:
                info(sample_name + ' reached the limit of ' + str(records_num) +
                     ' read lines, stopping.')
                break
        info(sample_name + ': done, written ' + str(written_records) +
             ', rec_no ' + str(rec_no))
        fh1.close()
        sub1.close()
        if fastq_R_fpath:
            fh2.close()
            sub2.close()

    info(sample_name + ': done downsampling, saved to ' + l_out_fpath +
         ' and ' + r_out_fpath + ', total ' + str(written_records) +
         ' paired reads written')
    return l_out_fpath, r_out_fpath
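
At its core the downsampling is: count the FASTQ records, draw N random record indices, then stream through the file copying only those 4-line records (the example above sorts the indices and skips ahead; this sketch uses a set membership test instead). A self-contained single-end sketch for an uncompressed FASTQ, illustrative only:

import random


def downsample_fastq(in_fpath, out_fpath, n):
    with open(in_fpath) as f:
        records_num = sum(1 for _ in f) // 4
    n = min(n, records_num)
    wanted = set(random.sample(range(records_num), n))
    with open(in_fpath) as f, open(out_fpath, 'w') as out:
        for rec_no in range(records_num):
            rec = [f.readline() for _ in range(4)]  # @header, seq, +, quals
            if rec_no in wanted:
                out.writelines(rec)
    return out_fpath

# e.g. downsample_fastq('sample_R1.fastq', 'sample_R1.subset.fastq', 100000)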
Example #12
def write_combined_results(cnf,
                           variants_fpath,
                           samples,
                           vcf2txt_fpaths,
                           freq_in_cohort_by_vark,
                           count_in_cohort_by_vark,
                           suffix=variant_filtering.mut_pass_suffix,
                           do_cohort_filtering=True):
    artefacts_samples = OrderedDefaultDict(list)
    artefacts_data = OrderedDict()

    variants_count = defaultdict(int)
    written_lines_count = 0
    status_col, reason_col, n_samples_col, n_var_col, pcnt_sample_col, ave_af_col, incidentalome_col \
        = None, None, None, None, None, None, None

    with file_transaction(cnf.work_dir, variants_fpath) as tx:
        with open(tx, 'w') as out:
            for sample_i, (sample, vcf2txt_fpath) in enumerate(
                    zip(samples, vcf2txt_fpaths)):
                mut_fpath = add_suffix(vcf2txt_fpath, suffix)
                with file_transaction(cnf.work_dir,
                                      mut_fpath) as fixed_mut_fpath_tx:
                    with open(mut_fpath) as f, open(fixed_mut_fpath_tx,
                                                    'w') as fixed_f_out:
                        for line_i, l in enumerate(f):
                            fs = l.replace('\n', '').split('\t')
                            if line_i == 0 and sample_i == 0:
                                out.write(l)
                            if line_i == 0:
                                fixed_f_out.write(l)
                                if status_col is not None and status_col != fs.index(
                                        'Significance'):
                                    critical(
                                        'Different format in ' + mut_fpath +
                                        ': status_col=' +
                                        str(fs.index('Significance')) +
                                        ', but the first sample was ' +
                                        str(status_col) +
                                        ', please rerun VarFilter from the beginning'
                                    )
                                status_col = fs.index('Significance')
                                reason_col = status_col + 1
                                n_samples_col = fs.index('N_samples')
                                n_var_col = fs.index('N_Var')
                                pcnt_sample_col = fs.index('Pcnt_sample')
                                ave_af_col = fs.index('Ave_AF')
                                if 'Incidentalome' in fs:
                                    incidentalome_col = fs.index(
                                        'Incidentalome')
                            if line_i > 0:
                                fs = l.replace('\n', '').split('\t')
                                chrom, pos, db_id, ref, alt = fs[1:6]
                                vark = ':'.join([chrom, pos, ref, alt])
                                assert len(fs) > reason_col, 'len(fs)=' + str(len(fs)) + ' > reason_col=' + str(reason_col) + \
                                                             ' in ' + sample.name + ', ' + vcf2txt_fpath + ' for line\n' + l

                                freq = freq_in_cohort_by_vark[vark]
                                cnt = count_in_cohort_by_vark[vark]
                                fs[n_samples_col] = str(len(samples))
                                fs[n_var_col] = str(cnt)
                                fs[pcnt_sample_col] = str(freq)
                                fs[ave_af_col] = ''
                                l = '\t'.join(fs) + '\n'

                                if do_cohort_filtering:
                                    if fs[status_col] in ['known', 'likely']:
                                        variants_count['not_filtered'] += 1
                                    elif freq >= cnf.variant_filtering.max_ratio and cnt > cnf.variant_filtering.max_sample_cnt:
                                        artefacts_samples[vark].append(
                                            sample.name)
                                        # if incidentalome_col:
                                        #     fs.remove(fs[incidentalome_col])
                                        artefacts_data[vark] = fs
                                        continue
                                variants_count['good_freq'] += 1
                                fixed_f_out.write(l)
                                out.write(l)
                                written_lines_count += 1
    return artefacts_samples, artefacts_data, variants_count, written_lines_count
Example #13
def convert_vardict_txts_to_bcbio_vcfs(cnf,
                                       bs,
                                       sample,
                                       output_dir=None,
                                       pass_only=False):
    info('')
    info('Preparing data for ' + sample.name)
    anno_filt_vcf_fpath = sample.find_filt_vcf_by_callername(cnf.caller_name)
    if not anno_filt_vcf_fpath:
        return None, None

    if not output_dir:
        output_dir = cnf.output_dir or os.path.dirname(anno_filt_vcf_fpath)
    output_vcf_fpath = join(
        output_dir, sample.name + '-' + cnf.caller_name + filt_vcf_ending)
    pass_output_vcf_fpath = add_suffix(output_vcf_fpath, 'pass')
    if cnf.reuse_intermediate and verify_vcf(
            output_vcf_fpath + '.gz') and verify_vcf(pass_output_vcf_fpath +
                                                     '.gz'):
        info(output_vcf_fpath + '.gz and ' + pass_output_vcf_fpath +
             '.gz exist, reusing')
        return output_vcf_fpath + '.gz', pass_output_vcf_fpath + '.gz'

    info('Parsing PASS and REJECT mutations...')
    pass_mut_dict, reject_mut_dict, filter_values = get_mutation_dicts(
        cnf, bs, sample, pass_only=pass_only)
    sorted_mut_dict = combine_mutations(pass_mut_dict, reject_mut_dict)

    info('')
    info('Writing VCFs')
    vcf_reader = vcf.Reader(open_gzipsafe(anno_filt_vcf_fpath, 'r'))
    vcf_reader = add_keys_to_header(vcf_reader, filter_values)
    with file_transaction(cnf.work_dir, output_vcf_fpath) as filt_tx, \
        file_transaction(cnf.work_dir, pass_output_vcf_fpath) as pass_tx:
        vcf_writer = None
        if not pass_only:
            vcf_writer = vcf.Writer(open(filt_tx, 'w'), template=vcf_reader)
        vcf_pass_writer = vcf.Writer(open(pass_tx, 'w'), template=vcf_reader)
        for key, mut in sorted_mut_dict.items():
            record = get_record_from_vcf(vcf_reader, mut)
            if record:
                if key in pass_mut_dict:
                    record.FILTER = ['PASS']
                    if mut.reason:
                        record.INFO['Reason'] = mut.reason.replace(' ', '_')
                elif pass_only:
                    continue
                elif key in reject_mut_dict:
                    if not mut.reason:
                        continue
                    reject_reason_ids = [
                        filter_descriptions_dict[reason]
                        if reason in filter_descriptions_dict else reason
                        for reason in mut.reason.split(' and ')
                    ]
                    record.FILTER = [';'.join(reject_reason_ids)]
                if mut.signif:
                    record.INFO['Signif'] = mut.signif
                if mut.status:
                    record.INFO['Status'] = mut.status
                if vcf_writer:
                    vcf_writer.write_record(record)
                if key in pass_mut_dict:
                    vcf_pass_writer.write_record(record)
            else:
                warn('No record was found in ' + anno_filt_vcf_fpath +
                     ' for mutation ' + str(mut))

    output_gzipped_vcf_fpath = None
    if vcf_writer:
        vcf_writer.close()
        output_gzipped_vcf_fpath = bgzip_and_tabix(cnf, output_vcf_fpath)
        info('VCF file for vardict.txt is saved to ' +
             output_gzipped_vcf_fpath)
    vcf_pass_writer.close()
    output_gzipped_pass_vcf_fpath = bgzip_and_tabix(cnf, pass_output_vcf_fpath)
    info('VCF file for vardict.PASS.txt is saved to ' +
         output_gzipped_pass_vcf_fpath)
    return output_gzipped_vcf_fpath, output_gzipped_pass_vcf_fpath
Example #14
def combine_results(cnf,
                    samples,
                    vcf2txt_fpaths,
                    variants_fpath,
                    pass_variants_fpath=None,
                    reject_variants_fpath=None):
    info('Combining vcf2txt variants')
    not_existing_snames = []
    if cnf.reuse_intermediate and isfile(variants_fpath) and verify_file(
            variants_fpath):
        info('Combined filtered results ' + variants_fpath +
             ' exist, reusing.')
    else:
        for sample_i, (sample,
                       vcf2txt_fpath) in enumerate(zip(samples,
                                                       vcf2txt_fpaths)):
            if not verify_file(vcf2txt_fpath, description='variants file'):
                not_existing_snames.append(sample.name)
        if not_existing_snames:
            critical(
                'Variants files for some samples were not found: '
                + ', '.join(not_existing_snames))
        with file_transaction(cnf.work_dir, variants_fpath) as tx:
            with open(tx, 'w') as out:
                for sample_i, (sample, vcf2txt_fpath) in enumerate(
                        zip(samples, vcf2txt_fpaths)):
                    with open(vcf2txt_fpath) as f:
                        for line_i, l in enumerate(f):
                            if line_i == 0 and sample_i == 0:
                                out.write(l)
                            if line_i > 0:
                                out.write(l)
        verify_file(variants_fpath,
                    is_critical=True,
                    description='combined mutation calls')
        info('Saved vcf2txt variants to ' + variants_fpath)

    info()
    info('Combining PASSed mutations')
    pass_variants_fpath = pass_variants_fpath or add_suffix(
        variants_fpath, variant_filtering.mut_pass_suffix)
    reject_variants_fpath = reject_variants_fpath or add_suffix(
        variants_fpath, variant_filtering.mut_reject_suffix)
    not_existing_pass_snames = []
    if cnf.reuse_intermediate and isfile(pass_variants_fpath) and verify_file(pass_variants_fpath)\
            and isfile(reject_variants_fpath) and verify_file(reject_variants_fpath):
        info('Combined PASSed filtered results ' + pass_variants_fpath +
             ' exist, reusing.')
    else:
        for sample_i, (sample,
                       vcf2txt_fpath) in enumerate(zip(samples,
                                                       vcf2txt_fpaths)):
            if not verify_file(add_suffix(vcf2txt_fpath,
                                          variant_filtering.mut_pass_suffix),
                               description='PASS variants file'):
                not_existing_pass_snames.append(sample.name)
        if not_existing_pass_snames:
            critical(
                'PASS variants files for some samples were not found: '
                + ', '.join(not_existing_pass_snames))
        info('*' * 70)
        if cnf.variant_filtering.max_ratio < 1.0:
            info('Max ratio set to ' + str(cnf.variant_filtering.max_ratio))
        else:
            info('Max ratio set to ' + str(cnf.variant_filtering.max_ratio) +
                 ', i.e. no filter')

        info('Calculating frequencies of variants in the cohort')
        info('*' * 70)
        freq_in_cohort_by_vark, count_in_cohort_by_vark = count_mutations_freq(
            cnf, samples, vcf2txt_fpaths)
        reject_freq_in_cohort_by_vark, reject_count_in_cohort_by_vark = count_mutations_freq(
            cnf,
            samples,
            vcf2txt_fpaths,
            suffix=variant_filtering.mut_reject_suffix)
        info()

        if cnf.variant_filtering.max_ratio < 1.0:
            info('Saving mutations passing the cohort freq < ' +
                 str(cnf.variant_filtering.max_ratio) + ' threshold to ' +
                 pass_variants_fpath)

        artefacts_samples, artefacts_data, variants_count, written_lines_count = write_combined_results(
            cnf,
            pass_variants_fpath,
            samples,
            vcf2txt_fpaths,
            freq_in_cohort_by_vark,
            count_in_cohort_by_vark,
            suffix=variant_filtering.mut_pass_suffix,
            do_cohort_filtering=True)

        _, _, _, reject_written_lines_count = write_combined_results(
            cnf,
            reject_variants_fpath,
            samples,
            vcf2txt_fpaths,
            reject_freq_in_cohort_by_vark,
            reject_count_in_cohort_by_vark,
            suffix=variant_filtering.mut_reject_suffix,
            do_cohort_filtering=False)

        if len(artefacts_samples.keys()) > 0:
            reason = 'cohort freq > ' + str(cnf.variant_filtering.max_ratio)
            with open(reject_variants_fpath) as f:
                line = f.readline().split()
                reason_col = line.index('Reason') if 'Reason' in line else None
            with open(reject_variants_fpath, 'a') as f:
                for vark, samples in artefacts_samples.items():
                    fs = artefacts_data[vark]
                    if reason_col:
                        fs[reason_col] = reason
                    else:
                        fs.append(reason)
                    f.write('\t'.join(fs) + '\n')

            info('Skipped artefacts with cohort freq > ' +
                 str(cnf.variant_filtering.max_ratio) +
                 ' and sample count > ' +
                 str(cnf.variant_filtering.max_sample_cnt) + ': ' +
                 str(len(artefacts_samples.keys())))
            info('Added artefacts into ' + reject_variants_fpath)

        info('All variants not under filtering: ' +
             str(variants_count['not_filtered']))
        if len(artefacts_samples.keys()) > 0:
            info('Variants kept after the cohort frequency filter: ' +
                 str(variants_count['good_freq']))

        verify_file(pass_variants_fpath,
                    'PASS variants file',
                    is_critical=True)
        info('Written ' + str(written_lines_count) + ' records to ' +
             pass_variants_fpath)
        info('Written ' +
             str(reject_written_lines_count + len(artefacts_samples.keys())) +
             ' rejected records to ' + reject_variants_fpath)

        variants_fpath = verify_file(variants_fpath, is_critical=True)
        pass_variants_fpath = verify_file(pass_variants_fpath,
                                          is_critical=True)

        if not_existing_snames or not_existing_pass_snames:
            return None, None

    return variants_fpath, pass_variants_fpath
Example #15
def main(args):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--vcf', '--var'], dict(
                dest='vcf',
                help='variants to filter')
             ),
            (['--vcf2txt'], dict(
                dest='vcf2txt',
                help='variants in vcf2txt to filter')
             ),
            (['--cohort-freqs'], dict(
                dest='cohort_freqs_fpath',
                help='frequencies of variants in a cohort')
             ),
            (['--qc'], dict(
                dest='qc',
                action='store_true',
                default=True,
                help=SUPPRESS_HELP)
             ),
            (['--no-qc'], dict(
                dest='qc',
                action='store_false',
                help=SUPPRESS_HELP)
             ),
            (['--no-tsv'], dict(
                dest='tsv',
                action='store_false',
                default=True,
                help=SUPPRESS_HELP)
             ),
        ],
        required_keys=['vcf'],
        file_keys=['vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varfilter_name + '_post')

    check_system_resources(cnf, required=['perl'])
    check_genome_resources(cnf)

    if not cnf.output_file:
        cnf.output_file = join(cnf.output_dir, (cnf.caller or 'variants') + '.txt')

    safe_mkdir(dirname(cnf.output_file))
    safe_mkdir(cnf.output_dir)

    if cnf.vcf.endswith('.vcf.gz') or cnf.vcf.endswith('.vcf'):
        verify_vcf(cnf.vcf, is_critical=True)

    if not cnf.vcf2txt:
        vcf2txt_res_fpath = run_vcf2txt(cnf, {cnf.sample: cnf.vcf}, cnf.output_file)
        if not vcf2txt_res_fpath:
            critical('vcf2txt run returned non-0')
        info('Saved vcf2txt output to ' + vcf2txt_res_fpath)
    else:
        cnf.vcf2txt = verify_file(cnf.vcf2txt, is_critical=True)
        info('Input is vcf2txt output, grepping by sample name ' + cnf.sample)
        vcf2txt_res_fpath = cnf.output_file
        with file_transaction(cnf.work_dir, vcf2txt_res_fpath) as tx:
            with open(cnf.vcf2txt) as f, open(tx, 'w') as out:
                for i, l in enumerate(f):
                    if l.strip():
                        if i == 0:
                            out.write(l)
                        else:
                            if l.split('\t')[0] == cnf.sample:
                                out.write(l)
        info('Using vcf2txt from ' + vcf2txt_res_fpath)

    # if is_local():
    #     vardict2mut_pl = get_script_cmdline(cnf, 'perl', join('VarDict', 'vardict2mut.pl'))
    #     info('Running vardict2mut perl')
    #     res = run_vardict2mut(cnf, vcf2txt_res_fpath,
    #         add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix + '_perl'),
    #         vardict2mut_executable=vardict2mut_pl)
    #     if not res:
    #         critical('vardict2mut.pl run returned non-0')

    mut_fpath = run_vardict2mut(cnf, vcf2txt_res_fpath, add_suffix(vcf2txt_res_fpath, variant_filtering.mut_pass_suffix))
    if not mut_fpath:
        err('vardict2mut failed')
    else:
        info('Saved passed mutations to ' + mut_fpath)

        var_s = source.VarSample(cnf.sample, cnf.output_dir)
        var_s.anno_vcf_fpath = cnf.vcf
        var_s.varfilter_dirpath = var_s.dirpath

        ungz_anno_vcf_fpath = var_s.anno_vcf_fpath if not var_s.anno_vcf_fpath.endswith('.gz') else splitext(var_s.anno_vcf_fpath)[0]
        ungz_filt_vcf_fpath = join(cnf.output_dir, add_suffix(basename(ungz_anno_vcf_fpath), 'filt'))
        var_s.filt_vcf_fpath = ungz_filt_vcf_fpath + '.gz'

        var_s.variants_fpath = vcf2txt_res_fpath
        var_s.variants_pass_fpath = add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix)

        ungz_pass_filt_vcf_fpath = add_suffix(ungz_filt_vcf_fpath, 'pass')
        var_s.pass_filt_vcf_fpath = add_suffix(var_s.filt_vcf_fpath, 'pass')

        filt_vcf = write_vcf(cnf, var_s, cnf.output_dir, cnf.caller, vcf2txt_res_fpath, mut_fpath)
        index_vcf(cnf, var_s.name, filt_vcf, cnf.caller)
        index_vcf(cnf, var_s.name, ungz_pass_filt_vcf_fpath, cnf.caller)

        if cnf.qc:
            report = qc.make_report(cnf, var_s.pass_filt_vcf_fpath, var_s)
            qc_dirpath = join(cnf.output_dir, 'qc')
            safe_mkdir(qc_dirpath)
            qc.save_report(cnf, report, var_s, cnf.caller, qc_dirpath, source.varqc_after_name)
            info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
            info('-' * 70)
            info()

        if not cnf['keep_intermediate']:
            shutil.rmtree(cnf['work_dir'])

        info()
        info('*' * 70)
        info('Done filtering ' + var_s.name)
Example #16
def run_vardict2mut(cnf,
                    vcf2txt_res_fpath,
                    vardict2mut_res_fpath=None,
                    vardict2mut_executable=None):
    cmdline = None
    if vardict2mut_res_fpath is None:
        vardict2mut_res_fpath = add_suffix(vcf2txt_res_fpath,
                                           variant_filtering.mut_pass_suffix)
    vardict2mut_reject_fpath = add_suffix(vcf2txt_res_fpath,
                                          variant_filtering.mut_reject_suffix)

    check_filtering_results(vardict2mut_res_fpath)

    if not vardict2mut_executable:
        # vardict2mut_executable = get_script_cmdline(cnf, 'python', join('scripts', 'post', 'vardict2mut.py'))
        vardict2mut_executable = 'vardict2mut'

    c = cnf.variant_filtering

    cmdline = '{vardict2mut_executable} {vcf2txt_res_fpath} '
    if vardict2mut_executable.endswith('.pl'):
        cmdline += ' --report_reason '
        if c.min_hotspot_freq is not None and c.min_hotspot_freq != 'default':
            cmdline += ' -F ' + str(c.min_hotspot_freq)
        if c.max_ratio_vardict2mut is not None:
            cmdline += ' -R ' + str(c.max_ratio_vardict2mut)
        if cnf.genome.filter_common_snp:
            cmdline += ' --filter_common_snp {cnf.genome.filter_common_snp} '
        if cnf.genome.filter_common_artifacts:
            cmdline += ' --filter_common_artifacts {cnf.genome.filter_common_artifacts} '
        if cnf.genome.actionable:
            cmdline += ' --actionable {cnf.genome.actionable} '
        if cnf.genome.compendia_ms7_hotspot:
            cmdline += ' --compendia_ms7_hotspot {cnf.genome.compendia_ms7_hotspot} '
        if cnf.snpeffect_export_polymorphic:
            cmdline += ' --snpeffect_export_polymorphic {cnf.snpeffect_export_polymorphic} '
        if cnf.actionable_hotspot:
            cmdline += ' --actionable_hotspot {cnf.actionable_hotspot} '
        if cnf.ruledir: cmdline += ' --ruledir {cnf.ruledir} '
        cmdline = cmdline.format(**locals())
        res = call(cnf, cmdline, vardict2mut_res_fpath, exit_on_error=False)

    else:
        filt_yaml_fpath = join(cnf.work_dir, 'filt_cnf.yaml')
        info('Writing filtering yaml into ' + filt_yaml_fpath)
        with file_transaction(cnf.work_dir, filt_yaml_fpath) as tx, open(
                tx, 'w') as out:
            with open(cnf.run_cnf) as run_cnf:
                lines = []
                met_variant_filtering = False
                for l in run_cnf:
                    if l.startswith('variant_filtering:'):
                        met_variant_filtering = True
                        continue
                    if met_variant_filtering:
                        if l.startswith(' '):
                            out.write(l.lstrip())
                        else:
                            break

        cmdline += ' --filt-cnf ' + filt_yaml_fpath
        cmdline += ' --work-dir ' + cnf.work_dir
        cmdline += (' --debug ' if cnf.debug else '')
        cmdline += ' --genome ' + cnf.genome.name
        cmdline += ' -o ' + vardict2mut_res_fpath
        cmdline += ' --o-reject ' + vardict2mut_reject_fpath

        if cnf.cohort_freqs_fpath:
            cmdline += ' --cohort-freqs ' + cnf.cohort_freqs_fpath

        cmdline = cmdline.format(**locals())
        res = call(cnf,
                   cmdline,
                   output_fpath=vardict2mut_res_fpath,
                   stdout_to_outputfile=False)

    if not res:
        return None
    else:
        return res
Example #17
def _filter(cnf, samples, variants_fpath, variants_fname):
    # if cohort_mode:
    #     info('Running vcf2txt.pl in cohort mode')
    #     vcf2txt = get_script_cmdline(cnf, 'perl', 'vcf2txt', is_critical=True)
    #     vcf_fpath_by_sample = {s.name: s.anno_vcf_fpath for s in samples}
    #     cmdline = vcf2txt + ' ' + make_vcf2txt_cmdl_params(cnf, vcf_fpath_by_sample)
    #     res = run_vcf2txt_with_retries(cnf, cmdline, variants_fpath)
    #     if not res:
    #         critical('Error: vcf2txt.pl crashed')

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    cohort_freqs_fpath = None
    # if cnf.variant_filtering.max_ratio_vardict2mut < 1.0:
    #     cohort_freqs_fpath = join(cnf.work_dir, 'cohort_freqs.tsv')
    #     info('*' * 70)
    #     info('Max ratio set to ' + str(cnf.variant_filtering.max_ratio_vardict2mut) + ', counting freqs in cohort')
    #     # cnf.variant_filtering.max_ratio < 1.0 or \
    #     # cnf.fraction < 1.0
    #     cohort_freqs_fpath = count_cohort_freqs(cnf, samples, cohort_freqs_fpath, max_ratio=cnf.variant_filtering.max_ratio_vardict2mut)
    #     info('*' * 70)
    # info()

    not_submitted_samples = samples
    while not_submitted_samples:
        reused_samples = []
        jobs_to_wait = []
        submitted_samples = []
        for sample in not_submitted_samples:
            output_dirpath = sample.varfilter_dirpath = join(
                sample.dirpath, source.varfilter_name)
            output_fpath = sample.variants_fpath = join(
                sample.varfilter_dirpath, variants_fname)
            pass_output_fpath = add_suffix(sample.variants_fpath,
                                           variant_filtering.mut_pass_suffix)

            if cnf.reuse_intermediate and check_filtering_results(output_fpath) \
                    and check_filtering_results(pass_output_fpath):
                info('Filtered results ' + output_fpath + ' and ' +
                     pass_output_fpath + ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            varfilter_py = 'varfilter'
            work_dir = join(cnf.work_dir, 'filt_' + sample.name)
            if not cnf.genome.dbsnp_multi_mafs:
                critical(
                    'Error: dbsnp_multi_mafs is not specified in the config ' +
                    cnf.sys_cnf)
            cmdl = (
                '{varfilter_py}' +
                ((' --sys-cnf ' + cnf.sys_cnf) if not cnf.filt_cnf else '') +
                ((' --run-cnf ' + cnf.run_cnf) if not cnf.filt_cnf else '') +
                ((' --filt-cnf ' + cnf.filt_cnf) if cnf.filt_cnf else '') +
                ' --vcf {sample.anno_vcf_fpath}' + ' --sample {sample.name}' +
                ' -o {output_dirpath}' +
                ' --output-file {sample.variants_fpath}' + ' --project-name ' +
                cnf.project_name + ' --genome {cnf.genome.name}' +
                ' --work-dir {work_dir}' + ' --debug ' +
                (' --cohort-freqs {cohort_freqs_fpath}' if cohort_freqs_fpath
                 else '') + (' --reuse ' if cnf.reuse_intermediate else '') +
                ((' --caller ' + cnf.caller) if cnf.caller else '') +
                (' --qc' if cnf.qc else ' --no-qc') +
                (' --no-tsv' if not cnf.tsv else '') + ' --dbsnp-multi-mafs ' +
                adjust_path(cnf.genome.dbsnp_multi_mafs)).format(**locals())
            with with_cnf(cnf, reuse_intermediate=False):
                j = submit_job(cnf,
                               cmdl,
                               job_name='_filt_' + sample.name,
                               output_fpath=pass_output_fpath,
                               stdout_to_outputfile=False,
                               work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)
            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [
                    s for s in not_submitted_samples
                    if s not in submitted_samples and s not in reused_samples
                ]
                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) +
                         ' jobs, waiting for them to finish before '
                         'submitting more ' + str(len(not_submitted_samples)))
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break
            info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No filtering jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished filtering ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_file(
                    j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed and not cnf.debug:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but ' + j.work_dir + ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [
            s for s in not_submitted_samples
            if s not in submitted_samples and s not in reused_samples
        ]
    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()

    info('Combining results...')
    vcf2txt_fpaths = [s.variants_fpath for s in samples]
    variants_fpath, pass_variants_fpath = combine_results(
        cnf, samples, vcf2txt_fpaths, variants_fpath)

    if cnf.qc:
        _summarize_varqc(cnf,
                         cnf.output_dir,
                         samples,
                         cnf.project_name,
                         post_filter=True)

    return variants_fpath, pass_variants_fpath
Example #18
def _annotate(cnf, samples):
    varannotate_cmdl = (get_script_cmdline(
        cnf, 'python', join('scripts', 'post', 'varannotate.py')) +
                        ' --sys-cnf ' + cnf.sys_cnf + ' --run-cnf ' +
                        cnf.run_cnf + ' --project-name ' + cnf.project_name +
                        (' --reuse ' if cnf.reuse_intermediate else '') +
                        ' --log-dir -' + ' --genome ' + cnf.genome.name +
                        (' --no-check ' if cnf.no_check else '') +
                        (' --qc' if cnf.qc else ' --no-qc') +
                        ((' --caller ' + cnf.caller) if cnf.caller else ''))

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []
        for sample in not_submitted_samples:
            if not sample.varannotate_dirpath:
                sample.varannotate_dirpath = join(sample.dirpath,
                                                  source.varannotate_name)
            if not sample.anno_vcf_fpath:
                sample.anno_vcf_fpath = join(
                    sample.varannotate_dirpath,
                    add_suffix(basename(sample.vcf), 'anno'))
            output_fpath = sample.anno_vcf_fpath
            if not output_fpath.endswith('.gz'):
                output_fpath += '.gz'
            debug('Checking ' + output_fpath)
            if cnf.reuse_intermediate and isfile(output_fpath) and verify_vcf(
                    output_fpath):
                info('Annotated results ' + output_fpath + ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            work_dir = join(cnf.work_dir,
                            source.varannotate_name + '_' + sample.name)
            j = submit_job(
                cnf,
                cmdline=varannotate_cmdl + ' --vcf ' + sample.vcf + ' -o ' +
                sample.varannotate_dirpath + ' -s ' + sample.name +
                ' --work-dir ' + work_dir + ' --output-file ' + output_fpath,
                job_name='VA_' + cnf.project_name + '_' + sample.name,
                output_fpath=output_fpath,
                stdout_to_outputfile=False,
                work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)
            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [
                    s for s in not_submitted_samples
                    if s not in submitted_samples and s not in reused_samples
                ]

                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) +
                         ' jobs, waiting for them to finish before '
                         'submitting more ' + str(len(not_submitted_samples)))
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break
            info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No annotation jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished annotating ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_vcf(
                    j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but j.work_dir ' + j.work_dir +
                        ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [
            s for s in not_submitted_samples
            if s not in submitted_samples and s not in reused_samples
        ]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()
Example #19
def __intermediate_fname(work_dir, fname, suf):
    output_fname = add_suffix(fname, suf)
    return join(work_dir, basename(output_fname))
Example #20
def generate_flagged_regions_report(cnf, output_dir, sample, ave_depth,
                                    gene_by_key):
    depth_threshs = cnf.coverage_reports.depth_thresholds
    report = PerRegionSampleReport(
        sample=sample,
        metric_storage=get_detailed_metric_storage(depth_threshs))
    report.add_record('Sample', sample.name)
    safe_mkdir(sample.flagged_regions_dirpath)
    ''' 1. Detect depth threshold (ave sample coverage * DEPTH_THRESH_FROM_AVE_COV)
        2. Select regions covered in less than MIN_DEPTH_PERCENT_AT_THRESH at threshold
        3. Sort by % at threshold
        4. Select those parts of those regions where % = 0, save to BED
        5. Find HotSpots at those regions
        6. Intersect HotSpots with tracks

        For each gene where there are regions with parts % = 0:
            sort them by part where % = 0
    '''
    #vcf_dbs = ['oncomine', 'dbsnp', 'cosmic']
    vcf_dbs = ['oncomine']

    from source._deprecated_clinical_reporting.clinical_parser import get_key_or_target_bed_genes
    key_genes, _ = get_key_or_target_bed_genes(
        cnf.bed, verify_file(adjust_system_path(cnf.key_genes), 'key genes'))
    depth_cutoff = get_depth_cutoff(ave_depth, depth_threshs)
    genes_sorted = sorted(gene_by_key.values())
    min_cov, max_cov = min_and_max_based_on_outliers(genes_sorted)

    for coverage_type in ['low', 'high']:
        info('Selecting and saving ' + coverage_type + ' covered genes')
        selected_genes = []

        if coverage_type == 'low':
            selected_genes = [
                g for g in genes_sorted if g.gene_name in key_genes and (any(
                    e.rates_within_threshs[depth_cutoff] <
                    MIN_DEPTH_PERCENT_AT_THRESH for e in g.get_exons()) or any(
                        a.rates_within_threshs[depth_cutoff] <
                        MIN_DEPTH_PERCENT_AT_THRESH
                        for a in g.get_amplicons()))
            ]
        else:
            if max_cov:
                selected_genes = [
                    g for g in genes_sorted
                    if g.gene_name in key_genes and (any(
                        e.avg_depth > max_cov for e in g.get_exons()) or any(
                            a.avg_depth > max_cov for a in g.get_amplicons()))
                ]
        for region_type in ['exons', 'target']:
            selected_regions = []
            for gene in selected_genes:
                if coverage_type == 'low':
                    cur_regions = [
                        a for a in (gene.get_amplicons() if region_type ==
                                    'target' else gene.get_exons())
                        if a.rates_within_threshs[depth_cutoff] <
                        MIN_DEPTH_PERCENT_AT_THRESH
                        and 'Multi' not in a.feature
                    ]
                else:
                    cur_regions = [
                        a for a in (gene.get_amplicons() if region_type ==
                                    'target' else gene.get_exons())
                        if a.avg_depth > max_cov and 'Multi' not in a.feature
                    ]
                selected_regions.extend(cur_regions)

            if selected_regions:
                selected_regions_bed_fpath = join(
                    sample.flagged_regions_dirpath,
                    coverage_type + '_cov_' + region_type + '.bed')
                save_regions_to_bed(cnf, selected_regions,
                                    selected_regions_bed_fpath)

                # Report cov for Hotspots
                for db in vcf_dbs:
                    res = _report_normalize_coverage_for_variant_sites(
                        cnf, sample, ave_depth, db, selected_regions_bed_fpath,
                        selected_regions, depth_cutoff, region_type,
                        coverage_type)
                    if not res:
                        return None

            report = make_flat_region_report(sample, selected_regions,
                                             depth_threshs)
            flagged_txt_fpath = add_suffix(
                add_suffix(sample.flagged_txt, region_type), coverage_type)
            flagged_tsv_fpath = add_suffix(
                add_suffix(sample.flagged_tsv, region_type), coverage_type)
            report.save_txt(flagged_txt_fpath)
            report.save_tsv(flagged_tsv_fpath)

            info('')
            info(coverage_type + ' covered ' + region_type + '(total ' +
                 str(len(selected_regions)) + ') for sample ' + sample.name +
                 ' saved into:')
            info('  ' + flagged_txt_fpath + ', ' + flagged_tsv_fpath)

    return report
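For reference, the low-coverage selection above reduces to a per-gene test: a gene is flagged when any of its exons or amplicons is covered at the cutoff depth on less than MIN_DEPTH_PERCENT_AT_THRESH of its bases. A minimal standalone sketch of that test, assuming regions expose rates_within_threshs keyed by depth; the function name and the constant value are illustrative, not part of the code above:

MIN_DEPTH_PERCENT_AT_THRESH = 0.8  # assumed value, for illustration only

def gene_is_low_covered(gene, depth_cutoff):
    # A gene is flagged "low" if any exon or amplicon has less than the required
    # fraction of bases covered at depth_cutoff (mirrors the selection above).
    regions = list(gene.get_exons()) + list(gene.get_amplicons())
    return any(r.rates_within_threshs[depth_cutoff] < MIN_DEPTH_PERCENT_AT_THRESH
               for r in regions)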
Example #21
0
def __intermediate_fname(work_dir, fname, suf):
    output_fname = add_suffix(fname, suf)
    return join(work_dir, basename(output_fname))
def intersect_regions(cnf, bcbio_structures, all_regions, min_samples):
    all_regions_fname = 'all_regions.bed'
    all_regions_bed_fpath = join(
        cnf.output_dir,
        add_suffix(all_regions_fname, str(cnf.min_depth))
        if cnf.min_depth else all_regions_fname)

    with open(all_regions_bed_fpath, 'w') as out:
        if not cnf.min_depth:
            out.write(
                '## Coverage threshold Nx is 10x for cell line and 100x for plasma\n'
            )
        else:
            out.write('## Coverage threshold Nx is ' + str(cnf.min_depth) +
                      'x\n')
        out.write('\t'.join([
            '#Chr', 'Start', 'End', 'Size', 'Gene', 'Depth<Nx',
            'SamplesSharingSameFeature'
        ]) + '\n')
        for region in all_regions:
            out.write('\t'.join([str(val) for val in region]) + '\n')

    regions_overlaps = defaultdict(lambda: defaultdict(list))
    regions = []
    if cnf.tricky_regions:
        intersection_fpath = _intersect_with_tricky_regions(
            cnf, all_regions_bed_fpath, 'samples')
    else:
        bed_fpath = cnf.bed
        intersection_fpath = join(
            cnf.work_dir,
            splitext(basename(all_regions_bed_fpath))[0] + '_bed.intersect')
        bedtools = get_system_path(cnf, 'bedtools')
        if not cnf.reuse_intermediate or not verify_file(
                intersection_fpath, silent=True, is_critical=False):
            cmdline = '{bedtools} intersect -header -a {all_regions_bed_fpath} -b {bed_fpath} -wo'.format(
                **locals())
            res = call(cnf,
                       cmdline,
                       output_fpath=intersection_fpath,
                       max_number_of_tries=1,
                       exit_on_error=False)
            if not res:
                return None

    with open(intersection_fpath) as f:
        for l in f:
            l = l.strip()
            if not l or l.startswith('#'):
                continue
            fs = l.split('\t')
            chrom, start, end, size, symbol, pct_depth, num_samples = fs[:7]
            overlap_bps = int(fs[-1])
            r = (chrom, start, end, size, symbol, pct_depth, num_samples)
            if cnf.tricky_regions:
                filename = tricky_regions_fnames_d[basename(
                    fs[7]).split('.')[0]]
                regions_overlaps[r][filename].append(overlap_bps)
            else:
                regions_overlaps[r][basename(cnf.bed)].append(overlap_bps)
    for r in all_regions:
        if r in regions_overlaps:
            overlaps = ''
            chrom, start, end, size, symbol, pct_depth, num_samples = r
            overlaps_txt = ', '.join(
                fname + ': %.0f' %
                (sum(regions_overlaps[r][fname]) / float(size) * 100) + '%'
                for fname in regions_overlaps[r])
            r = list(r)
            r.append(overlaps_txt)
        else:
            r = list(r)
            r.append('')
        regions.append(r)
    os.remove(intersection_fpath)
    return regions
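The per-region annotation above boils down to summing the bedtools -wo overlap column per track and expressing it as a fraction of the region size. A small standalone sketch of that step; overlaps_by_fname and the function name are illustrative:

def format_overlaps(overlaps_by_fname, region_size):
    # overlaps_by_fname: track filename -> list of overlap lengths in bp for one region.
    # Returns the same "file: NN%" strings written by intersect_regions above.
    return ', '.join(
        '%s: %.0f%%' % (fname, sum(bps) / float(region_size) * 100)
        for fname, bps in overlaps_by_fname.items())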
Example #23
0
def combine_projects(cnf, bcbio_structures, tags=None):
    tag_by_sample = dict()
    if tags:
        for bs, tag in zip(bcbio_structures, tags):
            for s in bs.samples:
                tag_by_sample[s.name] = tag or bs.project_name
    # else:
    #     for bs in bcbio_structures:
    #         for s in bs.samples:
    #             tag_by_sample[s.name] = bs.project_name

    final_dirpath = adjust_path(join(cnf.output_dir, 'final'))
    safe_mkdir(final_dirpath)

    merged_bcbio_cnf = merge_bcbio_yamls(cnf, bcbio_structures)

    samples = [s for bs in bcbio_structures for s in bs.samples]
    dirs_to_reprocess = [
        source.clinreport_dir, BCBioStructure.var_dir, source.varannotate_name,
        source.varfilter_name
    ]
    for s in samples:
        sample_dir = join(final_dirpath, s.name)
        sample_var_dirpath = join(sample_dir, BCBioStructure.var_dir)
        safe_mkdir(sample_var_dirpath)
        for file_or_dir in os.listdir(s.dirpath):
            if file_or_dir not in dirs_to_reprocess:
                safe_symlink_to(join(s.dirpath, file_or_dir), sample_dir)
        for file in os.listdir(s.var_dirpath):
            safe_symlink_to(join(s.var_dirpath, file), sample_var_dirpath)

    merged_date_dir = join(
        final_dirpath,
        merged_bcbio_cnf['fc_date'] + '_' + merged_bcbio_cnf['fc_name'])
    merged_bs_var_dirpath = join(merged_date_dir, BCBioStructure.var_dir)
    merged_bs_raw_var_dirpath = join(merged_bs_var_dirpath, 'raw')
    safe_mkdir(merged_bs_raw_var_dirpath)
    for bs in bcbio_structures:
        for file in os.listdir(bs.raw_var_dirpath):
            safe_symlink_to(join(bs.raw_var_dirpath, file),
                            merged_bs_raw_var_dirpath)

    variants_fpaths = []
    vardict_txt_fname = variant_filtering.mut_fname_template.format(
        caller_name='vardict')
    variants_fpath = join(merged_bs_var_dirpath, vardict_txt_fname)
    pass_variants_fpath = add_suffix(variants_fpath,
                                     variant_filtering.mut_pass_suffix)
    reject_variants_fpath = add_suffix(variants_fpath,
                                       variant_filtering.mut_reject_suffix)

    cnf.steps = ['Variants']

    for bs_i, bs in enumerate(
            bcbio_structures
    ):  # re-filtering, perform cohort-based filtering only within sub-projects
        correct_bs = BCBioStructure(cnf, cnf.output_dir, bs.bcbio_cnf,
                                    final_dirpath)
        bcbio_runner = BCBioRunner(cnf, correct_bs, bs.bcbio_cnf)
        bcbio_runner.post_jobs()
        bs_raw_variants_fpath = add_suffix(variants_fpath, str(bs_i))
        pass_bs_variants_fpath = add_suffix(bs_raw_variants_fpath,
                                            variant_filtering.mut_pass_suffix)
        reject_bs_variants_fpath = add_suffix(
            bs_raw_variants_fpath, variant_filtering.mut_reject_suffix)
        shutil.move(variants_fpath, bs_raw_variants_fpath)
        shutil.move(pass_variants_fpath, pass_bs_variants_fpath)
        shutil.move(reject_variants_fpath, reject_bs_variants_fpath)
        variants_fpaths.append(bs_raw_variants_fpath)

    merged_bs = BCBioStructure(cnf, cnf.output_dir, merged_bcbio_cnf,
                               final_dirpath)
    merged_samples = [s for s in merged_bs.samples]

    cnf.variant_filtering.max_ratio = 1
    combine_results(cnf,
                    merged_samples,
                    variants_fpaths,
                    variants_fpath,
                    pass_variants_fpath=pass_variants_fpath)
    for variants_fpath in variants_fpaths:
        safe_remove(variants_fpath)
        pass_fpath = add_suffix(variants_fpath,
                                variant_filtering.mut_pass_suffix)
        safe_remove(pass_fpath)
        reject_fpath = add_suffix(variants_fpath,
                                  variant_filtering.mut_reject_suffix)
        safe_remove(reject_fpath)

    cnf.reuse_intermediate = True
    cnf.steps = ['Seq2C', 'Summary']
    BCBioRunner(cnf, merged_bs, merged_bs.bcbio_cnf).post_jobs()
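The per-subproject shuffling above relies on add_suffix inserting a suffix before the file extension, so the raw, pass, and reject variant files of a given subproject keep matching names. A minimal sketch of that naming convention, assuming the simple single-extension case (the real helper also understands double extensions such as .vcf.gz):

import os

def add_suffix_sketch(fpath, suf):
    # Illustrative re-implementation: 'vardict.mut.txt' + '0' -> 'vardict.mut.0.txt'
    base, ext = os.path.splitext(fpath)
    return base + '.' + str(suf) + ext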
Example #24
0
def main():
    if len(sys.argv) < 4:
        info(
            'The script writes all CDS, stop codon, and ncRNA exon regions for all known Ensembl genes, with associated gene symbols.'
        )
        # info('When the gene name is found in HGNC, it gets replaced with an approved name.       ')
        # info('If the gene is not characterized (like LOC729737), this symbol is just kept as is.  ')
        info(
            '                                                                                      '
        )
        info(
            'Usage:                                                                                '
        )
        info('    ' + __file__ +
             ' hg19 db.gtf output.bed [HGNC_gene_synonyms.txt=' + us_syn_path +
             '] [additional_feature_list]')
        info(
            '                                                                                      '
        )
        info(
            '     where HGNC_gene_synonyms.txt (from http://www.genenames.org/cgi-bin/download) is:'
        )
        info(
            '     #Approved Symbol  Previous Symbols                    Synonyms                          Chromosome   Ensembl Gene ID   UCSC ID(supplied by UCSC)'
        )
        info(
            '     OR7E26P           OR7E67P, OR7E69P, OR7E70P, OR7E68P  OR1-51, OR1-72, OR1-73, OR912-95  19q13.43	    ENSG00000121410   uc002qsg.3'
        )
        info(
            '     ...                                                                              '
        )
        info(
            '                                                                                      '
        )
        info(
            '     or DB is Ensembl GTF ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz'
        )
        info(
            '     1  pseudogene            gene        11869  14412  .  +  .  gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";'
        )
        info(
            '     1  processed_transcript  transcript  11869  14409  .  +  .  gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";'
        )
        info(
            '     ...                                                                              '
        )
        info(
            '                                                                                      '
        )
        info(
            '     or DB is RefSeq GTF ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/H_sapiens/GFF/ref_GRCh38.p2_top_level.gff3.gz'
        )
        info(
            '     NC_000001.10    RefSeq          region       1       249250621       .       +       .       ID=id0;Name=1;Dbxref=taxon:9606;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA'
        )
        info(
            '     NC_000001.10    BestRefSeq      gene         11874   14409           .       +       .       ID=gene0;Name=DDX11L1;Dbxref=GeneID:100287102,HGNC:37102;description=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;gbkey=Gene;gene=DDX11L1;part=1%2F1;pseudo=true'
        )
        info(
            '     NC_000001.10    BestRefSeq      transcript   11874   14409           .       +       .       ID=rna0;Name=NR_046018.2;Parent=gene0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2'
        )
        info(
            '     NC_000001.10    BestRefSeq      exon         11874   12227           .       +       .       ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2'
        )
        info(
            '     ...                                                                              '
        )
        info(
            '                                                                                      '
        )
        info(
            '     or either RefSeq_knownGene.txt or UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables) is:'
        )
        info(
            '     #hg19.knownGene.name  hg19.knownGene.chrom  hg19.knownGene.strand  hg19.knownGene.txStart  hg19.knownGene.txEnd  hg19.knownGene.exonCount  hg19.knownGene.exonStarts  hg19.knownGene.exonEnds  hg19.kgXref.geneSymbol'
        )
        info(
            '     uc001aaa.3	         chr1	               +	                  11873                   14409                 3                         11873,12612,13220,	      12227,12721,14409,	   DDX11L1'
        )
        info(
            '     ...                                                                              '
        )
        info(
            '                                                                                      '
        )
        info(
            '     Writes to Exons.bed                                                              '
        )
        info(
            '                                                                                      '
        )
        info(
            'See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols'
        )
        sys.exit(1)

    genome_name = sys.argv[1]
    seq_fpath = hg19_seq_fpath if genome_name == 'hg19' else hg38_seq_fpath
    canonical_transcripts_fpath = canonical_hg19_transcripts_fpath if genome_name == 'hg19' else canonical_hg38_transcripts_fpath
    chr_lengths = get_chr_lengths_from_seq(seq_fpath)
    chr_order = {c: i for i, (c, l) in enumerate(chr_lengths)}

    input_fpath = verify_file(sys.argv[2])
    output_fpath = adjust_path(sys.argv[3])

    synonyms_fpath = None
    if len(sys.argv) > 4:
        synonyms_fpath = verify_file(sys.argv[4])
        info('Synonyms file provided ' + synonyms_fpath + '')
    else:
        info('No synonyms file provided, skipping approving')

    not_approved_fpath = None
    if len(sys.argv) > 5:
        not_approved_fpath = adjust_path(sys.argv[5])

    with open(verify_file(canonical_transcripts_fpath)) as f:
        canonical_transcripts_ids = set(l.strip().split('.')[0] for l in f)

    info('Reading the features...')
    with open_gzipsafe(input_fpath) as inp:
        l = inp.readline()
        if input_fpath.endswith('.gtf') or input_fpath.endswith('.gtf.gz'):
            gene_by_name_and_chrom = _proc_ensembl_gtf(inp, output_fpath,
                                                       chr_order)
        elif input_fpath.endswith('.gff3') or input_fpath.endswith('.gff3.gz'):
            gene_by_name_and_chrom = _proc_refseq_gff3(inp, output_fpath,
                                                       chr_order)
        else:
            gene_by_name_and_chrom = _proc_ucsc(inp, output_fpath, chr_order)

    if synonyms_fpath and synonyms_fpath != "''":
        gene_by_name_and_chrom, not_approved_gene_names = _approve(
            gene_by_name_and_chrom, synonyms_fpath)

        info('')
        info('Not approved by HGNC - ' + str(len(not_approved_gene_names)) +
             ' genes.')
        if not_approved_fpath:
            with open(not_approved_fpath, 'w') as f:
                f.write('#Searched as\tStatus\n')
                f.writelines((l + '\n' for l in not_approved_gene_names))
            info('Saved not approved to ' + not_approved_fpath)

        # with open('serialized_genes.txt', 'w') as f:
        #     for g in gene_by_name.values():
        #         f.write(str(g) + '\t' + str(g.db_id) + '\n')
        #         for e in g.exons:
        #             f.write('\t' + str(e) + '\n')

    info('Found:')
    info('  ' + str(len(gene_by_name_and_chrom)) + ' genes')

    genes = gene_by_name_and_chrom.values()

    coding_and_mirna_genes = [
        g for g in genes
        if any(t.biotype in ['protein_coding', 'miRNA'] for t in g.transcripts)
    ]

    coding_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'protein_coding' for t in g.transcripts)
    ]
    coding_transcripts = [
        t for g in coding_and_mirna_genes for t in g.transcripts
        if t.biotype == 'protein_coding'
    ]
    mirna_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'miRNA' for t in g.transcripts)
    ]
    mirna_transcripts = [
        t for g in coding_and_mirna_genes for t in g.transcripts
        if t.biotype == 'miRNA'
    ]
    codingmiRNA_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'miRNA'
               for t in g.transcripts) and any(t.biotype == 'protein_coding'
                                               for t in g.transcripts)
    ]
    info('  ' + str(len(coding_genes)) + ' coding genes')
    info('  ' + str(len(coding_transcripts)) + ' coding transcripts')
    info('  ' + str(len(mirna_genes)) + ' miRNA genes')
    info('  ' + str(len(mirna_transcripts)) + ' miRNA transcripts')
    info('  ' + str(len(codingmiRNA_genes)) +
         ' genes with both coding and miRNA transcripts')

    info()
    # info('Choosing genes with exons...')
    # genes = [g for g in genes if any(tx.exons for tx in g.transcripts)]
    # genes = [g for g in genes if any(tx.exons for tx in g.transcripts)]

    info('Choosing canonical...')
    canon_genes = choose_canonical(genes, canonical_transcripts_ids)

    info()
    info('Sorting and printing all regions...')
    print_genes(genes, output_fpath, canon_only=False)

    info()
    info('Sorting and printing canonical regions...')
    canon_output_fpath = add_suffix(output_fpath, 'canon')
    print_genes(canon_genes, canon_output_fpath, canon_only=True)

    info()
    info('Saved all regions to\n   ' + output_fpath + '\n   ' +
         canon_output_fpath)
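A note on ordering: chromosome order is precomputed once as a name-to-index map built from the reference sequence lengths, so regions can later be sorted with a plain tuple key. A sketch of such a key (illustrative names; the actual sorting happens inside print_genes):

def region_sort_key(region, chr_order):
    # Sort by reference chromosome order first, then by coordinates;
    # unknown chromosomes fall to the end.
    return (chr_order.get(region.chrom, len(chr_order)), region.start, region.end)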
Example #25
0
def _generate_summary_flagged_regions_report(cnf, bcbio_structure, samples,
                                             mutations, key_or_target_genes):
    region_types = ['exons', 'target']
    coverage_types = ['low', 'high']
    flagged_regions_metrics = [
        Metric('Gene', min_width=50, max_width=70),
        Metric('Chr', with_heatmap=False, max_width=20, align='right'),
        Metric('Position', td_class='td_position', min_width=70,
               max_width=120),
        Metric('Ave depth',
               td_class='long_expanded_line right_aligned',
               max_width=100,
               with_heatmap=False),
        Metric('#HS', quality='Less is better', align='right', max_width=30),
        Metric('Hotspots & Deleterious',
               td_class='long_expanded_line',
               min_width=100,
               max_width=150),
        Metric('Found mutations',
               td_class='long_expanded_line',
               min_width=150,
               max_width=200),
        Metric('Samples',
               td_class='long_expanded_line',
               min_width=100,
               max_width=120),
        Metric('Possible reasons',
               td_class='long_expanded_line',
               max_width=120)
    ]
    flagged_regions_metric_storage = MetricStorage(
        sections=[ReportSection(metrics=flagged_regions_metrics)])
    flagged_regions_report_dirpath = bcbio_structure.flagged_regions_dirpath
    safe_mkdir(flagged_regions_report_dirpath)
    if key_or_target_genes == 'target':
        genes_description = 'genes'
    else:
        genes_description = 'genes that have been previously implicated in various cancers'
    for region_type in region_types:
        regions_dict = {}
        total_regions = 0
        info()
        info('Preparing report for ' + region_type)
        for coverage_type in coverage_types:
            regions_by_gene = {}
            for sample in samples:
                selected_regions_bed_fpath = join(
                    sample.flagged_regions_dirpath,
                    coverage_type + '_cov_' + region_type + '.bed')
                regions_by_reasons = {}
                if verify_file(selected_regions_bed_fpath, is_critical=False):
                    intersection_fpath = _intersect_with_tricky_regions(
                        cnf, selected_regions_bed_fpath, sample.name)
                    regions_by_reasons = _parse_intersection_with_tricky_regions(
                        cnf, intersection_fpath)
                total_report_fpath = add_suffix(
                    add_suffix(sample.flagged_tsv, region_type), coverage_type)
                if verify_file(total_report_fpath, is_critical=False):
                    with open(total_report_fpath) as f:
                        for l in f:
                            l = l.strip()
                            if not l or l.startswith('#'):
                                continue
                            fs = l.split('\t')
                            (chrom, start, end, size, gene, strand, feature,
                             biotype, min_depth, avg_depth) = fs[:10]
                            start, end = int(start), int(end)
                            regions_by_gene.setdefault(gene, [])
                            cur_region = Region(sample_name=[sample.name],
                                                avg_depth=[avg_depth],
                                                gene_name=gene,
                                                strand=strand,
                                                feature=feature,
                                                biotype=biotype,
                                                chrom=chrom,
                                                start=start,
                                                end=end)
                            for r in regions_by_reasons:
                                if r[0] <= start and end <= r[1]:
                                    cur_region.extra_fields = regions_by_reasons[
                                        r]
                            cur_region.missed_by_db = []
                            was_added = False
                            for r in regions_by_gene[gene]:
                                if r.start <= cur_region.start <= r.end and r.start <= cur_region.end <= r.end:
                                    was_added = True
                                    if sample.name not in r.sample_name:
                                        r.sample_name.append(sample.name)
                                        r.avg_depth.append(avg_depth)
                            if not was_added:
                                regions_by_gene[gene].append(cur_region)
                report_fpath = join(
                    sample.flagged_regions_dirpath,
                    coverage_type + '_cov_' + region_type + '.oncomine.tsv')
                if verify_file(report_fpath, is_critical=False):
                    with open(report_fpath) as f:
                        for l in f:
                            l = l.strip()
                            if not l or l.startswith('#'):
                                continue
                            fs = l.split('\t')
                            hotspots = []
                            (gene, chrom, start, end, strand, feature, biotype,
                             id_, num_hotspots) = fs[:9]
                            start, end = int(start), int(end)
                            if int(num_hotspots) != 0:
                                hotspots = fs[9].split()

                            regions_by_gene.setdefault(gene, [])
                            cur_region = Region(sample_name=[sample.name],
                                                gene_name=gene,
                                                strand=strand,
                                                feature=feature,
                                                biotype=biotype,
                                                chrom=chrom,
                                                start=start,
                                                end=end)
                            for r in regions_by_gene[gene]:
                                if r.start <= cur_region.start <= r.end and r.start <= cur_region.end <= r.end:
                                    if sample.name not in r.sample_name:
                                        r.sample_name.append(sample.name)
                                        r.avg_depth.append('.')
                                    new_hotspots = [
                                        hs for hs in hotspots
                                        if hs not in r.missed_by_db
                                    ]
                                    r.missed_by_db.extend(new_hotspots)
            flagged_regions_report = PerRegionSampleReport(
                name='Flagged regions',
                metric_storage=flagged_regions_metric_storage)
            num_regions = 0
            non_hs_class = ' no_hotspots'
            slash_with_zero_space = '/&#x200b;'
            for gene in regions_by_gene.keys():
                if regions_by_gene[gene]:
                    num_regions += len(regions_by_gene[gene])
                    row_class = ' expandable_row collapsed'
                    if len(regions_by_gene[gene]) > 1:
                        reg = flagged_regions_report.add_row()
                        reg.class_ = ' expandable_gene_row collapsed'
                        chr = regions_by_gene[gene][0].chrom
                        num_hotspots = [
                            len(r.missed_by_db) for r in regions_by_gene[gene]
                        ]
                        all_samples = [
                            sample for r in regions_by_gene[gene]
                            for sample in r.sample_name
                        ]
                        all_unique_samples = []
                        for sample in all_samples:
                            if sample not in all_unique_samples:
                                all_unique_samples.append(sample)
                        all_tricky_regions = sorted(
                            set([
                                tricky_region for r in regions_by_gene[gene]
                                for tricky_region in r.extra_fields
                            ]))
                        all_depths = [[]
                                      for x in range(len(all_unique_samples))]
                        for r in regions_by_gene[gene]:
                            for sample_num, sample in enumerate(
                                    all_unique_samples):
                                if sample in r.sample_name:
                                    cur_sample_index = r.sample_name.index(
                                        sample)
                                    if r.avg_depth[cur_sample_index] != '.':
                                        all_depths[sample_num].append(
                                            float(
                                                r.avg_depth[cur_sample_index]))
                        avg_depth_per_samples = [
                            sum(all_depths[i]) /
                            len(all_depths[i]) if len(all_depths[i]) > 0 else 0
                            for i in range(len(all_depths))
                        ]
                        reg.add_record('Gene', gene)
                        reg.add_record('Chr', chr.replace('chr', ''))
                        reg.add_record('#HS', sum(num_hotspots))
                        reg.add_record(
                            'Position',
                            str(len(regions_by_gene[gene])) + ' regions')
                        reg.add_record(
                            'Ave depth',
                            slash_with_zero_space.join([
                                format(depth, '.2f') if depth != '.' else '.'
                                for depth in avg_depth_per_samples
                            ]),
                            num=sum(avg_depth_per_samples) /
                            len(avg_depth_per_samples))
                        reg.add_record('Hotspots & Deleterious', '')
                        reg.add_record('Possible reasons',
                                       ', '.join(all_tricky_regions))
                        reg.add_record('Samples',
                                       ',\n'.join(all_unique_samples))
                        reg.add_record('Found mutations', '')
                        if sum(num_hotspots) == 0:
                            reg.class_ += non_hs_class
                        row_class += ' row_to_hide row_hidden'
                    else:
                        row_class += ' not_to_hide'
                    for r in regions_by_gene[gene]:
                        reg = flagged_regions_report.add_row()
                        reg.class_ = row_class
                        reg.add_record('Gene', r.gene_name)
                        reg.add_record('Chr', r.chrom.replace('chr', ''))
                        avg_depths = [
                            float(depth) for depth in r.avg_depth
                            if depth != '.'
                        ]
                        reg.add_record(
                            'Ave depth',
                            slash_with_zero_space.join([
                                format(depth, '.2f') if depth != '.' else depth
                                for depth in avg_depths
                            ]),
                            num=sum(avg_depths) / len(avg_depths))
                        reg.add_record(
                            'Position',
                            Metric.format_value(
                                r.start, human_readable=True, is_html=True) +
                            '-' + Metric.format_value(
                                r.end, human_readable=True, is_html=True))
                        reg.add_record('#HS', len(r.missed_by_db))
                        if len(r.missed_by_db) == 0:
                            reg.class_ += non_hs_class
                        uniq_hs_positions = sorted(
                            set([
                                hotspot.split(':')[0]
                                for hotspot in r.missed_by_db
                            ]))
                        hs_by_pos = {
                            pos: [
                                h.split(':')[1] for h in r.missed_by_db
                                if h.split(':')[0] == pos
                            ]
                            for pos in uniq_hs_positions
                        }
                        hs_breakable = [
                            gray(
                                Metric.format_value(int(pos.replace(',', '')),
                                                    human_readable=True,
                                                    is_html=True)) + ': ' +
                            ','.join([
                                h.replace('/', slash_with_zero_space)
                                for h in hs_by_pos[pos]
                            ]) for pos in uniq_hs_positions
                        ]
                        reg.add_record('Hotspots & Deleterious',
                                       '\n'.join(hs_breakable))
                        reg.add_record('Possible reasons',
                                       ', '.join(r.extra_fields))
                        reg.add_record('Samples', ',\n'.join(r.sample_name))
                        found_mutations = []
                        for sample in samples:
                            if sample.name in r.sample_name:
                                for mut in mutations[sample.name]:
                                    if mut.gene.name == r.gene_name and r.start <= mut.pos <= r.end:
                                        found_mutations.append(
                                            gray(
                                                Metric.format_value(
                                                    mut.pos,
                                                    human_readable=True,
                                                    is_html=True)) + ':' +
                                            mut.ref + '>' + mut.alt + ' (' +
                                            sample.name + ')')
                        reg.add_record('Found mutations',
                                       '\n'.join(found_mutations))
            flagged_regions_report.expandable = True
            flagged_regions_report.unique = True
            regions_dict[coverage_type] = create_section(
                flagged_regions_report, num_regions, regions_by_gene.keys(),
                region_type)
            total_regions += num_regions
        flagged_report_fpath = join(flagged_regions_report_dirpath,
                                    'flagged_' + region_type + '.html')
        write_static_html_report(cnf, {
            'key_or_target': key_or_target_genes,
            'region_type': region_type,
            'genes_description': genes_description,
            'flagged_low': regions_dict['low'],
            'flagged_high': regions_dict['high'],
        },
                                 flagged_report_fpath,
                                 tmpl_fpath=join(
                                     dirname(abspath(__file__)),
                                     'template_flagged_regions.html'),
                                 extra_js_fpaths=[
                                     join(dirname(abspath(__file__)), 'static',
                                          'flagged_regions.js')
                                 ],
                                 extra_css_fpaths=[
                                     join(dirname(abspath(__file__)), 'static',
                                          'flagged_regions.css')
                                 ])
        #BaseReport.save_html(flagged_regions_report, cnf, flagged_report_fpath, caption='Flagged regions')
        info('')
        info('Flagged regions (total ' + str(total_regions) + ' ' +
             region_type + ') saved into:')
        info('  ' + flagged_report_fpath)
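The flagged-region rows above are merged across samples with a simple containment test: a newly parsed region joins an existing row of the same gene when it lies entirely inside it, otherwise it starts a new row. A standalone sketch of that test (names are illustrative):

def is_contained(existing, candidate):
    # True when the candidate region lies completely within the existing region,
    # which is the condition checked above before appending another sample name.
    return (existing.start <= candidate.start <= existing.end and
            existing.start <= candidate.end <= existing.end)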
def create_oncoprints_link(cnf, bcbio_structure, project_name=None):
    if is_us(): loc = exposing.us
    # elif is_uk(): loc = exposing.uk
    else:
        loc = exposing.local
        return None

    if not bcbio_structure.variant_callers:
        info('No variant calling performed, not generating Oncoprints')
        return None
    clinical_report_caller = \
        bcbio_structure.variant_callers.get('vardict') or \
        bcbio_structure.variant_callers.get('vardict-java')
    if not clinical_report_caller:
        err('Warning: vardict is not in the variant callers list, not generating Oncoprints')
        return None

    step_greetings('Creating Oncoprints link')
    zhongwu_data_query_dirpath = '/home/kdld047/public_html/cgi-bin/TS'
    if not isdir(zhongwu_data_query_dirpath):
        warn('Data Query directory ' + zhongwu_data_query_dirpath + ' does not exist.')
        return None

    vardict_txt_fname = variant_filtering.mut_fname_template.format(caller_name=clinical_report_caller.name)
    vardict_txt_fpath = join(bcbio_structure.var_dirpath, vardict_txt_fname)
    cnf.mutations_fpath = add_suffix(vardict_txt_fpath, variant_filtering.mut_pass_suffix)

    cnf.seq2c_tsv_fpath = bcbio_structure.seq2c_fpath

    samples = sorted(bcbio_structure.samples)
    cnf.project_name = project_name or bcbio_structure.project_name or basename(cnf.output_dir)
    study_name = re.sub('[\.\-:&]', '_', cnf.project_name)

    check_genome_resources(cnf)

    data_query_dirpath = join(loc.dirpath, 'DataQueryTool')

    data_fpath = join(zhongwu_data_query_dirpath, study_name + '.data.txt')
    info_fpath = join(zhongwu_data_query_dirpath, study_name + '.info.txt')
    altered_genes = print_data_txt(cnf, cnf.mutations_fpath, cnf.seq2c_tsv_fpath, samples, data_fpath)
    if not altered_genes:
        err('No altered genes in ' + cnf.mutations_fpath + ' or ' + cnf.seq2c_tsv_fpath + ', not generating Oncoprints.')
        return None

    print_info_txt(cnf, samples, info_fpath)

    data_ext_fpath = data_fpath.replace('/home/', '/users/')
    info_ext_fpath = info_fpath.replace('/home/', '/users/')

    # optional:
    data_symlink = join(data_query_dirpath, study_name + '.data.txt')
    info_symlink = join(data_query_dirpath, study_name + '.info.txt')
    (symlink_to_ngs if is_us() else local_symlink)(data_ext_fpath, data_symlink)
    (symlink_to_ngs if is_us() else local_symlink)(info_ext_fpath, info_symlink)

    properties_fpath = join(zhongwu_data_query_dirpath, 'DataQuery.properties')
    add_data_query_properties(cnf, study_name, properties_fpath, data_ext_fpath, info_ext_fpath)

    genes = '%0D%0A'.join(altered_genes)
    data_query_url = join(loc.website_url_base, 'DataQueryTool', 'DataQuery.pl?'
        'analysis=oncoprint&'
        'study={study_name}&'
        'gene={genes}&'
        'order=on&'
        'freq=50&'
        'nocheckgenes=true&'
        'submit=Submit'
        .format(**locals()))

    info()
    info('Information about study was added in Data Query Tool, URL is ' + data_query_url)
    return data_query_url
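The Data Query URL above is assembled by plain string formatting, with gene symbols joined by the URL-encoded CRLF sequence %0D%0A. An equivalent sketch using urlencode instead of manual formatting (hypothetical; the example above builds the query by hand and joins path parts with join):

try:
    from urllib import urlencode        # Python 2
except ImportError:
    from urllib.parse import urlencode  # Python 3

def build_oncoprint_url(base_url, study_name, altered_genes):
    # '\r\n' is percent-encoded to %0D%0A, matching the gene separator used above.
    params = [('analysis', 'oncoprint'), ('study', study_name),
              ('gene', '\r\n'.join(altered_genes)), ('order', 'on'),
              ('freq', '50'), ('nocheckgenes', 'true'), ('submit', 'Submit')]
    return base_url + '/DataQueryTool/DataQuery.pl?' + urlencode(params)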
Example #27
0
def run_annotators(cnf, vcf_fpath, bam_fpath):
    original_vcf = cnf.vcf

    db_section_by_name = OrderedDict(
        (dbname, cnf.annotation[dbname])
        for dbname in ['dbsnp', 'clinvar', 'cosmic', 'oncomine']
        if dbname in cnf.annotation
        and not cnf.annotation[dbname].get('skip-annotation'))

    # if not cnf.no_check:
    #     to_delete_id_ref = []
    #     if 'dbsnp' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as rs*')
    #         to_delete_id_ref.append('rs')
    #     if 'cosmic' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as COS*')
    #         to_delete_id_ref.append('COS')
    #
    #     def delete_ids(rec):  # deleting existing dbsnp and cosmic ID annotations
    #         if rec.ID:
    #             if isinstance(rec.ID, basestring):
    #                 if any(rec.ID.startswith(pref) for pref in to_delete_id_ref):
    #                     rec.ID = None
    #             else:
    #                 rec.ID = [id_ for id_ in rec.ID if not any(id_.startswith(pref) for pref in to_delete_id_ref)]
    #
    #         if not rec.FILTER:
    #             rec.FILTER = 'PASS'
    #
    #         return rec
    #
    #     info('Removing previous rs* and COS* IDs')
    #     vcf_fpath = iterate_vcf(cnf, vcf_fpath, delete_ids, suffix='delID')

    bcftools = get_system_path(cnf, 'bcftools')

    if not vcf_fpath.endswith('.gz') or not file_exists(vcf_fpath + '.tbi'):
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    cmdl = '{bcftools} annotate --remove ID {vcf_fpath}'
    res = call(cnf,
               cmdl.format(**locals()),
               output_fpath=add_suffix(rm_gz_ext(vcf_fpath), 'rmid'))
    if res:
        vcf_fpath = res
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    for dbname, dbconf in db_section_by_name.items() + cnf.annotation.get(
            'custom_vcfs', dict()).items():
        step_greetings('Annotating using ' + dbname)
        annotations = ','.join('INFO/' + a for a in dbconf.get('annotations'))
        if dbname in ('cosmic', 'dbsnp'):
            annotations += ',=ID'
        db_fpath = get_db_path(cnf, dbconf, dbname)
        if db_fpath:
            cmdl = '{bcftools} annotate -a ' + db_fpath + ' -c ' + annotations + ' {vcf_fpath}'
            res = call(cnf,
                       cmdl.format(**locals()),
                       output_fpath=add_suffix(rm_gz_ext(vcf_fpath), dbname))
            if res:
                vcf_fpath = res
                vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    verify_vcf(vcf_fpath, is_critical=True)

    if 'dbnsfp' in cnf.annotation:
        res = _snpsift_db_nsfp(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    if 'snpeff' in cnf.annotation:
        res, summary_fpath, genes_fpath = _snpeff(cnf, vcf_fpath)
        if res:
            vcf_fpath = res
            verify_vcf(vcf_fpath, is_critical=True)
            final_summary_fpath = join(cnf.output_dir, basename(summary_fpath))
            final_genes_fpath = join(cnf.output_dir, basename(genes_fpath))
            if isfile(final_summary_fpath): os.remove(final_summary_fpath)
            if isfile(final_genes_fpath): os.remove(final_genes_fpath)
            if file_exists(summary_fpath):
                shutil.move(summary_fpath, final_summary_fpath)
            if file_exists(genes_fpath):
                shutil.move(genes_fpath, final_genes_fpath)

    if 'tracks' in cnf.annotation and cnf.annotation['tracks']:
        track_fapths = []
        for track_name in cnf.annotation['tracks']:
            if isfile(track_name) and verify_file(track_name):
                track_fapths.append(track_name)
            else:
                if 'tracks' in cnf['genome'] and cnf['genome'][
                        'tracks'] and track_name in cnf['genome']['tracks']:
                    track_fpath = cnf['genome']['tracks'][track_name]
                    if verify_file(track_fpath):
                        track_fapths.append(track_fpath)
        for track_fapth in track_fapths:
            res = _tracks(cnf, track_fapth, vcf_fpath)
            if res:
                vcf_fpath = res

    step_greetings('Intersection with database VCFs...')
    if 'intersect_with' in cnf.annotation:
        for key, db_fpath in cnf.annotation['intersect_with'].items():
            res = intersect_vcf(cnf,
                                input_fpath=vcf_fpath,
                                db_fpath=db_fpath,
                                key=key)
            if res:
                vcf_fpath = res

    if 'mongo' in cnf.annotation:
        res = _mongo(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    return vcf_fpath
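Each annotation round above is a single bcftools annotate call that copies selected INFO fields (plus the ID column for dbSNP and COSMIC) from an indexed reference VCF into the working VCF. A stripped-down sketch of one round outside the cnf/call wrappers (paths and field names are placeholders):

import subprocess

def annotate_once(vcf_gz, db_vcf_gz, fields, out_vcf):
    # fields, e.g. ['INFO/CAF', 'ID'], are the columns copied from the annotation VCF;
    # both VCFs are expected to be bgzipped and tabix-indexed.
    cmd = ['bcftools', 'annotate', '-a', db_vcf_gz, '-c', ','.join(fields),
           '-o', out_vcf, vcf_gz]
    subprocess.check_call(cmd)
    return out_vcf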