def leave_main_sample(cnf, vcf_fpath, samplename):
    """Strip all sample columns from a multi-sample VCF except `samplename`.

    Returns the path to the reduced VCF, the original path unchanged if
    the sample column cannot be located, or None if the output file
    fails verification.
    """
    index = get_sample_column_index(vcf_fpath, samplename)
    if index is None:
        return vcf_fpath

    info('Keeping SAMPLE only for the first sample (' + samplename + ')')

    def _f(line, i):
        # Rewrite the #CHROM header and data lines: keep the 9 fixed VCF
        # columns plus the single column belonging to the requested sample.
        if line and (line.startswith('#CHROM') or line[0] != '#'):
            ts = line.split('\t')
            return '\t'.join(ts[:9] + [ts[9 + index]])
        return line
    vcf_fpath = iterate_file(cnf, vcf_fpath, _f, suffix='1sm')

    if not verify_file(vcf_fpath):
        err('Error: leave_first_sample didnt generate output file.')
        return None

    return vcf_fpath
def remove_prev_eff_annotation(cnf, input_fpath):
    """Remove existing SnpEff annotations from a VCF: the ##SnpEff header,
    the ##INFO declarations for EFF/ANN, and EFF/ANN pairs in the INFO
    column of data records.
    """
    fields_to_del = ['EFF', 'ANN']

    def proc_line(l, i):
        if l.startswith('##SnpEff'):
            return None

        elif any(f in l for f in fields_to_del):
            if l.startswith('##INFO='):
                # Drop ##INFO header lines declaring EFF or ANN
                # (format: ##INFO=<ID=EFF,...>).
                try:
                    if l.split('=', 1)[1].split(',', 1)[0].split('=')[1] in fields_to_del:
                        return None
                except IndexError:
                    critical('Incorrect VCF at line: ' + l)

            elif not l.startswith('#'):
                # Rebuild the INFO column without EFF/ANN pairs.
                # maxsplit=1 preserves values that themselves contain '='
                # (the previous unbounded split silently dropped them).
                fields = l.split('\t')
                info_pairs = [attr.split('=', 1) for attr in fields[7].split(';')]
                kept = [pair for pair in info_pairs if pair[0] not in fields_to_del]
                info_line = ';'.join('='.join(pair) if len(pair) == 2 else pair[0]
                                     for pair in kept)
                # VCF requires '.' rather than an empty INFO column.
                fields = fields[:7] + [info_line or '.'] + fields[8:]
                return '\t'.join(fields)
        return l

    return iterate_file(cnf, input_fpath, proc_line, suffix='noEFF')
# Exemplo n.º 3
def _tracks(cnf, track_fpath, input_fpath):
    """Annotate a VCF with a BED track using vcfannotate, then rewrite the
    added INFO values as TRUE/FALSE flags.

    The INFO key is derived from the track file's base name. Returns the
    annotated VCF path, or None if the track file or the vcfannotate
    executable is missing.
    """
    if not verify_file(track_fpath):
        return None

    # INFO field name = track file name with all extensions stripped.
    field_name = splitext_plus(basename(track_fpath))[0]

    step_greetings('Intersecting with ' + field_name)

    output_fpath = intermediate_fname(cnf, input_fpath, field_name)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    # Reuse a previously produced intermediate when allowed.
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    toolpath = get_system_path(cnf, 'vcfannotate')
    if not toolpath:
        err('WARNING: Skipping annotation with tracks: vcfannotate '
            'executable not found, you probably need to specify path in system_config, or '
            'run load bcbio:  . /group/ngs/bin/bcbio-prod.sh"')
        return None

    # self.all_fields.append(field_name)

    cmdline = '{toolpath} -b {track_fpath} -k {field_name} {input_fpath}'.format(
        **locals())

    assert input_fpath
    output_fpath = call_subprocess(cnf,
                                   cmdline,
                                   input_fpath,
                                   output_fpath,
                                   stdout_to_outputfile=True,
                                   overwrite=True)
    # NOTE(review): on verification failure the (possibly bad) path is
    # still returned rather than None — presumably deliberate; confirm
    # against callers.
    if not verify_vcf(output_fpath):
        err('Error: tracks resulted ' + str(output_fpath) + ' for ' +
            track_fpath)
        return output_fpath

    # Set TRUE or FALSE for tracks
    def proc_line(line, i):
        # On data lines, rewrite 'field_name=<value>' INFO pairs as
        # 'field_name=TRUE' (truthy value) or 'field_name=FALSE'.
        if field_name in line:
            if not line.startswith('#'):
                fields = line.split('\t')
                info_line = fields[7]
                info_pairs = [attr.split('=') for attr in info_line.split(';')]
                info_pairs = [[pair[0], ('TRUE' if pair[1] else 'FALSE')] if
                              pair[0] == field_name and len(pair) > 1 else pair
                              for pair in info_pairs]
                info_line = ';'.join(
                    '='.join(pair) if len(pair) == 2 else pair[0]
                    for pair in info_pairs)
                fields = fields[:7] + [info_line] + fields[8:]
                return '\t'.join(fields)
        return line

    assert output_fpath
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='trk')
    return verify_vcf(output_fpath, is_critical=True)
# Exemplo n.º 4
def remove_comments(cnf, bed_fpath):
    """Drop every '#'-prefixed comment line from a BED file."""
    def _keep_non_comment(l, i):
        return None if l.startswith('#') else l

    return iterate_file(cnf, bed_fpath, _keep_non_comment, suffix='rmcmt')
# Exemplo n.º 5
def fix_vcf_sample_name(cnf, sample_name, vcf_fpath, output_fpath=None):
    """Rename the first sample in a VCF: column 10 of the #CHROM header
    and any SAMPLE=... entry in the INFO column of data lines.

    Returns the bgzipped and tabix-indexed fixed VCF path.
    """
    output_fpath = output_fpath or intermediate_fname(cnf, vcf_fpath, 'sample')

    def fix_sample_name(l, i):
        if l.startswith('#CHROM'):
            fs = l.split('\t')
            fs[9] = sample_name
            l = '\t'.join(fs)
        elif not l.startswith('#'):
            fs = l.split('\t')
            kvs = fs[7].split(';')
            # 'j' avoids shadowing the line-number parameter 'i'.
            for j, kv in enumerate(kvs):
                if kv.startswith('SAMPLE='):
                    kvs[j] = 'SAMPLE=' + sample_name
            # Rebuild via a field list so no trailing tab is emitted when
            # the record has no FORMAT/genotype columns (exactly 8 fields).
            l = '\t'.join(fs[:7] + [';'.join(kvs)] + fs[8:])
        return l
    fixed_vcf = iterate_file(cnf, vcf_fpath, fix_sample_name, output_fpath=output_fpath)
    return bgzip_and_tabix(cnf, fixed_vcf)
# Exemplo n.º 6
def _filter_malformed_fields(cnf, input_fpath):
    """Correct malformed fields in a VCF: strip '.' placeholder entries
    from the ends of list-valued INFO fields, and fix header quoting so
    vcf-merge accepts the file.
    """
    step_greetings('Correcting malformed fields...')

    def proc_rec(rec):
        for k, v in rec.INFO.items():
            # Guard against empty lists: the unguarded v[-1] lookup
            # raised IndexError on an INFO value that is [].
            if isinstance(v, list) and v:
                if v[-1] == '.':
                    rec.INFO[k] = rec.INFO[k][:-1]
                if v[0] == '.':
                    rec.INFO[k] = rec.INFO[k][1:]
        return rec

    def proc_line(line, i):
        # vcf-merge chokes on the <"' "> quoting produced upstream.
        if line.startswith('#'):
            return line.replace("\' \">", "\'\">")  # For vcf-merge
        return line

    info('Correcting INFO fields...')
    output_fpath = iterate_vcf(cnf, input_fpath, proc_rec, suffix='corr')
    info('')
    info('Correcting headers for vcf-merge...')
    output_fpath = iterate_file(cnf,
                                output_fpath,
                                proc_line,
                                suffix='corr_headr')

    return output_fpath
# Exemplo n.º 7
def filter_bed_with_gene_set(cnf, bed_fpath, gene_keys_set, suffix=None):
    """Keep only BED records whose (gene, chrom) key is in gene_keys_set.

    The gene column (4th) is rewritten to contain only the matching gene
    names. Records with fewer than 4 columns, or with no matching gene,
    are dropped.
    """
    def fn(l, i):
        if l:
            fs = l.split('\t')
            if len(fs) < 4:
                return None
            c = fs[0]
            new_gns = [g for g in fs[3].split(',') if (g, c) in gene_keys_set]
            if new_gns:
                # Rebuild the line field-by-field instead of using
                # l.replace(fs[3], ...): str.replace substitutes EVERY
                # occurrence of the gene text, so e.g. a gene named '1'
                # would also corrupt the chromosome and coordinates.
                return '\t'.join(fs[:3] + [','.join(new_gns)] + fs[4:])

    return iterate_file(cnf,
                        bed_fpath,
                        fn,
                        suffix=suffix or 'filt_genes',
                        check_result=False)
# Exemplo n.º 8
def add_annotation(cnf, input_fpath, key, value, number, type_, description):
    """Set INFO[key] = value on every VCF record and declare the matching
    ##INFO header line before #CHROM."""
    step_greetings('Adding annotation...')

    def _set_info(rec):
        rec.INFO[key] = value
        return rec

    output_fpath = iterate_vcf(cnf, input_fpath, _set_info)

    info('Adding header meta info...')

    # Header line declaring the new INFO field, built once.
    header_line = '##INFO=<ID={key},Number={number},Type={type_},Description="{desc}">\n'.format(
        key=key, number=number, type_=type_, desc=description)

    def _insert_header(l, i):
        return header_line + l if l.startswith('#CHROM') else l

    output_fpath = iterate_file(cnf, output_fpath, _insert_header)
    return verify_vcf(output_fpath, is_critical=True)
def remove_rejected(cnf, input_fpath, output_fpath=None):
    """Keep only passing records in a VCF.

    A record whose FILTER equals the VCF's own q<threshold> value is also
    kept (unchanged) when that threshold is stricter than the configured
    variant_filtering.min_q_mean.
    """
    qual_threshold = _get_qual_threshold(input_fpath)
    info('VCF QUAL threshold is ' + str(qual_threshold))
    keep_q_filtered = qual_threshold > cnf.variant_filtering.min_q_mean
    if keep_q_filtered:
        info('Requested QUAL threshold is ' + str(cnf.variant_filtering.min_q_mean) +
             ', which is higher than in VCF, so keeping records with FILTER=q' + str(qual_threshold))

    q_filter_value = 'q' + str(qual_threshold)

    def fn(l, i):
        if l.startswith('#'):
            return l
        fs = l.split('\t')
        if keep_q_filtered and fs[6] == q_filter_value:
            fs[6] = 'PASS'
        # The original line is returned as-is (the FILTER rewrite above
        # only marks the record as acceptable).
        return l if fs[6] == 'PASS' else None

    return iterate_file(cnf, input_fpath, fn, suffix='pass')
# Exemplo n.º 10
def prep_bed_for_seq2c(cnf, bed_fpath):
    """Prepare a BED file for Seq2C: trim the column count and drop
    regions lacking a gene annotation."""
    info()
    info('Doing some Seq2C specific preparation of the BED file...')

    cols = count_bed_cols(bed_fpath)

    # Trim to 4 columns when between 5 and 7, to 8 when more than 8;
    # otherwise the file is used as-is.
    if 4 < cols < 8:
        seq2c_bed = cut(cnf, bed_fpath, 4)
    elif cols > 8:
        seq2c_bed = cut(cnf, bed_fpath, 8)
    else:
        seq2c_bed = bed_fpath

    if cols >= 4:
        # Drop regions whose gene column is the '.' placeholder.
        def _drop_unannotated(l, i):
            return None if l.split('\t')[3].strip() == '.' else l

        seq2c_bed = iterate_file(cnf, seq2c_bed, _drop_unannotated, suffix='filt')

    info('Done: ' + seq2c_bed)
    return seq2c_bed
# Exemplo n.º 11
def prepare_beds(cnf, features_bed=None, target_bed=None, seq2c_bed=None):
    """Verify, sort, trim and annotate the features/target/seq2c BED files.

    Returns a 4-tuple (features_bed, features_no_genes_bed, target_bed,
    seq2c_bed); any element may be None. When no seq2c BED is given it is
    derived from the target BED, or falls back to the genome CDS file.
    """
    if features_bed is None and target_bed is None:
        warn(
            'No input target BED, and no features BED in the system config specified. Not making detailed per-gene reports.'
        )
        # return None, None, None, None

    if target_bed:
        target_bed = verify_bed(target_bed, is_critical=True)

    if seq2c_bed:
        seq2c_bed = verify_bed(seq2c_bed, is_critical=True)

    if features_bed:
        features_bed = verify_bed(features_bed, is_critical=True)

    # if features_bed and target_bed and abspath(features_bed) == abspath(target_bed):
    #     warn('Same file used for exons and amplicons: ' + features_bed)

    # Features
    features_no_genes_bed = None
    if features_bed:
        # info()
        # info('Merging regions within genes...')
        # exons_bed = group_and_merge_regions_by_gene(cnf, exons_bed, keep_genes=True)
        #
        # info()
        # info('Sorting exons by (chrom, gene name, start)')
        # exons_bed = sort_bed(cnf, exons_bed)

        info()
        info(
            'Filtering the features bed file to have only non-gene and no-transcript records...'
        )
        # Drop whole-Gene and whole-Transcript records, keeping only
        # exon-level features.
        features_no_genes_bed = intermediate_fname(cnf, features_bed,
                                                   'no_genes')
        call(cnf,
             'grep -vw Gene ' + features_bed + ' | grep -vw Transcript',
             output_fpath=features_no_genes_bed)

    # Remember the original target path to detect seq2c_bed == target below.
    ori_target_bed_path = target_bed
    if target_bed:
        info()
        info('Remove comments in target...')
        target_bed = remove_comments(cnf, target_bed)

        info()
        info('Cut -f1,2,3,4 target...')
        target_bed = cut(cnf, target_bed, 4)

        info()
        info('Sorting target...')
        target_bed = sort_bed(cnf, target_bed)

        cols = count_bed_cols(target_bed)
        if cnf.reannotate or cols < 4:
            info()
            # critical() is expected to abort, so the info() below should
            # only run when features_bed is set.
            if not features_bed:
                critical(
                    str(cols) +
                    ' columns (less than 4), and no features to annotate regions '
                    '(please make sure you have set the "features" key in the corresponding genome section '
                    '(' + cnf.genome.name + ') in ' + cnf.sys_cnf)
            info(
                'cnf.reannotate is ' + str(cnf.reannotate) +
                ', and cols in the target BED is ' + str(cols) +
                '. Annotating target with the gene names from the "features" file '
                + features_bed + '...')
            target_bed = annotate_target(cnf, target_bed)

    # Drops BED records whose gene column is the '.' placeholder.
    def remove_no_anno(l, i):
        if l.split('\t')[3].strip() == '.': return None
        else: return l

    if not seq2c_bed and target_bed or seq2c_bed and seq2c_bed == ori_target_bed_path:
        # Seq2C BED is (or equals) the target: reuse the processed target.
        info('Seq2C bed: remove regions with no gene annotation')
        seq2c_bed = target_bed
        seq2c_bed = iterate_file(cnf, seq2c_bed, remove_no_anno, suffix='filt')

    elif seq2c_bed:
        info()
        info('Remove comments in seq2c bed...')
        seq2c_bed = remove_comments(cnf, seq2c_bed)

        info()
        info('Sorting seq2c bed...')
        seq2c_bed = sort_bed(cnf, seq2c_bed)

        # Normalize column count: annotate when genes are missing,
        # otherwise trim to 4 or 8 columns.
        cols = count_bed_cols(seq2c_bed)
        if cols < 4:
            info()
            info('Number columns in SV bed is ' + str(cols) +
                 '. Annotating amplicons with gene names...')
            seq2c_bed = annotate_target(cnf, seq2c_bed)
        elif 8 > cols > 4:
            seq2c_bed = cut(cnf, seq2c_bed, 4)
        elif cols > 8:
            seq2c_bed = cut(cnf, seq2c_bed, 8)
        info('Filtering non-annotated entries in seq2c bed')
        seq2c_bed = iterate_file(cnf, seq2c_bed, remove_no_anno, suffix='filt')

    else:
        # No seq2c BED and no target: fall back to the genome-wide CDS BED.
        seq2c_bed = verify_bed(cnf.genome.cds)

    if target_bed:
        info()
        # info('Merging amplicons...')
        # target_bed = group_and_merge_regions_by_gene(cnf, target_bed, keep_genes=False)

        info('Sorting target by (chrom, gene name, start)')
        target_bed = sort_bed(cnf, target_bed)

    return features_bed, features_no_genes_bed, target_bed, seq2c_bed
# Exemplo n.º 12
def _snpsift_annotate(cnf, vcf_conf, dbname, input_fpath):
    """Annotate a VCF against a database VCF with 'SnpSift annotate'.

    `vcf_conf` supplies the database path and the list of INFO annotations
    to transfer; the database path may also come from cnf['genome'][dbname].
    Returns the annotated VCF path, or None if the database is missing.
    """
    if not vcf_conf:
        err('No database for ' + dbname + ', skipping.')
        return None

    step_greetings('Annotating with ' + dbname)

    output_fpath = intermediate_fname(cnf, input_fpath, dbname)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    # Reuse a previously produced intermediate when allowed.
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')
    java = get_system_path(cnf, 'java')
    info('Java version:')
    call(cnf, java + ' -version')
    info()

    # Database path: genome config first, then the per-db vcf_conf.
    db_path = cnf['genome'].get(dbname)
    if not db_path:
        db_path = vcf_conf.get('path')
        if not db_path:
            err('Please, provide a path to ' + dbname +
                ' in the "genomes" section in the system config. The config is: '
                + str(cnf['genome']))
            return
        verify_file(db_path, is_critical=True)

    annotations = vcf_conf.get('annotations')

    if not cnf.no_check:
        info('Removing previous annotations...')

        # Strip any stale copies of the requested annotations before
        # re-annotating.
        def delete_annos(rec):
            for anno in annotations:
                if anno in rec.INFO:
                    del rec.INFO[anno]
            return rec

        if annotations:
            input_fpath = iterate_vcf(cnf,
                                      input_fpath,
                                      delete_annos,
                                      suffix='d')

    anno_line = ''
    if annotations:
        anno_line = '-info ' + ','.join(annotations)

    cmdline = '{executable} annotate -v {anno_line} {db_path} {input_fpath}'.format(
        **locals())
    output_fpath = call_subprocess(cnf,
                                   cmdline,
                                   input_fpath,
                                   output_fpath,
                                   stdout_to_outputfile=True,
                                   exit_on_error=False,
                                   overwrite=True)
    if not output_fpath:
        err('Error: snpsift resulted ' + str(output_fpath) + ' for ' + dbname)
        return output_fpath
    verify_vcf(output_fpath, is_critical=True)
    # f = open(output_fpath)
    # l = f.readline()
    # if 'Cannot allocate memory' in l:
    #     f.close()
    #     f = open(output_fpath)
    #     contents = f.read()
    #     critical('SnpSift failed with memory issue:\n' + contents)
    #     f.close()
    #     return None

    if not cnf.no_check:
        # Matches a well-formed ##INFO header so it can be re-emitted
        # without stray whitespace between the fields.
        info_pattern = re.compile(
            r'''\#\#INFO=<
            ID=(?P<id>[^,]+),\s*
            Number=(?P<number>-?\d+|\.|[AG]),\s*
            Type=(?P<type>Integer|Float|Flag|Character|String),\s*
            Description="(?P<desc>[^"]*)"
            >''', re.VERBOSE)

        def _fix_after_snpsift(line, i, ctx):
            # Data lines before the #CHROM header are dropped; spaces in
            # data lines (which SnpSift can introduce) become underscores.
            if not line.startswith('#'):
                if not ctx['met_CHROM']:
                    return None
                line = line.replace(' ', '_')
                assert ' ' not in line

            # elif line.startswith('##INFO=<ID=om'):
            #     line = line.replace(' ', '')

            elif not ctx['met_CHROM'] and line.startswith('#CHROM'):
                ctx['met_CHROM'] = True

            elif line.startswith('##INFO'):
                # Normalize whitespace in ##INFO headers.
                m = info_pattern.match(line)
                if m:
                    line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
                        m.group('id'), m.group('number'), m.group('type'),
                        m.group('desc'))
            return line

        output_fpath = iterate_file(cnf,
                                    output_fpath,
                                    _fix_after_snpsift,
                                    suffix='fx',
                                    ctx=dict(met_CHROM=False))

    return verify_vcf(output_fpath, is_critical=True)
# Exemplo n.º 13
def intersect_vcf(cnf, input_fpath, db_fpath, key):
    """Intersect a VCF with a database VCF and copy its DP/MQ values into
    the input's FORMAT columns under '<key>_DP' / '<key>_MQ' (dots in
    `key` replaced by underscores).

    The db VCF is rewritten so its sample DP/MQ become INFO fields, then
    SnpSift transfers them to the input, and finally the values are moved
    from INFO into every sample's FORMAT column. The result replaces
    `input_fpath` in place via os.rename.
    """
    vcf_fpath = input_fpath

    db_fpath = verify_file(db_fpath)
    if not db_fpath:
        return None

    info('Intersecting with ' + db_fpath + ', writing key ' + str(key))

    info('Preparing db...')

    def _add_info_flag(l, i):
        # Copy DP and MQ from the db's sample FORMAT data into its INFO
        # column, keyed as '<key>_DP' / '<key>_MQ'.
        # NOTE(review): assumes the db VCF's last three columns are INFO,
        # FORMAT keys, and a single sample's values — confirm for
        # multi-sample dbs.
        if l.startswith('#'):
            return l
        fs = l.split('\t')
        info_col, ft_keys, ft_vals = fs[-3], fs[-2], fs[-1]
        ft_dict = dict(zip(ft_keys.split(':'), ft_vals.split(':')))
        for ann in ['DP', 'MQ']:
            val = ft_dict.get(ann, None)
            if val:
                # ft_keys[key.replace('.', '_') + '_' + ann] = val
                # del ft_keys[ann]
                info_col += ';' + key.replace('.', '_') + '_' + ann + '=' + val
        # ft_items = ft_dict.items()
        # ft_keys = [k for k, v in ft_items]
        # ft_vals = [v for k, v in ft_items]
        # return '\t'.join(fs[:-2]) + '\t' + ':'.join(ft_keys) + '\t' + ':'.join(ft_vals)
        return '\t'.join(fs[:-3]) + '\t' + info_col + '\t' + '\t'.join(fs[-2:])
        # rec.FORMAT[key.replace('.', '_') + '_DP'] = rec.genotype(key.split('.')[0])['DP']
        # rec.INFO[key.replace('.', '_') + '_MQ'] = rec.genotype(key.split('.')[0])['MQ']
        # return rec

    # db_fpath = iterate_vcf(cnf, db_fpath, _add_info_flag, suffix='INFO_FLAGS')
    db_fpath = iterate_file(cnf, db_fpath, _add_info_flag, suffix='INFO_FLAGS')

    info('Adding header meta info...')

    def _add_header(l, i):
        # Declare the new INFO fields just before the #CHROM header.
        if l.startswith('#CHROM'):
            ext_l = ''
            for ann in ['DP', 'MQ']:
                ext_l += '##INFO=<ID=' + key.replace(
                    '.', '_'
                ) + '_' + ann + ',Number=1,Type=Integer,Description="description">\n'
            return ext_l + l
        return l

    db_fpath = iterate_file(cnf, db_fpath, _add_header, suffix='INFO_HEADER')

    # out_fpath = add_suffix(db_fpath, 'HEADERS')
    # if cnf.reuse_intermediate and verify_file(out_fpath, silent=True):
    #     info(out_fpath + ' exists, reusing')
    # else:
    #     reader = vcf_parser.Reader(open(db_fpath))
    #     for k in 'DP', 'MQ':
    #         k = k + '_' + key.replace('.', '_')
    #         reader.infos[k] = _Info(id=k, num=1, type='Integer', desc=k + ' ' + key)
    #
    #     with file_transaction(cnf.work_dir, out_fpath) as tx:
    #         recs = []
    #         cnt = 0
    #         with open(tx, 'w') as f:
    #             writer = vcf_parser.Writer(f, reader)
    #             while True:
    #                 cnt += 1
    #                 rec = next(reader, None)
    #                 if rec is None:
    #                     break
    #                 recs.append(rec)
    #                 if cnt % 1000000 == 0:
    #                     info('Written ' + str(cnt) + ' lines')
    #                     writer.write_records(recs)
    #                     recs = []
    #             writer.write_records(recs)
    #     db_fpath = out_fpath
    db_fpath = bgzip_and_tabix(cnf, db_fpath)

    info('Annotating using this db...')
    vcf_conf = {
        'path':
        db_fpath,
        'annotations':
        [key.replace('.', '_') + '_DP',
         key.replace('.', '_') + '_MQ']
    }
    vcf_fpath = _snpsift_annotate(cnf, vcf_conf, key, vcf_fpath)

    info('Moving INFO to FORMAT...')

    def _move_info_to_format(l, i):
        # Append '<key>_DP'/'<key>_MQ' (or '.' when absent in INFO) to the
        # FORMAT keys and to every sample's value column.
        if l.startswith('#'):
            return l
        fs = l.split('\t')
        info_dict = dict([
            kv.split('=') if '=' in kv else (kv, True)
            for kv in fs[7].split(';')
        ])
        ft_keys = fs[8].split(':')
        all_ft_vals = [ft_vals.split(':') for ft_vals in fs[9:]]
        # OrderedDict preserves the original FORMAT key order.
        ft_dicts = [
            OrderedDict(zip(ft_keys, ft_vals)) for ft_vals in all_ft_vals
        ]
        for ann in ['DP', 'MQ']:
            k = key.replace('.', '_') + '_' + ann
            for ft_dict in ft_dicts:
                ft_dict[k] = info_dict.get(k, '.')
        all_ft_vals = []
        for ft_dict in ft_dicts:
            ft_items = ft_dict.items()
            ft_keys = [k for k, v in ft_items]
            all_ft_vals.append([v for k, v in ft_items])
        l = '\t'.join(fs[:8]) + '\t' + ':'.join(ft_keys)
        for ft_vals in all_ft_vals:
            l += '\t' + ':'.join(ft_vals)
        return l
        # rec.FORMAT[key.replace('.', '_') + '_DP'] = rec.genotype(key.split('.')[0])['DP']
        # rec.INFO[key.replace('.', '_') + '_MQ'] = rec.genotype(key.split('.')[0])['MQ']
        # return rec

    # db_fpath = iterate_vcf(cnf, db_fpath, _add_info_flag, suffix='INFO_FLAGS')
    vcf_fpath = iterate_file(cnf,
                             vcf_fpath,
                             _move_info_to_format,
                             suffix='FORMAT_FLAGS')

    info('Adding FORMAT header meta info...')

    def _add_format_header(l, i):
        # Declare the new FORMAT fields just before the #CHROM header.
        if l.startswith('#CHROM'):
            ext_l = ''
            ext_l += '##FORMAT=<ID=' + key.replace(
                '.', '_'
            ) + '_DP,Number=1,Type=Integer,Description="Number of high-quality bases">\n'
            ext_l += '##FORMAT=<ID=' + key.replace(
                '.', '_'
            ) + '_MQ,Number=1,Type=Integer,Description="Average mapping quality">\n'
            return ext_l + l
        return l

    vcf_fpath = iterate_file(cnf,
                             vcf_fpath,
                             _add_format_header,
                             suffix='FORMAT_HEADER')

    info()
    if vcf_fpath:
        info('Renaming ' + vcf_fpath + ' -> ' + input_fpath)
        os.rename(vcf_fpath, input_fpath)
    else:
        warn('Intersection with ' + key + ' didn\'t work')
    return input_fpath