def igvtools_index(cnf, vcf_fpath):
    """Run ``igvtools index`` on a VCF file.

    The tool is looked up with ``get_system_path``; when the resolved path
    is a ``.jar`` file, a Java command line is requested from
    ``get_java_tool_cmdline`` instead.

    :param cnf: run configuration passed through to the lookup helpers
        and to ``call``.
    :param vcf_fpath: path to the VCF file to index.
    :return: ``vcf_fpath + '.idx'``, or ``None`` when igvtools cannot be
        located. The result of the indexing command itself is not
        inspected (it is run with ``exit_on_error=False``).
    """
    tool = get_system_path(cnf, 'igvtools')
    if not tool:
        err('Warning: no igvtools found, cannot index VCF.')
        return None

    if tool.endswith('.jar'):
        # A jar cannot be executed directly; ask for the java wrapper.
        tool = get_java_tool_cmdline(cnf, 'igvtools')
        if tool is None:
            err('Warning: no jar igvtools found, cannot index VCF.')
            return None

    index_cmd = '{0} index {1}'.format(tool, vcf_fpath)
    call(cnf, index_cmd, exit_on_error=False)

    # Best-effort cleanup of an 'igv.log' file in the current directory
    # (presumably written by igvtools); a failed removal is ignored.
    log_fname = 'igv.log'
    if exists(log_fname):
        try:
            os.remove(log_fname)
        except OSError:
            pass

    return vcf_fpath + '.idx'
예제 #2
0
def combine_vcfs(cnf,
                 vcf_fpath_by_sname,
                 combined_vcf_fpath,
                 additional_parameters=''):
    """Merge per-sample VCFs into one file with GATK CombineVariants.

    :param cnf: run configuration; ``cnf.genome.seq`` is passed as the
        ``-R`` reference and ``cnf.reuse_intermediate`` enables reuse of an
        existing result.
    :param vcf_fpath_by_sname: mapping of sample name to VCF path; entries
        with a falsy path are skipped.
    :param combined_vcf_fpath: path of the combined (uncompressed) VCF.
    :param additional_parameters: extra text inserted into the GATK
        command line.
    :return: whatever ``bgzip_and_tabix`` returns for the combined file,
        the existing ``<combined_vcf_fpath>.gz`` when it is reused, or
        ``None`` when GATK is missing, there is nothing to combine, or the
        GATK call fails.
    """
    gatk = get_java_tool_cmdline(cnf, 'gatk')
    if not gatk:
        info('GATK is not found, skipping merging VCFs')
        return None

    cmdl = '{gatk} -T CombineVariants -R {cnf.genome.seq} {additional_parameters}'.format(
        gatk=gatk, cnf=cnf, additional_parameters=additional_parameters)

    # One "--variant:<sample> <path>" argument per sample that has a VCF.
    variant_args = [' --variant:' + sample + ' ' + fpath
                    for sample, fpath in vcf_fpath_by_sname.items()
                    if fpath]
    cmdl += ''.join(variant_args)
    if ' --variant:' not in cmdl:
        err('No VCFs to combine')
        return None

    if cnf.reuse_intermediate:
        gz_fpath = combined_vcf_fpath + '.gz'
        if isfile(gz_fpath) and verify_vcf(gz_fpath):
            info(gz_fpath + ' exists, reusing')
            return gz_fpath

    cmdl += ' -o ' + combined_vcf_fpath
    succeeded = call(cnf,
                     cmdl,
                     output_fpath=combined_vcf_fpath,
                     stdout_to_outputfile=False,
                     exit_on_error=False)
    if not succeeded:
        warn('Could not join VCFs')
        return None

    info('Joined VCFs, saved into ' + combined_vcf_fpath)

    # Remove a leftover '<combined>.tx.idx' file if one is present;
    # a failed removal is logged but does not abort the step.
    tx_idx_fpath = combined_vcf_fpath + '.tx.idx'
    if isfile(tx_idx_fpath):
        try:
            os.remove(tx_idx_fpath)
        except OSError:
            err(traceback.format_exc())
            info()

    return bgzip_and_tabix(cnf, combined_vcf_fpath)
def get_trasncripts_fpath(cnf):
    """Return the transcripts file path, dumping it with snpEff if needed.

    Resolution order:

    1. ``cnf.transcripts_fpath`` when it is set and passes ``verify_file``
       (a set-but-invalid file that exists on disk is deleted);
    2. ``<cnf.work_dir>/snpeff_transcripts.txt`` when it already exists
       and passes ``verify_file``;
    3. a fresh ``snpeff dump ... -txt <genome>`` written to that same
       work-dir file.

    On success in steps 2 and 3, ``cnf.transcripts_fpath`` is updated.

    :param cnf: run configuration.
    :return: the value of ``cnf.transcripts_fpath`` at the end of the
        lookup (left unchanged if the dump call reports failure).
    """
    configured_fpath = cnf.transcripts_fpath
    if configured_fpath:
        if verify_file(configured_fpath):
            return configured_fpath

        # The configured file did not verify: drop it from disk.
        if isfile(configured_fpath):
            os.remove(configured_fpath)

    # NOTE: a former option to take the path from
    # cnf['snpeff'].get('only_transcripts') is disabled.

    dump_transcript_fpath = join(cnf.work_dir, 'snpeff_transcripts.txt')
    if isfile(dump_transcript_fpath) and verify_file(dump_transcript_fpath):
        cnf.transcripts_fpath = dump_transcript_fpath
        return cnf.transcripts_fpath

    snpeff = get_java_tool_cmdline(cnf, 'snpeff')
    if not snpeff:
        critical('No snpeff or it is incorrect path in system config.')

    # The data directory is optional; without it no -dataDir is passed.
    db_path = cnf['genome'].get('snpeff')
    db_path_cmdline = (' -dataDir ' + db_path) if db_path else ''

    # Clear any stale (unverified) dump before regenerating it.
    if isfile(dump_transcript_fpath):
        os.remove(dump_transcript_fpath)

    cmdline = '{snpeff} dump {db_path_cmdline} -v -txt {genome}'.format(
        snpeff=snpeff,
        db_path_cmdline=db_path_cmdline,
        genome=cnf.genome.name)
    if call(cnf, cmdline, output_fpath=dump_transcript_fpath):
        cnf.transcripts_fpath = dump_transcript_fpath

    return cnf.transcripts_fpath
예제 #4
0
def _snpsift_db_nsfp(cnf, input_fpath):
    """Annotate a VCF using the SnpSift ``dbnsfp`` command.

    The step runs only when both ``cnf.annotation`` and ``cnf.genome``
    contain a ``'dbnsfp'`` entry.

    :param cnf: run configuration.
    :param input_fpath: VCF file to annotate.
    :return: the existing intermediate file when it is reused, the result
        of ``verify_vcf`` on the new output when the command succeeds, or
        ``None`` when the step is not configured, the database file does
        not verify, or the command fails.
    """
    if not ('dbnsfp' in cnf.annotation and 'dbnsfp' in cnf.genome):
        return None

    step_greetings('DB SNFP')

    result_fpath = intermediate_fname(cnf, input_fpath, 'db_nsfp')
    if result_fpath.endswith('.gz'):
        # Output is written uncompressed: drop the '.gz' suffix.
        result_fpath = result_fpath[:-3]

    if cnf.reuse_intermediate and verify_vcf(result_fpath):
        info('VCF ' + result_fpath + ' exists, reusing...')
        return result_fpath

    snpsift = get_java_tool_cmdline(cnf, 'snpsift')

    db_fpath = cnf['genome']['dbnsfp']
    if not verify_file(db_fpath, 'DB NSFP file'):
        err('DB NSFP file is incorrect. Skipping.')
        return None

    # Optional list of fields, passed to SnpSift as "-f a,b,c".
    fields = cnf.annotation['dbnsfp'].get('annotations') or []
    fields_opt = ''
    if fields:
        fields_opt = '-f ' + ','.join(fields)

    cmdline = '{executable} dbnsfp {ann_line} -v -db {db_path} ' \
              '{input_fpath}'.format(executable=snpsift,
                                     ann_line=fields_opt,
                                     db_path=db_fpath,
                                     input_fpath=input_fpath)
    succeeded = call_subprocess(cnf,
                                cmdline,
                                input_fpath,
                                result_fpath,
                                stdout_to_outputfile=True,
                                exit_on_error=False,
                                overwrite=True)
    if not succeeded:
        return None
    return verify_vcf(result_fpath, is_critical=True)
예제 #5
0
def _mongo(cnf, input_fpath):
    """Annotate a VCF through the bundled ``VCFStore.jar`` Mongo loader.

    The step greeting is emitted before the configuration check, so it is
    printed even when ``'mongo'`` is not in ``cnf.annotation``.

    :param cnf: run configuration; ``cnf.project_name`` is passed as the
        ``-project`` argument.
    :param input_fpath: VCF file to annotate.
    :return: path of the output file when the command succeeds, ``None``
        when the step is not configured or the command fails.
    """
    step_greetings('Annotating from Mongo')

    if 'mongo' not in cnf.annotation:
        return None

    jar_relpath = join('ext_tools', 'mongo_loader', 'VCFStore.jar')
    loader = get_java_tool_cmdline(cnf, jar_relpath)
    annotated_fpath = intermediate_fname(cnf, input_fpath, 'mongo')

    cmdline = ('{executable} -module annotation -inputFile {input_fpath} '
               '-outputFile {output_fpath} -project {project_name} ').format(
                   executable=loader,
                   input_fpath=input_fpath,
                   output_fpath=annotated_fpath,
                   project_name=cnf.project_name)

    # The tool writes its own output file, so stdout is not captured.
    succeeded = call_subprocess(cnf,
                                cmdline,
                                input_fpath,
                                annotated_fpath,
                                stdout_to_outputfile=False,
                                exit_on_error=False)
    if not succeeded:
        return None
    return annotated_fpath
예제 #6
0
def _snpeff(cnf, input_fpath):
    """Annotate a VCF with ``snpEff eff``.

    The step runs only when both ``cnf.annotation`` and ``cnf.genome``
    contain a ``'snpeff'`` entry. The annotated VCF is written to an
    intermediate file (uncompressed), and snpEff CSV statistics are written
    to ``<work_dir>/<sample>[-<caller>].snpEff_summary.csv``.

    :param cnf: run configuration.
    :param input_fpath: VCF file to annotate.
    :return: a 3-tuple ``(vcf_fpath, stats_fpath, genes_fpath)`` where
        ``genes_fpath`` is the stats path with its extension replaced by
        ``.genes.txt``; ``(None, None, None)`` when the step is not
        configured, previous annotations could not be removed, or the
        snpEff call did not succeed.
    :raises AssertionError: if ``cnf.transcripts_fpath`` is not set.
    """
    import time
    import traceback

    if 'snpeff' not in cnf.annotation or 'snpeff' not in cnf.genome:
        return None, None, None

    step_greetings('SnpEff')

    output_fpath = intermediate_fname(cnf, input_fpath, 'snpEff')
    stats_fpath = join(
        cnf.work_dir, cnf.sample + (('-' + cnf.caller) if cnf.caller else '') +
        '.snpEff_summary.csv')
    genes_fpath = splitext(stats_fpath)[0] + '.genes.txt'

    if output_fpath.endswith('.gz'):
        # Output is written uncompressed: drop the '.gz' suffix.
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath, stats_fpath, genes_fpath

    snpeff = get_java_tool_cmdline(cnf, 'snpeff')

    # Map the genome build name onto a fixed snpEff database name.
    ref_name = cnf.genome.snpeff.reference or cnf.genome.name
    if ref_name.startswith(('hg19', 'GRCh37')):
        ref_name = 'GRCh37.75'
    if ref_name.startswith('hg38'):
        ref_name = 'GRCh38.82'

    opts = ''
    if cnf.annotation.snpeff.cancer:
        opts += ' -cancer'

    assert cnf.transcripts_fpath, 'Transcript for annotation must be specified!'
    verify_file(cnf.transcripts_fpath,
                'Transcripts for snpEff -onlyTr',
                is_critical=True)
    opts += ' -onlyTr ' + cnf.transcripts_fpath + ' '

    # Prefer an explicit data directory; otherwise fall back to a snpEff
    # config file when one is configured.
    db_path = adjust_system_path(cnf.genome.snpeff.data)
    if db_path:
        opts += ' -dataDir ' + db_path
    elif cnf.resources.snpeff.config:
        conf = get_system_path(cnf, cnf.resources.snpeff.config)
        if conf:
            opts += ' -c ' + conf + ' '
        else:
            err('Cannot find snpEff config file ' +
                str(cnf.resources.snpeff.config))

    # NOTE(review): extra_options is read but nothing is appended, so the
    # configured extra options never reach the command line. Left as-is
    # because it is unclear whether this was disabled on purpose - confirm.
    if cnf.annotation.snpeff.extra_options:
        opts += ''

    if not cnf.no_check:
        info('Removing previous snpEff annotations...')
        cleaned_fpath = remove_prev_eff_annotation(cnf, input_fpath)
        if not cleaned_fpath:
            err('Could not remove preivous snpEff annotations')
            return None, None, None
        input_fpath = cleaned_fpath

    # The statistics options differ between the two snpEff flavours
    # distinguished by get_snpeff_type.
    snpeff_type = get_snpeff_type(snpeff)
    if snpeff_type == "old":
        opts += ' -stats ' + stats_fpath + ' -csvStats'
    else:
        opts += ' -csvStats ' + stats_fpath

    cmdline = '{snpeff} eff {opts} -noLog -i vcf -o vcf {ref_name} {input_fpath}'.format(
        snpeff=snpeff, opts=opts, ref_name=ref_name, input_fpath=input_fpath)

    # Retry when launching the process raises OSError, waiting a minute
    # between attempts. The result is tracked in its own variable,
    # initialised to None, so that a run in which every attempt raised is
    # reported as a failure instead of picking up an unrelated earlier value
    # (or failing on an unbound name).
    call_res = None
    for attempt in range(1, 20):
        try:
            call_res = call_subprocess(cnf,
                                       cmdline,
                                       input_fpath,
                                       output_fpath,
                                       exit_on_error=False,
                                       stdout_to_outputfile=True,
                                       overwrite=True)
        except OSError:
            err(traceback.format_exc())
            warn()
            info('Waiting 1 minute')
            time.sleep(60)
            info('Rerunning ' + str(attempt))
        else:
            break

    output_fpath = verify_vcf(output_fpath, is_critical=True)

    # Remove 'snpEff_summary.html' from the current directory if present;
    # a failed removal is ignored.
    snpeff_summary_html_fpath = 'snpEff_summary.html'
    if isfile(snpeff_summary_html_fpath):
        info('SnpEff created ' + snpeff_summary_html_fpath +
             ' in the cwd, removing it...')
        try:
            os.remove(snpeff_summary_html_fpath)
        except OSError:
            pass

    if call_res:
        return output_fpath, stats_fpath, genes_fpath
    return None, None, None
예제 #7
0
def _snpsift_annotate(cnf, vcf_conf, dbname, input_fpath):
    """Annotate a VCF against a database with ``SnpSift annotate``.

    Unless ``cnf.no_check`` is set, the INFO keys listed in the database
    config are deleted from the input first, and the SnpSift output is
    post-processed line by line afterwards (see ``_fix_after_snpsift``).

    :param cnf: run configuration.
    :param vcf_conf: config section for this database; ``'path'`` and
        ``'annotations'`` are read from it.
    :param dbname: database name, used as the key in ``cnf['genome']`` and
        as the intermediate file suffix.
    :param input_fpath: VCF file to annotate.
    :return: the existing intermediate file when it is reused, the result
        of ``verify_vcf`` on the final output, the falsy result of
        ``call_subprocess`` when the command fails, or ``None`` when no
        database config or database path is available.
    """
    if not vcf_conf:
        err('No database for ' + dbname + ', skipping.')
        return None

    step_greetings('Annotating with ' + dbname)

    output_fpath = intermediate_fname(cnf, input_fpath, dbname)
    if output_fpath.endswith('.gz'):
        # Output is written uncompressed: drop the '.gz' suffix.
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    snpsift = get_java_tool_cmdline(cnf, 'snpsift')

    # Log the Java version in use.
    java = get_system_path(cnf, 'java')
    info('Java version:')
    call(cnf, java + ' -version')
    info()

    # The genome section takes precedence; the database's own 'path' is the
    # fallback and is the only one verified here.
    db_path = cnf['genome'].get(dbname)
    if not db_path:
        db_path = vcf_conf.get('path')
        if db_path:
            verify_file(db_path, is_critical=True)
        else:
            err('Please, provide a path to ' + dbname +
                ' in the "genomes" section in the system config. The config is: '
                + str(cnf['genome']))
            return None

    annotations = vcf_conf.get('annotations')

    if not cnf.no_check:
        info('Removing previous annotations...')

        def delete_annos(rec):
            # Drop every INFO key this database is about to (re)write.
            for key in annotations:
                if key in rec.INFO:
                    del rec.INFO[key]
            return rec

        if annotations:
            input_fpath = iterate_vcf(cnf,
                                      input_fpath,
                                      delete_annos,
                                      suffix='d')

    anno_line = ('-info ' + ','.join(annotations)) if annotations else ''

    cmdline = '{executable} annotate -v {anno_line} {db_path} {input_fpath}'.format(
        executable=snpsift,
        anno_line=anno_line,
        db_path=db_path,
        input_fpath=input_fpath)
    output_fpath = call_subprocess(cnf,
                                   cmdline,
                                   input_fpath,
                                   output_fpath,
                                   stdout_to_outputfile=True,
                                   exit_on_error=False,
                                   overwrite=True)
    if not output_fpath:
        err('Error: snpsift resulted ' + str(output_fpath) + ' for ' + dbname)
        return output_fpath
    verify_vcf(output_fpath, is_critical=True)
    # NOTE: a former check that scanned the first output line for
    # 'Cannot allocate memory' is disabled.

    if not cnf.no_check:
        info_pattern = re.compile(
            r'''\#\#INFO=<
            ID=(?P<id>[^,]+),\s*
            Number=(?P<number>-?\d+|\.|[AG]),\s*
            Type=(?P<type>Integer|Float|Flag|Character|String),\s*
            Description="(?P<desc>[^"]*)"
            >''', re.VERBOSE)

        def _fix_after_snpsift(line, i, ctx):
            # Non-header lines: dropped until the '#CHROM' line has been
            # seen; afterwards spaces are replaced with underscores.
            if not line.startswith('#'):
                if not ctx['met_CHROM']:
                    return None
                line = line.replace(' ', '_')
                assert ' ' not in line
                return line

            # First '#CHROM' line: remember that the header is complete.
            if not ctx['met_CHROM'] and line.startswith('#CHROM'):
                ctx['met_CHROM'] = True
                return line

            # INFO header lines matching the pattern are re-emitted in a
            # canonical form without whitespace between the fields.
            if line.startswith('##INFO'):
                m = info_pattern.match(line)
                if m:
                    return '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
                        m.group('id'), m.group('number'), m.group('type'),
                        m.group('desc'))
            return line

        output_fpath = iterate_file(cnf,
                                    output_fpath,
                                    _fix_after_snpsift,
                                    suffix='fx',
                                    ctx=dict(met_CHROM=False))

    return verify_vcf(output_fpath, is_critical=True)