def load_annotation(chromosome):
    '''
    parse over a JSON file; extract position, frequencies, ids,
    and ADSP-ranked most severe consequence; bulk load using COPY
    '''
    lfn = xstr(chromosome) + '.log'
    lfh = open(lfn, 'w')

    algInvocation = AlgorithmInvocation('parallel_load_dbsnp_vep_result.py',
                                        json.dumps({'chromosome': chromosome}),
                                        args.commit)
    algInvocId = xstr(algInvocation.getAlgorithmInvocationId())
    algInvocation.close()
    warning("Algorithm Invocation ID", algInvocId, file=lfh, flush=True)

    vepParser = VepJsonParser(args.rankingFile, verbose=True)
    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = 'chr' + xstr(chromosome) + '.json.gz'
    fname = path.join(args.dir, fname)

    lineCount = 0
    variantCount = 0
    skipCount = 0

    resume = args.resumeAfter is None
    if not resume:
        warning("--resumeAfter flag specified; skipping until",
                args.resumeAfter, file=lfh, flush=True)

    previousSnp = None
    with database.cursor() as cursor:
        copyObj = io.StringIO()
        with open(fname, 'r') as fhandle:
            # memory-map the file (read-only) so gzip can stream it efficiently
            mappedFile = mmap.mmap(fhandle.fileno(), 0, prot=mmap.PROT_READ)
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    vepResult = json.loads(line.rstrip())

                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there are JSON formatting issues w/the input string,
                    # so replace it w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    if not resume:
                        if previousSnp == args.resumeAfter:
                            warning(previousSnp, refSnpId, file=lfh, flush=True)
                            warning("Resuming after:", args.resumeAfter,
                                    "- SKIPPED", skipCount, "lines.",
                                    file=lfh, flush=True)
                            resume = True
                        else:
                            previousSnp = refSnpId
                            skipCount = skipCount + 1
                            continue

                    if lineCount == 1 or variantCount % args.commitAfter == 0:
                        warning('Processing new copy object', file=lfh, flush=True)
                        tstart = datetime.now()

                    vepParser.set('ref_snp_id', refSnpId)

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        vepParser.set('is_multi_allelic', isMultiAllelic)
                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)

                        frequencies = get_frequencies(vepParser, lfh)
                        vepParser.adsp_rank_and_sort_consequences()

                        # for each allele
                        for alt in altAllele:
                            variantCount = variantCount + 1
                            metaseqId = ':'.join((chrom, xstr(position),
                                                  refAllele, alt))
                            positionEnd = entry.infer_variant_end_location(alt)

                            binIndex = indexer.find_bin_index(chrom, position,
                                                              positionEnd)

                            # NOTE: VEP uses normalized alleles to indicate
                            # the variant_allele
                            nRef, nAlt = entry.normalize_alleles(
                                refAllele, alt, snvDivMinus=True)
                            alleleFreq = None if frequencies is None \
                                else get_allele_frequencies(nAlt, frequencies)

                            msConseq = get_most_severe_consequence(nAlt, vepParser)

                            valueStr = '#'.join((
                                'chr' + xstr(chrom),
                                xstr(position),
                                xstr(vepParser.get('is_multi_allelic')),
                                binIndex,
                                refSnpId,
                                metaseqId,
                                json2str(alleleFreq),
                                json2str(msConseq),
                                json2str(get_adsp_ranked_allele_consequences(
                                    nAlt, vepParser)),
                                json.dumps(vepResult),
                                algInvocId))

                            copyObj.write(valueStr + '\n')

                            if variantCount % args.logAfter == 0 \
                               and variantCount % args.commitAfter != 0:
                                warning("PARSED", variantCount,
                                        file=lfh, flush=True)

                            if variantCount % args.commitAfter == 0:
                                tendw = datetime.now()
                                warning('Copy object prepared in '
                                        + str(tendw - tstart) + '; '
                                        + str(copyObj.tell())
                                        + ' bytes; transferring to database',
                                        file=lfh, flush=True)
                                copyObj.seek(0)
                                cursor.copy_from(copyObj, 'variant', sep='#',
                                                 null="NULL",
                                                 columns=VARIANT_COLUMNS)

                                message = '{:,}'.format(variantCount)
                                if args.commit:
                                    database.commit()
                                    message = "COMMITTED " + message
                                else:
                                    database.rollback()
                                    message = "PARSED " + message + " -- rolling back"

                                if variantCount % args.logAfter == 0:
                                    warning(message, "; up to = ", refSnpId,
                                            file=lfh, flush=True)

                                tend = datetime.now()
                                warning('Database copy time: ' + str(tend - tendw),
                                        file=lfh, flush=True)
                                warning('        Total time: ' + str(tend - tstart),
                                        file=lfh, flush=True)

                                if args.test:
                                    die("Test complete")

                                copyObj = io.StringIO()  # reset io string

                    except Exception as e:
                        warning("ERROR parsing variant", refSnpId,
                                file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        warning(str(e), file=lfh, flush=True)
                        raise

            # final commit / leftovers
            copyObj.seek(0)
            cursor.copy_from(copyObj, 'variant', sep='#', null="NULL",
                             columns=VARIANT_COLUMNS)
            message = '{:,}'.format(variantCount)

            if args.commit:
                database.commit()
                message = "DONE - COMMITTED " + message
            else:
                database.rollback()
                message = "DONE - PARSED " + message + " -- rolling back"

            warning(message, file=lfh, flush=True)

            mappedFile.close()

    database.close()
    indexer.close()
    lfh.close()
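
# The algorithm invocation above records the script name
# 'parallel_load_dbsnp_vep_result.py', suggesting load_annotation is run once
# per chromosome in parallel worker processes. A minimal driver sketch,
# assuming a standard multiprocessing pool; the chromosome list and the pool
# size are illustrative assumptions, not part of this script. Each worker
# opens its own database connection and writes its own <chromosome>.log.
def run_parallel_load():
    from multiprocessing import Pool
    chromosomes = [str(c) for c in range(1, 23)] + ['X', 'Y', 'M']
    with Pool(processes=4) as pool:  # pool size is an assumption
        pool.map(load_annotation, chromosomes)
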
def load_annotation(chromosome):
    '''
    extract basic SNP information from a VCF line; check against
    AnnotatedDB; if missing, load
    '''
    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = '00-All.MT.vcf.gz' if chromosome == 'M' \
        else '00-All.' + xstr(chromosome) + '.vcf.gz'
    logFname = path.join(args.logDir, fname + '.log')
    fname = path.join(args.dir, fname)

    lineCount = 0
    variantCount = 0

    warning("Parsing", fname)
    warning("Logging:", logFname)

    with database.cursor() as cursor:
        with open(fname, 'r') as fhandle, open(logFname, 'w') as lfh:
            warning("Parsing", fname, file=lfh, flush=True)

            # memory-map the file (read-only) so gzip can stream it efficiently
            mappedFile = mmap.mmap(fhandle.fileno(), 0, prot=mmap.PROT_READ)
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    # will just be the input string, so it matches the
                    # annotated variants
                    vepResult = {}

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are JSON formatting issues w/the input string,
                    # so replace it w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    # sanity check: all missing variants lack alt alleles,
                    # so skip any line that has one
                    if '.' not in altAllele:
                        if lineCount % 25000 == 0:
                            warning("Parsed", lineCount, "lines.",
                                    file=lfh, flush=True)
                        continue

                    chrom = xstr(entry.get('chrom'))
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        if variant_is_missing(database, 'chr' + chrom, refSnpId):
                            warning(refSnpId, "- MISSING -- LOADING",
                                    file=lfh, flush=True)
                            for alt in altAllele:
                                if alt == '.':  # checking again in case some are multiallelic
                                    alt = '?'
                                variantCount = variantCount + 1
                                metaseqId = ':'.join((chrom, xstr(position),
                                                      refAllele, alt))
                                positionEnd = entry.infer_variant_end_location(alt)
                                binIndex = indexer.find_bin_index(chrom, position,
                                                                  positionEnd)

                                # NOTE: algInvocId is assumed to be set at module
                                # level (see the algorithm-invocation bookkeeping
                                # in the loader above)
                                cursor.execute(INSERT_SQL,
                                               ('chr' + chrom,
                                                xstr(position),
                                                isMultiAllelic,
                                                binIndex,
                                                refSnpId,
                                                metaseqId,
                                                json.dumps(vepResult),
                                                algInvocId))

                                if args.commit:
                                    database.commit()
                                else:
                                    database.rollback()

                        if lineCount % 25000 == 0:
                            warning("Parsed", lineCount, "lines.",
                                    file=lfh, flush=True)
                            warning("Loaded", variantCount, "missing variants",
                                    file=lfh, flush=True)

                    except Exception:
                        warning("ERROR parsing variant", refSnpId,
                                file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        raise

            if not args.commit:
                database.rollback()
                warning("DONE -- rolling back")

            mappedFile.close()

            warning("DONE - Loaded", variantCount, "missing variants",
                    file=lfh, flush=True)

    database.close()
    indexer.close()
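
# variant_is_missing is called above but not defined in this section. A
# minimal sketch of the lookup it implies, assuming the 'variant' table has
# 'chromosome' and 'ref_snp_id' columns (inferred from the values written by
# the loaders above; the real helper and its SQL may differ):
def variant_is_missing(database, chromosome, refSnpId):
    ''' return True if no variant with the given refSNP id exists on the chromosome '''
    sql = ("SELECT ref_snp_id FROM variant"
           " WHERE chromosome = %s AND ref_snp_id = %s LIMIT 1")
    with database.cursor() as cursor:
        cursor.execute(sql, (chromosome, refSnpId))
        return cursor.fetchone() is None
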