class AlgorithmInvocation(object):
    ''' transaction management for algorithm invocation '''
    def __init__(self,
                 script=None,
                 parameters=None,
                 commit=False,
                 gusConfigFile=None):
        self._database = None
        self._algorithm_invocation_id = None

        self.__get_db_handle(gusConfigFile)
        if script is not None:
            self.insertAlgorithmInvocation(script, parameters, commit)

    def __get_db_handle(self, gusConfigFile):
        ''' create database connection '''
        self._database = Database(gusConfigFile)
        self._database.connect()

    def insertAlgorithmInvocation(self, script, parameters, commit):
        ''' create the entry for the algorithm invocation '''
        sql = """INSERT INTO AlgorithmInvocation
(script_name, script_parameters, commit_mode) VALUES (%s, %s, %s)
RETURNING algorithm_invocation_id"""
        with self._database.cursor() as cursor:
            cursor.execute(sql, (script, parameters, commit))
            self._algorithm_invocation_id = cursor.fetchone()[0]

        self._database.commit()

    def getAlgorithmInvocationId(self):
        ''' return algorithm invocation id '''
        return self._algorithm_invocation_id

    def close(self):
        ''' close db connection '''
        self._database.close()
示例#2
0
def load_annotation(chromosome):
    ''' parse over a JSON file, extract position, frequencies,
    ids, and ADSP-ranked most severe consequence; bulk load using COPY '''

    lfn = xstr(chromosome) + '.log'
    lfh = open(lfn, 'w')

    algInvocation = AlgorithmInvocation('parallel_load_dbsnp_vep_result.py',
                                        json.dumps({'chromosome': chromosome}),
                                        args.commit)
    algInvocId = xstr(algInvocation.getAlgorithmInvocationId())
    algInvocation.close()
    warning("Algorithm Invocation ID", algInvocId, file=lfh, flush=True)

    vepParser = VepJsonParser(args.rankingFile, verbose=True)
    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = 'chr' + xstr(chromosome) + '.json.gz'
    fname = path.join(args.dir, fname)
    lineCount = 0
    variantCount = 0
    skipCount = 0

    resume = True if args.resumeAfter is None else False
    if not resume:
        warning("--resumeAfter flag specified; Finding skip until point",
                args.resumeAfter,
                file=lfh,
                flush=True)

    previousSnp = None
    with database.cursor() as cursor:
        copyObj = io.StringIO()
        with open(fname, 'r') as fhandle:
            mappedFile = mmap.mmap(fhandle.fileno(), 0,
                                   prot=mmap.PROT_READ)  # put file in swap
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    vepResult = json.loads(line.rstrip())

                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    if not resume:
                        if previousSnp == args.resumeAfter:
                            warning(previousSnp,
                                    refSnpId,
                                    file=lfh,
                                    flush=True)
                            warning("Resuming after:",
                                    args.resumeAfter,
                                    "- SKIPPED",
                                    skipCount,
                                    "lines.",
                                    file=lfh,
                                    flush=True)
                            resume = True
                        else:
                            previousSnp = refSnpId
                            skipCount = skipCount + 1
                            continue

                    if lineCount == 1 or variantCount % args.commitAfter == 0:
                        warning('Processing new copy object',
                                file=lfh,
                                flush=True)
                        tstart = datetime.now()

                    vepParser.set('ref_snp_id', refSnpId)

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:

                        vepParser.set('is_multi_allelic', isMultiAllelic)
                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)

                        frequencies = get_frequencies(vepParser, lfh)
                        vepParser.adsp_rank_and_sort_consequences()

                        # for each allele
                        for alt in altAllele:
                            variantCount = variantCount + 1
                            metaseqId = ':'.join(
                                (chrom, xstr(position), refAllele, alt))
                            positionEnd = entry.infer_variant_end_location(alt)

                            binIndex = indexer.find_bin_index(
                                chrom, position, positionEnd)

                            # NOTE: VEP uses normalized alleles to indicate variant_allele
                            nRef, nAlt = entry.normalize_alleles(
                                refAllele, alt, snvDivMinus=True)
                            alleleFreq = None if frequencies is None \
                              else get_allele_frequencies(nAlt, frequencies)

                            msConseq = get_most_severe_consequence(
                                nAlt, vepParser)

                            valueStr = '#'.join(
                                ('chr' + xstr(chrom), xstr(position),
                                 xstr(vepParser.get('is_multi_allelic')),
                                 binIndex, refSnpId, metaseqId,
                                 json2str(alleleFreq), json2str(msConseq),
                                 json2str(
                                     get_adsp_ranked_allele_consequences(
                                         nAlt, vepParser)),
                                 json.dumps(vepResult), algInvocId))

                            copyObj.write(valueStr + '\n')

                            if variantCount % args.logAfter == 0 \
                              and variantCount % args.commitAfter != 0:
                                warning("PARSED",
                                        variantCount,
                                        file=lfh,
                                        flush=True)

                            if variantCount % args.commitAfter == 0:
                                tendw = datetime.now()
                                warning('Copy object prepared in ' +
                                        str(tendw - tstart) + '; ' +
                                        str(copyObj.tell()) +
                                        ' bytes; transfering to database',
                                        file=lfh,
                                        flush=True)
                                copyObj.seek(0)
                                cursor.copy_from(copyObj,
                                                 'variant',
                                                 sep='#',
                                                 null="NULL",
                                                 columns=VARIANT_COLUMNS)

                                message = '{:,}'.format(variantCount)
                                if args.commit:
                                    database.commit()
                                    message = "COMMITTED " + message

                                else:
                                    database.rollback()
                                    message = "PARSED " + message + " -- rolling back"

                                if variantCount % args.logAfter == 0:
                                    warning(message,
                                            "; up to = ",
                                            refSnpId,
                                            file=lfh,
                                            flush=True)

                                tend = datetime.now()
                                warning('Database copy time: ' +
                                        str(tend - tendw),
                                        file=lfh,
                                        flush=True)
                                warning('        Total time: ' +
                                        str(tend - tstart),
                                        file=lfh,
                                        flush=True)

                                if args.test:
                                    die("Test complete")

                                copyObj = io.StringIO()  # reset io string

                    except Exception as e:
                        warning("ERROR parsing variant",
                                refSnpId,
                                file=lfh,
                                flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        warning(str(e), file=lfh, flush=True)
                        raise

            # final commit / leftovers
            copyObj.seek(0)
            cursor.copy_from(copyObj,
                             'variant',
                             sep='#',
                             null="NULL",
                             columns=VARIANT_COLUMNS)
            message = '{:,}'.format(variantCount)

            if args.commit:
                database.commit()
                message = "DONE - COMMITTED " + message
            else:
                database.rollback()
                message = "DONE - PARSED " + message + " -- rolling back"

            warning(message, file=lfh, flush=True)

            mappedFile.close()

    database.close()
    indexer.close()
    lfh.close()
def load_annotation():
    ''' extract basic SNP information from VCF line; check against AnnotatedDB; if missing
    load '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    variantCount = 0
    with database.cursor() as cursor:
        with open(args.vcfFile, 'r') as fh:
            with open(args.logFileName, 'w') as lfh:
                warning("Parsing", args.vcfFile, file=lfh, flush=True)
                for line in fh:
                    lineCount = lineCount + 1
                    vepResult = {} # will just be the input string, so it matches the annotated variants

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt') # assuming not multiallelic

                    if refAllele == '0': # happens sometimes
                        refAllele = '?'
                    if altAllele == '0':
                        altAllele = '?'

                    # truncatedRef = truncate(refAllele, 20)
                    # truncatedAlt = truncate(altAllele, 20)

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))


                    # metaseqId = ':'.join((chrom, xstr(position), truncatedRef, truncatedAlt))
                    isMultiAllelic = False # assuming b/c of how the VCF files were generated
                    metaseqId = ':'.join((chrom, xstr(position), refAllele, altAllele))

                    try:
                        if variant_is_missing(database, 'chr' + chrom, metaseqId):
                            warning(metaseqId, "- MISSING -- LOADING", file=lfh, flush=True)
                            variantCount = variantCount + 1

                            positionEnd = entry.infer_variant_end_location(altAllele)
                            binIndex = indexer.find_bin_index(chrom, position, positionEnd)

                            cursor.execute(INSERT_SQL, 
                                               ('chr' + chrom,
                                                position,
                                                isMultiAllelic,
                                                binIndex,
                                                metaseqId,
                                                json.dumps(vepResult),
                                                algInvocId))

                            if args.commit:
                                database.commit()
                            else:
                                database.rollback()

                        if lineCount % 50 == 0:
                            warning("Parsed", lineCount, "lines.", file=lfh, flush=True)
                            warning("Loaded", variantCount, "missing variants", file=lfh, flush=True)

                    except Exception:
                        warning("ERROR parsing variant", metaseqId, file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        print("FAIL", file=sys.stdout)
                        raise

                warning("DONE - Loaded", variantCount, "missing variants", file=lfh, flush=True)
    database.close()
    indexer.close()
def load_annotation(chromosome):
    ''' extract basic SNP information from VCF line; check against AnnotatedDB; if missing
    load '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()


    fname = '00-All.MT.vcf.gz' if chromosome == 'M' \
      else '00-All.' + xstr(chromosome) + '.vcf.gz'

    logFname = path.join(args.logDir, fname + '.log')
    fname = path.join(args.dir, fname)

    lineCount = 0
    variantCount = 0

    warning("Parsing", fname)
    warning("Logging:", logFname)
    with database.cursor() as cursor:
        with open(fname, 'r') as fhandle, open(logFname, 'w') as lfh:
            warning("Parsing", fname, file=lfh, flush=True)
            mappedFile = mmap.mmap(fhandle.fileno(), 0,
                                   prot=mmap.PROT_READ)  # put file in swap
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    vepResult = {
                    }  # will just be the input string, so it matches the annotated variants

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    if '.' not in altAllele:
                        if lineCount % 25000 == 0:
                            warning("Parsed",
                                    lineCount,
                                    "lines.",
                                    file=lfh,
                                    flush=True)
                        continue  # sanity check/all missing are lacking alt alleles

                    chrom = xstr(entry.get('chrom'))
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        if variant_is_missing(database, 'chr' + chrom,
                                              refSnpId):
                            warning(refSnpId,
                                    "- MISSING -- LOADING",
                                    file=lfh,
                                    flush=True)
                            for alt in altAllele:
                                if alt == '.':  # checking again in case some are multiallelic
                                    alt = '?'
                                variantCount = variantCount + 1
                                metaseqId = ':'.join(
                                    (chrom, xstr(position), refAllele, alt))
                                positionEnd = entry.infer_variant_end_location(
                                    alt)

                                binIndex = indexer.find_bin_index(
                                    chrom, position, positionEnd)
                                cursor.execute(
                                    INSERT_SQL,
                                    ('chr' + chrom, xstr(position),
                                     isMultiAllelic,
                                     binIndex, refSnpId, metaseqId,
                                     json.dumps(vepResult), algInvocId))

                                if args.commit:
                                    database.commit()
                                else:
                                    database.rollback()

                        if lineCount % 25000 == 0:
                            warning("Parsed",
                                    lineCount,
                                    "lines.",
                                    file=lfh,
                                    flush=True)
                            warning("Loaded",
                                    variantCount,
                                    "missing variants",
                                    file=lfh,
                                    flush=True)

                    except Exception:
                        warning("ERROR parsing variant",
                                refSnpId,
                                file=lfh,
                                flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        raise

            if not args.commit:
                database.rollback()
                warning("DONE -- rolling back")

            mappedFile.close()

    database.close()
    indexer.close()
    warning("DONE - Loaded",
            variantCount,
            "missing variants",
            file=lfh,
            flush=True)
示例#5
0
def update_variant_records_from_vcf():
    ''' lookup and update variant records from a VCF file 
    assuming the load by file was called by a plugin, so this variant has already been
    verified to be new to the resource; no need to check alternative metaseq IDs'''

    cupdater = CADDUpdater(args.logFile, args.databaseDir)

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    with database.cursor() as updateCursor, \
      open(args.vcfFile, 'r') as fh:
        try:
            for line in fh:
                if line.startswith("#"):
                    continue

                lineCount = lineCount + 1
                entry = VcfEntryParser(line.rstrip())

                refAllele = entry.get('ref')
                altAllele = entry.get('alt')
                chrom = xstr(entry.get('chrom'))
                if chrom == 'MT':
                    chrom = 'M'
                position = int(entry.get('pos'))
                metaseqId = ':'.join(
                    (chrom, xstr(position), refAllele, altAllele))

                record = {"metaseq_id": metaseqId}  # mimic "record"

                if len(refAllele) > 1 or len(altAllele) > 1:  # only doing SNVs
                    update_variant_record(record, cupdater, INDEL)
                else:
                    update_variant_record(record, cupdater, SNV)

                if lineCount % args.commitAfter == 0:
                    warning("Processed:",
                            lineCount,
                            "- SNVs:",
                            cupdater.get_update_count(SNV),
                            "- INDELS:",
                            cupdater.get_update_count(INDEL),
                            " - Not Matched:",
                            cupdater.get_not_matched_count(),
                            file=cupdater.lfh(),
                            flush=True)

                    updateCursor.execute(cupdater.sql_buffer_str())

                    if args.commit:
                        database.commit()
                    else:
                        database.rollback()
                    cupdater.clear_update_sql()

            if cupdater.buffered_variant_count() > 0:  # trailing
                updateCursor.execute(cupdater.sql_buffer_str())

                if args.commit:
                    database.commit()
                else:
                    database.rollback()

            warning("DONE - Updated SNVs:",
                    cupdater.get_update_count(SNV),
                    "- Updated INDELS:",
                    cupdater.get_update_count(INDEL),
                    "- Not Matched:",
                    cupdater.get_not_matched_count(),
                    file=cupdater.lfh(),
                    flush=True)

            cupdater.close_lfh()

        except Exception as e:
            warning(e, entry, file=cupdater.lfh(), flush=True)
            database.rollback()
            database.close()
            print("FAIL", file=sys.stdout)
            raise
示例#6
0
def update_variant_records(chromosome):
    chrLabel = 'chr' + xstr(chromosome)

    logFileName = path.join(args.logFilePath, chrLabel + '.log')
    cupdater = CADDUpdater(logFileName, args.databaseDir)
    cupdater.setChrm(chrLabel)

    selectSQL = "SELECT metaseq_id, cadd_scores FROM Variant_" + chrLabel

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    updateCount = 0
    updateIndelCount = 0
    skipCount = 0

    with database.cursor("RealDictCursor") as selectCursor, \
        database.cursor() as updateCursor:
        try:
            warning("Fetching",
                    chrLabel,
                    "variants",
                    file=cupdater.lfh(),
                    flush=True)
            selectCursor.execute(selectSQL)
            warning("DONE - Fetching", file=cupdater.lfh(), flush=True)

            for record in selectCursor:
                if args.debug and args.veryVerbose:
                    warning(record, file=cupdater.lfh(), flush=True)

                if record['cadd_scores'] is not None:
                    if args.debug and args.veryVerbose:
                        warning("Skipping",
                                record['metaseq_id'],
                                file=cupdater.lfh(),
                                flush=True)
                    skipCount = skipCount + 1
                    continue

                lineCount = lineCount + 1

                metaseqId = record['metaseq_id']

                chrm, position, refAllele, altAllele = metaseqId.split(':')

                if len(refAllele) > 1 or len(altAllele) > 1:  # only doing SNVs
                    update_variant_record(record, cupdater, INDEL)
                else:
                    update_variant_record(record, cupdater, SNV)

                if cupdater.get_total_update_count(
                ) % args.commitAfter == 0 and cupdater.buffered_variant_count(
                ) > 0:
                    if args.commit:
                        if args.debug:
                            warning("Starting Update",
                                    file=cupdater.lfh(),
                                    flush=True)

                        updateCursor.execute(cupdater.sql_buffer_str())

                        if args.debug:
                            warning("Done", file=cupdater.lfh(), flush=True)

                        cupdater.clear_update_sql()
                        database.commit()
                    else:
                        database.rollback()

                    warning(metaseqId,
                            "- Processed:",
                            lineCount,
                            "- SNVs:",
                            cupdater.get_update_count(SNV),
                            "- INDELS:",
                            cupdater.get_update_count(INDEL),
                            "- Skipped:",
                            skipCount,
                            " - Not Matched:",
                            cupdater.get_not_matched_count(),
                            file=cupdater.lfh(),
                            flush=True)

            if cupdater.buffered_variant_count() > 0:
                updateCursor.execute(cupdater.sql_buffer_str())
                if args.commit:  # trailing
                    database.commit()
                else:
                    database.rollback()

            warning("DONE - Updated SNVs:",
                    cupdater.get_update_count(SNV),
                    "- Updated INDELS:",
                    cupdater.get_update_count(INDEL),
                    "- Skipped",
                    skipCount,
                    "- Not Matched:",
                    cupdater.get_not_matched_count(),
                    file=cupdater.lfh(),
                    flush=True)
            cupdater.close_lfh()

        except Exception as e:
            warning(e, file=cupdater.lfh(), flush=True)
            if args.commit:
                database.commit()
            else:
                database.rollback()
            database.close()
            raise

    database.close()
        help=
        "full path file containing mapping of chr names to length; tab-delim, no header",
        required=True)
    parser.add_argument('--commit',
                        action='store_true',
                        help="run in commit mode",
                        required=False)
    parser.add_argument(
        '--gusConfigFile',
        '--full path to gus config file, else assumes $GUS_HOME/config/gus.config'
    )
    args = parser.parse_args()

    increments = [
        -1, 64000000, 32000000, 16000000, 8000000, 4000000, 2000000, 1000000,
        500000, 250000, 125000, 62500, 31250, 15625
    ]
    binCount = 0
    numLevels = len(increments)

    chrMap = read_chr_map()
    insertSql = "INSERT INTO BinIndexRef (chromosome, level, global_bin, global_bin_path, location) VALUES (%s, %s, %s, %s, %s)"

    database = Database(args.gusConfigFile)
    database.connect()

    cursor = database.cursor()
    load_bins()

    database.close()