Example #1
0
def run():
    """Command-line entry point.

    Parses arguments for a VCF file and a FAM (pedigree) file, determines
    which INFO keys to extract (from a newline-delimited file given via
    ``--info``, or a default set of GATK annotation keys), and hands off
    to ``vcf.parse``.
    """
    import argparse

    parser = argparse.ArgumentParser(usage=__usage__)

    # Necessary arguments
    parser.add_argument("-v", "--vcf", required=True)
    parser.add_argument("-f", "--fam", required=True)
    # Optional arguments
    parser.add_argument("-g",
                        "--gen",
                        required=False,
                        default="hg38",
                        choices=["hg19", "hg38"])
    parser.add_argument("-i", "--info", required=False)
    args = parser.parse_args()

    if args.info:
        # BUG FIX: the file was previously opened without ever being
        # closed; use a context manager so the handle is released.
        with open(args.info, "r") as f:
            info_keys = [line.rstrip() for line in f]
    else:
        # Default INFO keys: standard GATK variant-quality annotations.
        info_keys = [
            "VQSLOD", "ClippingRankSum", "BaseQRankSum", "FS", "SOR", "MQ",
            "MQRankSum", "QD", "ReadPosRankSum"
        ]

    from vcf import parse
    parse(args.vcf, args.fam, info_keys=info_keys)
Example #2
0
def vcf_file(filepath=None, input=None, delim=",", quote='"'):
    """Yield one ``vc_group`` table object per data row of an annovar
    genome-summary CSV file.

    Parameters:
        filepath: path to the CSV file; when given, it is opened here in
            binary mode (as Python 2's csv module requires) and the
            ``input`` argument is ignored.
        input: an already-open file-like object, read when ``filepath``
            is None.
        delim: CSV field delimiter.
        quote: CSV quote character.

    NOTE(review): the handle is closed only after the loop finishes, so
    it leaks if the generator is abandoned early — and it also closes a
    caller-supplied ``input``.  Confirm that is intended.
    """
    if filepath is not None:
        input = open(filepath, 'rb')
    csv_input = csv.reader(input, delimiter=delim, quotechar=quote)

    # for each vc_group
    for row in csv_input:

        # Normalize: empty CSV fields become None.
        row = [None if f == '' else f for f in row] 

        # Column positions are written as the 1-based spreadsheet column
        # minus one; the fixed layout is assumed from the annovar
        # genome-summary format — TODO confirm against the input file.
        info = vcf.parse('info', row[35 - 1])
        vc_group_columns = {
            'chromosome'  : row[22 - 1],
            'start_posn'  : row[23 - 1],
            'end_posn'    : row[24 - 1],
            'ref'         : vcf.parse('ref', row[25 - 1]),
            'dbsnp_id'    : vcf.parse('dbsnp_id', row[30 - 1]),

            # 'genotype_format' : row[36 - 1],
            'quality'         : row[33 - 1],
            'filter'          : row[34 - 1],

            # annovar columns
            'otherinfo'               : row[27 - 1],
            'func'                    : row[1 - 1],
            'gene'                    : row[2 - 1],
            'exonicfunc'              : row[3 - 1],
            'aachange'                : row[4 - 1],
            'conserved'               : row[5 - 1],
            '1000g2011may_all'        : row[8 - 1],
            'dbsnp135'                : row[9 - 1],
            'ljb_phylop_pred'         : row[12 - 1],
            'ljb_sift_pred'           : row[14 - 1],
            'ljb_polyphen2_pred'      : row[16 - 1],
            'ljb_lrt_pred'            : row[18 - 1],
            'ljb_mutationtaster_pred' : row[20 - 1],

            'ljb_gerppp'              : row[21 - 1],
            'segdup'                  : row[6 - 1],
            'esp5400_all'             : row[7 - 1],
            'avsift'                  : row[10 - 1],
            'ljb_phylop'              : row[11 - 1],
            'ljb_sift'                : row[13 - 1],
            'ljb_polyphen2'           : row[15 - 1],
            'ljb_lrt'                 : row[17 - 1],
            'ljb_mutationtaster'      : row[19 - 1],

            # vc_group_info columns
            # 'info_source'       : row[36 - 1],
            'ds'                : info.get('DS', False),
            'inbreeding_coeff'  : info.get('InbreedingCoeff'),
            'base_q_rank_sum'   : info.get('BaseQRankSum'),
            'mq_rank_sum'       : info.get('MQRankSum'),
            'read_pos_rank_sum' : info.get('ReadPosRankSum'),
            'dels'              : info.get('Dels'),
            'fs'                : info.get('FS'),
            'haplotype_score'   : info.get('HaplotypeScore'),
            'mq'                : info.get('MQ'),
            'qd'                : info.get('QD'),
            'sb'                : info.get('SB'),
            'vqslod'            : info.get('VQSLOD'),
            'an'                : info.get('AN'),
            'dp'                : info.get('DP'),
            'mq0'               : info.get('MQ0'),
            'culprit'           : info.get('culprit'),
        }
        vc_group_table = vc_group(vc_group_columns)

        alts = vcf.parse('alts', row[32 - 1])

        # for each vc_group_allele in (vc x alt alleles in vc_group)
        # Parallel per-allele lists; arity_zip below enforces that they
        # all have the same length as `alts`.
        vc_group_allele_fields = [
            alts,
            # vc_group_allele_info
            get_list(info, 'AF'),
            get_list(info, 'MLEAF'),
            get_list(info, 'AC'),
            get_list(info, 'MLEAC'),
        ]

        vc_group_allele_columns = [{
            # 'vc_group_id' : vc_group_id,
            'allele'        : allele,
            'af'            : af,
            'mle_af'        : mle_af,
            'ac'            : ac,
            'mle_ac'        : mle_ac,
        } for allele, af, mle_af, ac, mle_ac in arity_zip(vc_group_allele_fields, table='vc_group', key="alt alleles in vc_group")]
        add_columns(vc_group_allele_columns, vc_group_table.vc_group_allele)

        # Alleles indexed the way VCF genotypes reference them:
        # 0 = reference, 1..n = alternates.
        ref_and_alts = as_list(vc_group_columns['ref']) + alts

        # for each vc in vc_group
        # Columns 37..48 (1-based) presumably hold one genotype field per
        # sample — TODO confirm the sample count against the input file.
        for genotype in [vcf.parse('genotype', row[gf]) for gf in xrange(37 - 1, 48)]:
            # vc_columns['genotype_source'] = row[gf]
            vc_columns = {
                # 'vc_group_id' : vc_group_table.lastrowid,
                'zygosity'    : row[27 - 1],
            }

            patient_columns = {
            }

            # vc_columns['patient_id'] = patient_table.lastrowid
            # Skip no-call genotypes, which vcf.parse apparently encodes
            # as ((None, None), ...) — TODO confirm.
            if not (type(genotype) == tuple and genotype[0] == (None, None)):
                # GT unpacks to ((allele1_idx, allele2_idx), phased_flag).
                ((allele1_idx, allele2_idx), vc_columns['phased']) = genotype['GT'] 
                vc_columns['allele1'] = ref_and_alts[allele1_idx]
                vc_columns['allele2'] = ref_and_alts[allele2_idx]
                vc_columns['read_depth'] = genotype.get('DP')
                vc_columns['genotype_quality'] = genotype.get('GQ')
                vc_table = vc(vc_columns)
                vc_group_table.vc.append(vc_table)
                
                # for each vc_genotype in (alleles in vc_group x alleles in vc_group x vc)
                # PL values are ordered to match vcf.ordered_alleles's
                # enumeration of biallelic genotypes.
                vc_genotype_fields = [
                    vcf.ordered_alleles(vc_group_columns['ref'], alts), 
                    as_list(genotype.get('PL')),
                ]
                vc_genotype_columns = [{
                    # 'vc_id': vc_id,
                    'allele1': vc_genotype_allele1,
                    'allele2': vc_genotype_allele2,
                    'phred_likelihood': phred_likelihood,
                } for (vc_genotype_allele1, vc_genotype_allele2), phred_likelihood in arity_zip(vc_genotype_fields, table='vc_genotype', key="biallelic genotypes in vc_group")]
                add_columns(vc_genotype_columns, vc_table.vc_genotype)

                # for each vc_allele in (vc x alleles in ref, alts)
                # AD (allelic depth) has one entry per ref/alt allele.
                vc_allele_fields = [
                    ref_and_alts,
                    get_list(genotype, 'AD'),
                ]
                vc_allele_columns = [{
                    # 'vc_id': vc_id,
                    'allele': allele,
                    'allelic_depth': allelic_depth,
                } for allele, allelic_depth in arity_zip(vc_allele_fields, table='vc_allele', key="ref and alt alleles in vc_group")]
                add_columns(vc_allele_columns, vc_table.vc_allele)

        yield vc_group_table
    input.close() 
def load_genome_summary(db, input, delim=",", quote='"', dry_run=False, records=None, quiet=False, autocommit=False):
    """Load an annovar genome-summary CSV into the database.

    For each CSV row this inserts a vc_group record, its per-allele
    rows, and — for each per-sample genotype column — a patient record,
    a vc record, and the vc's genotype/allele detail rows.

    Parameters:
        db: open DB connection providing cursor() / commit().
        input: file-like object with the CSV data.
        delim, quote: CSV delimiter and quote character.
        dry_run: when True, print the would-be inserts instead of
            executing them (unless ``quiet``).
        records: unused here — TODO confirm whether it was meant to cap
            the number of rows processed.
        quiet: suppress dry-run printing.
        autocommit: toggle MySQL's ``autocommit`` session variable.
    """
    # this script will run properly on InnoDB engine without autocommit; sadly, such is not the case for NDB, where we get 
    # the error:
    # Got temporary error 233 'Out of operation records in transaction coordinator (increase MaxNoOfConcurrentOperations)' from NDBCLUSTER 
    # db.autocommit(True)

    # Shared insert path: does the real insert unless dry_run, in which
    # case it prints the statement it would have executed (unless quiet).
    def insert_wrapper(table, dic_or_dics, do_insert):
        if not dry_run:
            # try:
            return do_insert(dic_or_dics)
            # except Exception as e:
            #     msg = e.message if e.message else e.__str__()
                # raise type(e)(msg + " at line {lineno}".format(lineno=1))
        if not quiet:
            print "insert into {table} {dic_or_dics}".format(table=table.name, dic_or_dics=dic_or_dics)
    # Batched multi-row insert (for tables whose lastrowid we don't need).
    def insert_many(table, values):
        insert_wrapper(table, values, lambda vs: table.insert_many(values=vs))
    # Single-row insert (so table.lastrowid is available afterwards).
    def insert(table, dic):
        insert_wrapper(table, dic, lambda d: table.insert(d))

    # zip() that checks all field lists have matching lengths, reporting
    # a table/key-specific error otherwise.
    # NOTE(review): lineno is hard-coded to 1 — the real CSV line number
    # is never threaded through to the error message.
    def arity_zip(args, error=None, table=None, key=None):
        if error is None:
            error = "Number of {table} columns don't all match the number of {key}; " + \
                    "skipping insertion into {table} at line {lineno}"
        return check_arity_zip(args, error.format(lineno=1, table=table.name, key=key))

    c = db.cursor()
    if autocommit:
        c.execute("""SET autocommit = 1;""")
    else:
        c.execute("""SET autocommit = 0;""")

    csv_input = csv.reader(input, delimiter=delim, quotechar=quote)

    # Tables inserted one row at a time because later inserts need their
    # lastrowid as a foreign key.
    vc_group_table = sql.table.oursql('vc_group', cursor=c)
    vc_table = sql.table.oursql('vc', cursor=c)                             
    patient_table = sql.table.oursql('patient', cursor=c)

    # table we can batch insert (i.e. tables whose lastrowid's we do not need)
    vc_group_allele_table = sql.table.oursql('vc_group_allele', cursor=c, fields=['vc_group_id', 'allele', 'af', 'mle_af', 'ac', 'mle_ac'])                
    vc_genotype_table = sql.table.oursql('vc_genotype', cursor=c, fields=['vc_id', 'allele1', 'allele2', 'phred_likelihood'])                    
    vc_allele_table = sql.table.oursql('vc_allele', cursor=c, fields=['vc_id', 'allele', 'allelic_depth'])

    # for each vc_group
    for row in csv_input:

        # Normalize: empty CSV fields become None.
        row = [None if f == '' else f for f in row] 

        # Column positions are written as the 1-based spreadsheet column
        # minus one; the fixed layout is assumed from the annovar
        # genome-summary format — TODO confirm against the input file.
        info = vcf.parse('info', row[35 - 1])
        vc_group_columns = {
            'chromosome'  : row[22 - 1],
            'start_posn'  : row[23 - 1],
            'end_posn'    : row[24 - 1],
            'ref'         : vcf.parse('ref', row[25 - 1]),
            'dbsnp_id'    : vcf.parse('dbsnp_id', row[30 - 1]),

            # 'genotype_format' : row[36 - 1],
            'quality'         : row[33 - 1],
            'filter'          : row[34 - 1],

            # annovar columns
            'otherinfo'               : row[27 - 1],
            'func'                    : row[1 - 1],
            'gene'                    : row[2 - 1],
            'exonicfunc'              : row[3 - 1],
            'aachange'                : row[4 - 1],
            'conserved'               : row[5 - 1],
            '1000g2011may_all'        : row[8 - 1],
            'dbsnp135'                : row[9 - 1],
            'ljb_phylop_pred'         : row[12 - 1],
            'ljb_sift_pred'           : row[14 - 1],
            'ljb_polyphen2_pred'      : row[16 - 1],
            'ljb_lrt_pred'            : row[18 - 1],
            'ljb_mutationtaster_pred' : row[20 - 1],

            'ljb_gerppp'              : row[21 - 1],
            'segdup'                  : row[6 - 1],
            'esp5400_all'             : row[7 - 1],
            'avsift'                  : row[10 - 1],
            'ljb_phylop'              : row[11 - 1],
            'ljb_sift'                : row[13 - 1],
            'ljb_polyphen2'           : row[15 - 1],
            'ljb_lrt'                 : row[17 - 1],
            'ljb_mutationtaster'      : row[19 - 1],

            # vc_group_info columns
            # 'info_source'       : row[36 - 1],
            'ds'                : info.get('DS', False),
            'inbreeding_coeff'  : info.get('InbreedingCoeff'),
            'base_q_rank_sum'   : info.get('BaseQRankSum'),
            'mq_rank_sum'       : info.get('MQRankSum'),
            'read_pos_rank_sum' : info.get('ReadPosRankSum'),
            'dels'              : info.get('Dels'),
            'fs'                : info.get('FS'),
            'haplotype_score'   : info.get('HaplotypeScore'),
            'mq'                : info.get('MQ'),
            'qd'                : info.get('QD'),
            'sb'                : info.get('SB'),
            'vqslod'            : info.get('VQSLOD'),
            'an'                : info.get('AN'),
            'dp'                : info.get('DP'),
            'mq0'               : info.get('MQ0'),
            'culprit'           : info.get('culprit'),
        }
        insert(vc_group_table, vc_group_columns)

        alts = vcf.parse('alts', row[32 - 1])

        # for each vc_group_allele in (vc x alt alleles in vc_group)
        # Parallel per-allele lists; arity_zip enforces matching lengths.
        vc_group_allele_fields = [
            alts,
            # vc_group_allele_info
            get_list(info, 'AF'),
            get_list(info, 'MLEAF'),
            get_list(info, 'AC'),
            get_list(info, 'MLEAC'),
        ]
        insert_many(vc_group_allele_table, values=[[ vc_group_table.lastrowid, allele, af, mle_af, ac, mle_ac ] 
            for allele, af, mle_af, ac, mle_ac in arity_zip(vc_group_allele_fields, table=vc_group_table, key="alt alleles in vc_group")])

        # Base vc columns shared by every sample genotype in this row.
        vc_columns = {
            'vc_group_id' : vc_group_table.lastrowid,
            'zygosity'    : row[27 - 1],
        }

        # Alleles indexed the way VCF genotypes reference them:
        # 0 = reference, 1..n = alternates.
        ref_and_alts = as_list(vc_group_columns['ref']) + alts

        # for each vc in vc_group
        # Columns 37..48 (1-based) presumably hold one genotype field per
        # sample — TODO confirm the sample count against the input file.
        for genotype in [vcf.parse('genotype', row[gf]) for gf in xrange(37 - 1, 48)]:
            # vc_columns['genotype_source'] = row[gf]

            # NOTE(review): a patient row is inserted even for no-call
            # genotypes that are skipped below — confirm intended.
            patient_columns = {
            }
            insert(patient_table, patient_columns)

            vc_columns['patient_id'] = patient_table.lastrowid
            # Skip no-call genotypes, which vcf.parse apparently encodes
            # as ((None, None), ...) — TODO confirm.
            if not (type(genotype) == tuple and genotype[0] == (None, None)):
                # GT unpacks to ((allele1_idx, allele2_idx), phased_flag).
                ((allele1_idx, allele2_idx), vc_columns['phased']) = genotype['GT'] 
                vc_columns['allele1'] = ref_and_alts[allele1_idx]
                vc_columns['allele2'] = ref_and_alts[allele2_idx]
                vc_columns['read_depth'] = genotype.get('DP')
                vc_columns['genotype_quality'] = genotype.get('GQ')
                insert(vc_table, vc_columns)
                
                # for each vc_genotype in (alleles in vc_group x alleles in vc_group x vc)
                # PL values are ordered to match vcf.ordered_alleles's
                # enumeration of biallelic genotypes.
                vc_genotype_fields = [
                    vcf.ordered_alleles(vc_group_columns['ref'], alts), 
                    as_list(genotype.get('PL')),
                ]
                insert_many(vc_genotype_table, values=[[vc_table.lastrowid, vc_genotype_allele1, vc_genotype_allele2, phred_likelihood]
                    for (vc_genotype_allele1, vc_genotype_allele2), phred_likelihood in arity_zip(vc_genotype_fields, table=vc_genotype_table, key="biallelic genotypes in vc_group")])

                # for each vc_allele in (vc x alleles in ref, alts)
                # AD (allelic depth) has one entry per ref/alt allele.
                vc_allele_fields = [
                    ref_and_alts,
                    get_list(genotype, 'AD'),
                ]
                insert_many(vc_allele_table, values=[[vc_table.lastrowid, allele, allelic_depth]
                    for allele, allelic_depth in arity_zip(vc_allele_fields, table=vc_allele_table, key="ref and alt alleles in vc_group")])

    # Flush any pending batched rows, then commit the transaction.
    vc_group_allele_table.flush_buffer()
    vc_genotype_table.flush_buffer()
    vc_allele_table.flush_buffer()
    db.commit()
    c.close()
Example #4
0
def main():
	breakends = vcf.parse('../data/test.vcf', '../data/toto.lengths')
	print breakends
	print breakends.avg()
	breakends.validate()
Example #5
0
def main():
    breakends = vcf.parse('../data/test.vcf', '../data/toto.lengths')
    print breakends
    print breakends.avg()
    breakends.validate()