Exemplo n.º 1
0
def write_copynumber(vcf_file, sample, vcf_out, cn_list):
    """Write a single-sample copy of a VCF with a CN FORMAT value per record.

    The header lines are collected, handed to ``Vcf`` together with a new
    ``CN`` FORMAT definition, and written once. Every record is cut down to
    its first nine columns plus the column belonging to ``sample``. Records
    that do not contain ``SVTYPE=BND`` take the next value of ``cn_list`` as
    their ``CN`` entry; records that do contain it are written without one
    and do not consume a value.

    Args:
        vcf_file: Iterable of VCF lines, header lines first.
        sample: Sample name; must be a column of the ``#CHROM`` line.
        vcf_out: Writable handle for the result. Closed before returning.
        cn_list: Copy-number values, one per non-BND record, in file order.

    Raises:
        SystemExit: With status 1 if ``sample`` is not in the ``#CHROM``
            line, or if a record is reached before any ``#CHROM`` line.
        IndexError: If ``cn_list`` has fewer entries than there are non-BND
            records.
    """
    in_header = True
    header = []
    vcf = Vcf()
    i = 0
    s_index = -1

    def emit_header():
        # Register the collected header plus the CN definition, then write it.
        vcf.add_header(header)
        vcf.add_format('CN', 1, 'Float',
                       'Copy number of structural variant segment.')
        vcf_out.write(vcf.get_header() + '\n')

    for line in vcf_file:
        if in_header:
            if line.startswith('##'):
                header.append(line)
                continue
            if line.startswith('#'):
                columns = line.rstrip().split('\t')
                try:
                    s_index = columns.index(sample)
                except ValueError:
                    sys.stderr.write(
                        "Please input valid VCF, format field for " + sample +
                        " not found in VCF")
                    sys.exit(1)
                # Only the requested sample is kept in the output header.
                header.append('\t'.join(columns[:9] + [sample]))
                continue
            # First record line: the header is complete.
            in_header = False
            emit_header()
        v = line.rstrip().split('\t')
        # Reached only when records appear without any #CHROM line; a #CHROM
        # line lacking the sample has already exited above.
        if s_index == -1:
            sys.stderr.write("Input a valid sample name: " + sample +
                             " not found in a provided VCF")
            sys.exit(1)
        v = v[:9] + [v[s_index]]
        if not any("SVTYPE=BND" in s for s in v):
            cn_value = str(cn_list[i])
            # Compare whole FORMAT keys: a substring test would mistake keys
            # such as "CNQ" for "CN" and then fail to find the CN index.
            format_keys = v[8].rstrip().split(":")
            if "CN" in format_keys:
                gts = v[9].rstrip().split(":")
                gts[format_keys.index("CN")] = cn_value
                v[9] = ":".join(gts)
            else:
                v[8] = v[8] + ":CN"
                v[9] = v[9] + ":" + cn_value
            i += 1
        # write the VCF
        vcf_out.write('\t'.join(v) + '\n')
    # A VCF with header lines but no records must still produce its header.
    if in_header and header:
        emit_header()
    vcf_out.close()
    return
Exemplo n.º 2
0
def write_copynumber(vcf_file, sample, vcf_out, cn_list):
    """Emit a one-sample VCF whose non-BND records carry copy-number values.

    If ``cn_list`` contains ``-1`` a warning is written to stderr and every
    value is replaced by ``'.'``. Header lines are gathered and re-emitted
    through ``Vcf`` with a ``CN`` FORMAT definition once the first record is
    seen. Each record keeps its first nine columns plus the column of
    ``sample``; records without ``SVTYPE=BND`` are handed to
    ``update_line_copynumber`` along with the running index into
    ``cn_list``. ``vcf_out`` is closed at the end.

    The process exits with status 1 when ``sample`` is not a column of the
    ``#CHROM`` line or when a record arrives before any ``#CHROM`` line.
    """
    reading_header = True
    header_lines = []
    vcf = Vcf()
    cn_pos = 0
    sample_col = -1
    if -1 in cn_list:
        sys.stderr.write(
            'cnvnator was unable to produce a copynumber value for one or more chromosomes. All copynumber values will be set to missing.'
        )
        cn_list = ['.'] * len(cn_list)

    for line in vcf_file:
        if reading_header:
            if line[0] == '#':
                if line[1] == '#':
                    # meta-information line, kept verbatim
                    header_lines.append(line)
                else:
                    # column header: locate the sample and drop the others
                    columns = line.rstrip().split('\t')
                    try:
                        sample_col = columns.index(sample)
                    except ValueError:
                        sys.stderr.write(
                            "Please input valid VCF, format field for {0} not found in VCF"
                            .format(sample))
                        sys.exit(1)
                    kept = columns[:9] + [sample]
                    header_lines.append('\t'.join(str(c) for c in kept))
                continue
            # first record: flush the header before any data
            reading_header = False
            vcf.add_header(header_lines)
            vcf.add_format('CN', 1, 'Float',
                           'Copy number of structural variant segment.')
            vcf_out.write(vcf.get_header() + '\n')

        fields = line.rstrip().split('\t')
        # Triggers when records show up without a #CHROM line having been read.
        if sample_col == -1:
            sys.stderr.write(
                "Input a valid sample name: {0} not found in a provided VCF".
                format(sample))
            sys.exit(1)
        fields = fields[:9] + [fields[sample_col]]

        is_bnd = any("SVTYPE=BND" in column for column in fields)
        if not is_bnd:
            update_line_copynumber(fields, cn_list, cn_pos)
            cn_pos += 1

        vcf_out.write('\t'.join(fields) + '\n')

    vcf_out.close()
    return
Exemplo n.º 3
0
def _set_copynumber(fields, cn_value):
    """Store ``cn_value`` as the CN entry of a single-sample record in place.

    ``fields[8]`` is the FORMAT column and ``fields[9]`` the sample column.
    An existing ``CN`` key is overwritten; otherwise ``CN`` is appended to
    both columns. Keys are compared whole, so ``CNQ`` is not taken for ``CN``.
    """
    format_keys = fields[8].rstrip().split(":")
    if "CN" in format_keys:
        sample_values = fields[9].rstrip().split(":")
        sample_values[format_keys.index("CN")] = cn_value
        fields[9] = ":".join(sample_values)
    else:
        fields[8] = fields[8] + ":CN"
        fields[9] = fields[9] + ":" + cn_value


def write_copynumber(vcf_file, sample, vcf_out, cn_list):
    """Write ``vcf_file`` restricted to ``sample`` with CN annotations added.

    Header lines are buffered and written through ``Vcf`` with an added
    ``CN`` FORMAT definition. Each record is reduced to its first nine
    columns plus the ``sample`` column. Records not containing
    ``SVTYPE=BND`` receive the next entry of ``cn_list`` as ``CN``; BND
    records are copied without consuming an entry.

    Args:
        vcf_file: Iterable of VCF lines, header lines first.
        sample: Name of the sample column to keep.
        vcf_out: Writable handle; closed before returning.
        cn_list: Copy-number values in record order (non-BND records only).

    Raises:
        SystemExit: With status 1 if ``sample`` is missing from the
            ``#CHROM`` line or a record precedes any ``#CHROM`` line.
    """
    header = []
    vcf = Vcf()
    header_written = False
    next_cn = 0
    s_index = -1

    def flush_header():
        vcf.add_header(header)
        vcf.add_format('CN', 1, 'Float', 'Copy number of structural variant segment.')
        vcf_out.write(vcf.get_header() + '\n')

    for line in vcf_file:
        if not header_written:
            if line.startswith('##'):
                header.append(line)
                continue
            if line.startswith('#'):
                names = line.rstrip().split('\t')
                if sample not in names:
                    sys.stderr.write("Please input valid VCF, format field for " + sample + " not found in VCF")
                    sys.exit(1)
                s_index = names.index(sample)
                # The output header lists only the requested sample.
                header.append('\t'.join(names[:9] + [sample]))
                continue
            flush_header()
            header_written = True

        v = line.rstrip().split('\t')
        # Still -1 only if no #CHROM line preceded this record.
        if s_index == -1:
            sys.stderr.write("Input a valid sample name: " + sample + " not found in a provided VCF")
            sys.exit(1)
        v = v[:9] + [v[s_index]]
        if not any("SVTYPE=BND" in s for s in v):
            _set_copynumber(v, str(cn_list[next_cn]))
            next_cn += 1
        vcf_out.write('\t'.join(v) + '\n')

    # Header-only input: emit the header instead of leaving the output empty.
    if not header_written and header:
        flush_header()
    vcf_out.close()
    return
Exemplo n.º 4
0
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    """Copy a VCF, adding refined-genotype annotations to DEL records.

    Records whose ``SVTYPE`` is not ``DEL``, and DEL records whose INFO
    ``AF`` is below 0.01, are written through unchanged. For every other
    record the frame returned by ``load_df`` is passed to ``recluster``; the
    result is appended to the diagnostics CSV and used to set INFO
    ``MEDGQR``/``Q10GQR`` and per-sample FORMAT ``GTR``/``GQR``. Samples that
    are absent from the reclustered table get ``GTR`` ``./.`` and ``GQR`` 0.

    Args:
        vcf_in: Handle yielding VCF lines, header first.
        vcf_out: Writable handle for the annotated VCF.
        diag_outfile: Path of the diagnostics CSV to create.
        gender_file: Handle of tab-separated lines; column 1 is used as the
            key and column 2 is parsed as an integer.
        exclude_file: Optional handle, one entry per line, forwarded to
            ``load_df`` (presumably sample names to leave out -- confirm
            against ``load_df``).

    All handles, including the diagnostics file, are closed before the
    function returns or propagates an exception.
    """
    vcf = Vcf()
    header = []
    in_header = True

    def emit_header():
        # Register the input header plus the fields this function adds.
        vcf.add_header(header)
        vcf.add_info('MEDGQR', '1', 'Float',
                     'Median quality for refined GT')
        vcf.add_info('Q10GQR', '1', 'Float',
                     'Q10 quality for refined GT')
        vcf.add_format('GQR', 1, 'Float',
                       'Quality of refined genotype.')
        vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
        vcf_out.write(vcf.get_header() + '\n')

    try:
        sex = {}
        for line in gender_file:
            v = line.rstrip().split('\t')
            sex[v[0]] = int(v[1])

        exclude = []
        if exclude_file is not None:
            exclude = [line.rstrip() for line in exclude_file]

        with open(diag_outfile, 'w', 4096) as outf:
            # The CSV column header is written with the first table only.
            write_csv_header = True

            for line in vcf_in:
                if in_header:
                    if line.startswith('#'):
                        header.append(line)
                        continue
                    in_header = False
                    emit_header()

                v = line.rstrip().split('\t')
                svtype = None
                for x in v[7].split(';'):
                    if x.startswith('SVTYPE='):
                        svtype = x.split('=')[1]
                        break
                # Only DEL records are refined; all others pass through.
                if svtype != 'DEL':
                    vcf_out.write(line)
                    continue

                var = Variant(v, vcf)
                sys.stderr.write("%s\n" % var.var_id)

                af = float(var.get_info('AF'))
                sys.stderr.write("%f\n" % af)
                if af < 0.01:
                    vcf_out.write(line)
                    continue

                df = load_df(var, exclude, sex)
                recdf = recluster(df)
                recdf.to_csv(outf, header=write_csv_header)
                write_csv_header = False

                var.set_info(
                    "MEDGQR",
                    '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
                var.set_info(
                    "Q10GQR",
                    '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
                recdf.set_index('sample', inplace=True)
                for s in var.sample_list:
                    if s in recdf.index:
                        var.genotype(s).set_format("GTR", recdf.loc[s, 'GTR'])
                        var.genotype(s).set_format(
                            "GQR", '{:.2f}'.format(recdf.loc[s, 'gq_re']))
                    else:
                        var.genotype(s).set_format("GTR", "./.")
                        var.genotype(s).set_format("GQR", 0)
                vcf_out.write(
                    var.get_var_string(use_cached_gt_string=False) + '\n')

            # Header-only input: still emit the augmented header.
            if in_header and header:
                emit_header()
    finally:
        vcf_out.close()
        vcf_in.close()
        gender_file.close()
        if exclude_file is not None:
            exclude_file.close()
    return
Exemplo n.º 5
0
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    """Stream a VCF and attach refined genotypes (GTR/GQR) to DEL records.

    ``gender_file`` is read into a mapping from its first column to the
    integer in its second column; ``exclude_file`` (optional) into a list of
    stripped lines. Non-DEL records and DEL records with ``AF`` < 0.01 are
    copied verbatim. Remaining records go through ``load_df`` and
    ``recluster``; the resulting table is appended to ``diag_outfile`` and
    drives the ``MEDGQR``/``Q10GQR`` INFO values and the per-sample
    ``GTR``/``GQR`` FORMAT values. All handles are closed at the end.
    """
    vcf = Vcf()
    header_lines = []
    header_done = False

    gender_rows = (row.rstrip().split('\t') for row in gender_file)
    sex = {cols[0]: int(cols[1]) for cols in gender_rows}

    if exclude_file is None:
        exclude = []
    else:
        exclude = [entry.rstrip() for entry in exclude_file]

    diag = open(diag_outfile, 'w', 4096)
    # Column names go into the CSV only with the first table written.
    csv_header_pending = True

    for line in vcf_in:
        if not header_done:
            if line[0] == "#":
                header_lines.append(line)
                continue
            header_done = True
            vcf.add_header(header_lines)
            vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
            vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
            vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
            vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
            vcf_out.write(vcf.get_header() + '\n')

        fields = line.rstrip().split('\t')
        svtype = next(
            (tag.split('=')[1]
             for tag in fields[7].split(';')
             if tag.startswith('SVTYPE=')),
            None)
        # anything that is not a DEL is passed through untouched
        if svtype != 'DEL':
            vcf_out.write(line)
            continue

        var = Variant(fields, vcf)
        sys.stderr.write("%s\n" % var.var_id)

        sys.stderr.write("%f\n" % float(var.get_info('AF')))
        if float(var.get_info('AF')) < 0.01:
            vcf_out.write(line)
            continue

        recdf = recluster(load_df(var, exclude, sex))
        recdf.to_csv(diag, header=csv_header_pending)
        csv_header_pending = False

        first_row = recdf.iloc[0, :]
        var.set_info("MEDGQR", '{:.2f}'.format(first_row.loc['med_gq_re']))
        var.set_info("Q10GQR", '{:.2f}'.format(first_row.loc['q10_gq_re']))
        recdf.set_index('sample', inplace=True)
        for s in var.sample_list:
            if s in recdf.index:
                gtr = recdf.loc[s, 'GTR']
                gqr = '{:.2f}'.format(recdf.loc[s, 'gq_re'])
            else:
                gtr = "./."
                gqr = 0
            var.genotype(s).set_format("GTR", gtr)
            var.genotype(s).set_format("GQR", gqr)
        vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    diag.close()
    if exclude_file is not None:
        exclude_file.close()
    return
Exemplo n.º 6
0
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file, batch_file):
    """Copy a VCF, replacing GT/GQ of DEL and MEI records with refined values.

    Records whose ``SVTYPE`` is neither ``DEL`` nor ``MEI``, and DEL/MEI
    records whose INFO ``AF`` is below 0.01, are written through unchanged.
    For the rest, the frame returned by ``load_df`` is passed to
    ``recluster``; the result is appended to the diagnostics CSV and used to
    set INFO ``MEDGQR``/``Q10GQR``. For each sample the current ``GT``/``GQ``
    are saved as ``GTO``/``GQO`` and then overwritten with the reclustered
    values, or with ``./.`` and 0 when the sample is absent from the table.

    Args:
        vcf_in: Handle yielding VCF lines, header first.
        vcf_out: Writable handle for the refined VCF.
        diag_outfile: Path of the diagnostics CSV to create.
        gender_file: Handle of tab-separated lines; column 1 is the key and
            column 2 is parsed as an integer.
        exclude_file: Optional handle, one entry per line, forwarded to
            ``load_df``.
        batch_file: Optional handle of tab-separated lines mapping column 1
            to a batch label in column 2; forwarded to ``load_df``.

    Raises:
        RuntimeError: If ``batch_file`` uses the reserved label ``None``.

    All handles, including ``batch_file`` and the diagnostics file, are
    closed before the function returns or propagates an exception.
    """
    vcf = Vcf()
    header = []
    in_header = True

    def emit_header():
        # Register the input header plus the fields this function adds.
        vcf.add_header(header)
        vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
        vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
        vcf.add_format('GQO', 1, 'Integer', 'Quality of original genotype')
        vcf.add_format('GTO', 1, 'String', 'Genotype before refinement')
        vcf_out.write(vcf.get_header() + '\n')

    try:
        sex = {}
        for line in gender_file:
            v = line.rstrip().split('\t')
            sex[v[0]] = int(v[1])

        exclude = []
        if exclude_file is not None:
            exclude = [line.rstrip() for line in exclude_file]

        batch = dict()
        if batch_file is not None:
            for line in batch_file:
                fields = line.rstrip().split('\t')
                if fields[1] == 'None':
                    raise RuntimeError('Batch file contains a batch label of None. This label is reserved.')
                batch[fields[0]] = fields[1]

        with open(diag_outfile, 'w', 4096) as outf:
            # The CSV column header is written with the first table only.
            write_csv_header = True

            for line in vcf_in:
                if in_header:
                    if line.startswith('#'):
                        header.append(line)
                        continue
                    in_header = False
                    emit_header()

                v = line.rstrip().split('\t')
                svtype = None
                for x in v[7].split(';'):
                    if x.startswith('SVTYPE='):
                        svtype = x.split('=')[1]
                        break
                # Only DEL and MEI are refined.
                # DUPs can be quite complicated in their allelic structure
                # and thus less amenable to refinement by clustering in many
                # cases; INV and BNDs are also unclear.
                if svtype not in ('DEL', 'MEI'):
                    vcf_out.write(line)
                    continue

                var = Variant(v, vcf)
                sys.stderr.write("%s\n" % var.var_id)

                af = float(var.get_info('AF'))
                sys.stderr.write("%f\n" % af)
                if af < 0.01:
                    vcf_out.write(line)
                    continue

                df = load_df(var, exclude, sex, batch)
                recdf = recluster(df)
                recdf.to_csv(outf, header=write_csv_header)
                write_csv_header = False

                var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
                var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
                recdf.set_index('sample', inplace=True)
                for s in var.sample_list:
                    g = var.genotype(s)
                    # Preserve the original call before overwriting it.
                    g.set_format("GTO", g.get_format("GT"))
                    g.set_format("GQO", g.get_format("GQ"))
                    if s in recdf.index:
                        g.set_format("GT", recdf.loc[s, 'GTR'])
                        g.set_format("GQ", '{:.0f}'.format(recdf.loc[s, 'gq_re']))
                    else:
                        g.set_format("GT", "./.")
                        g.set_format("GQ", 0)
                vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

            # Header-only input: still emit the augmented header.
            if in_header and header:
                emit_header()
    finally:
        vcf_out.close()
        vcf_in.close()
        gender_file.close()
        if exclude_file is not None:
            exclude_file.close()
        if batch_file is not None:
            batch_file.close()
    return