Пример #1
0
def sname_filter(input_stream, filter_file, output_stream, complement):
    '''
    This reads a VCF stream, determines if the line overlaps any from the filter_file by sname and outputs.
    '''
    filter_list = load_filter_file(filter_file)

    vcf = Vcf()
    in_header = True
    header_lines = list()
    sample_list = None
    for line in input_stream:
        if in_header:
            header_lines.append(line)
            if line[0:6] == '#CHROM':
                in_header = False
                vcf.add_header(header_lines)
                vcf.add_info('FOUND', '.', 'String',
                             'Variant id in other file')
                output_stream.write(vcf.get_header() + '\n')
        else:
            v = Variant(line.rstrip().split('\t'), vcf)
            sname_set = set_from_string(v.get_info('SNAME'))
            found = overlapping_ids(sname_set, filter_list)
            if bool(found) != complement:
                v.set_info('FOUND', ','.join(found))
                output_stream.write(v.get_var_string() + '\n')
Пример #2
0
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file):

    vcf = Vcf()
    header = []
    in_header = True
    sex = {}

    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    outf = open(diag_outfile, 'w', 4096)
    ct = 1

    for line in vcf_in:
        if in_header:
            if line[0] == "#":
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf.add_info('SIL_GT_AVG', '1', 'Float',
                             'Average silhouette of genotype clusters')
                #vcf.add_format('SIL_GT', '1', 'Float', 'Per-sample genotype cluster silhouette')
                vcf_out.write(vcf.get_header() + '\n')

        var = Variant(line.rstrip().split('\t'), vcf)
        df = load_df(var, sex)
        df1 = get_silhouette(df)

        sil_avg = df1.iloc[0, df1.columns.get_loc('sil_gt_avg')]
        #sil_ind=df1.loc[:, 'sil_gt']
        var.info['SIL_GT_AVG'] = '%0.2f' % sil_avg
        vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

        if ct == 1:
            df1.to_csv(outf, header=True)
            ct += 1
        else:
            df1.to_csv(outf, header=False)

    vcf_out.close()
    vcf_in.close()
    outf.close()
    gender_file.close()

    return
Пример #3
0
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file):

    vcf = Vcf()
    header = []
    in_header = True
    sex={}

    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    outf=open(diag_outfile, 'w', 4096)
    ct=1
    
    for line in vcf_in:
        if in_header:
            if line[0] == "#":
               header.append(line)
               continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf.add_info('SIL_GT_AVG', '1', 'Float', 'Average silhouette of genotype clusters')
                #vcf.add_format('SIL_GT', '1', 'Float', 'Per-sample genotype cluster silhouette')
                vcf_out.write(vcf.get_header() + '\n')

        var = Variant(line.rstrip().split('\t'), vcf)
        df=load_df(var,  sex)
        df1=get_silhouette(df)

        sil_avg=df1.iloc[0, df1.columns.get_loc('sil_gt_avg')]
        #sil_ind=df1.loc[:, 'sil_gt']
        var.info['SIL_GT_AVG'] = '%0.2f' % sil_avg
        vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')
        
        if ct==1:
            df1.to_csv(outf, header=True)
            ct += 1
        else:
            df1.to_csv(outf, header=False)

    vcf_out.close()
    vcf_in.close()
    outf.close()
    gender_file.close()

    return
Пример #4
0
    def execute(self, output_handle=sys.stdout):
        in_header = True
        header = []
        vcf = Vcf()
        vcf_out = output_handle

        # read input VCF
        for line in self.vcf_stream:
            if in_header:
                if line.startswith('##'):
                    header.append(line) 
                    continue
                elif line.startswith('#CHROM'):
                    v = line.rstrip().split('\t')
                    header.append('\t'.join(v))

                    in_header = False
                    vcf.add_header(header)
                    
                    vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
                    vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
                    vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')

                    # write header
                    vcf_out.write(vcf.get_header() + '\n')
                    #vcf_out.write('\t' + '\t'.join(v[8:]) + '\n')
                continue

            v = line.rstrip().split('\t')
            var = Variant(v, vcf)

            # extract genotypes from VCF
            num_alt = len(var.alt.split(','))
            alleles = [0] * (num_alt + 1)
            num_samp = 0
            sum_sq = 0.0

            for gt in var.genotypes():
                gt_string = gt.get_format('GT')

                if '.' not in gt_string:
                    indexes = self.numeric_alleles(gt_string)

                    for i in indexes:
                        alleles[i] += 1

                    # iterate the number of non-reference samples
                    if sum(indexes) > 0:
                        num_samp += 1
                        try:
                            sum_sq += float(gt.get_format('SQ'))
                        except KeyError:
                            pass

            allele_sum = float(sum(alleles))
            allele_freq = ['.'] * len(alleles)

            # populate AF
            if allele_sum > 0:
                for i in xrange(len(alleles)):
                    allele_freq[i] = alleles[i] / allele_sum
                var.info['AF'] = ','.join(map(str, ['%.4g' % a for a in allele_freq[1:]]))
            else:
                var.info['AF'] = ','.join(map(str, allele_freq[1:]))
            
            # populate NSAMP
            var.info['NSAMP'] = num_samp
            if num_samp > 0:
                msq = '%0.2f' % (sum_sq / num_samp)
            else:
                msq = '.'
            var.info['MSQ'] = msq

            # after all samples have been processed, write
            vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')
        vcf_out.close()
Пример #5
0
def l_cluster_by_line(file_name,
                      tempdir,
                      percent_slop=0,
                      fixed_slop=0,
                      use_product=False,
                      include_genotypes=False,
                      weighting_scheme='unweighted'):

    v_id = 0

    in_header = True
    header = []
    vcf = Vcf()
    vcf_out = sys.stdout

    with InputStream(file_name, tempdir) as vcf_stream:

        BP_l = []
        BP_sv_type = ''
        BP_max_end_l = -1
        BP_chr_l = ''
        sample_order = []

        for line in vcf_stream:

            if in_header:

                if line.startswith('##'):
                    header.append(line)
                    continue

                elif line.startswith('#CHROM'):
                    v = line.rstrip().split('\t')
                    for headline in header:
                        if headline[:8] == '##SAMPLE':
                            sample_order.append(headline.rstrip()[13:-1])
                    hline = ''
                    if include_genotypes:
                        v.extend(sample_order)
                        hline = '\t'.join(v)
                    else:
                        v = v[:8]
                        hline = '\t'.join(v)
                    header.append(hline)
                    in_header = False
                    vcf.add_header(header)
                    vcf.add_info('ALG', '1', 'String',
                                 'Algorithm used to merge this breakpoint')

                    if include_genotypes:
                        vcf_out.write(vcf.get_header() + '\n')
                    else:
                        vcf_out.write(vcf.get_header(False) + '\n')

                continue

            b = Breakpoint(l_bp.parse_vcf_record(line),
                           percent_slop=percent_slop,
                           fixed_slop=fixed_slop)
            if (len(BP_l) == 0) or ((b.left.start <= BP_max_end_l) and
                                    (b.left.chrom == BP_chr_l) and
                                    (b.sv_type == BP_sv_type)):
                BP_l.append(b)
                BP_max_end_l = max(BP_max_end_l, b.left.end)
                BP_chr_l = b.left.chrom
                BP_sv_type = b.sv_type

            else:
                v_id = r_cluster(BP_l, sample_order, v_id, use_product, vcf,
                                 vcf_out, include_genotypes, weighting_scheme)
                BP_l = [b]
                BP_max_end_l = b.left.end
                BP_sv_type = b.sv_type
                BP_chr_l = b.left.chrom

        if len(BP_l) > 0:
            v_id = r_cluster(BP_l, sample_order, v_id, use_product, vcf,
                             vcf_out, include_genotypes, weighting_scheme)
Пример #6
0
def varLookup(aFile, bFile, bedpe_out, max_distance, pass_prefix, cohort_name):
    # FIXME The following code is heavily duplicated with vcftobedpe and bedpetovcf. Harmonize!!!
    bList = list()
    headerObj = Vcf()  #co-opt the VCF header object
    if cohort_name is None:
        cohort_name = str(str(bFile).split('/')[-1])

    if bFile == "stdin":
        bData = sys.stdin
    elif bFile.endswith('.gz'):
        bData = gzip.open(bFile, 'rb')
    else:
        bData = open(bFile, 'r')
    for bLine in bData:
        if bLine.startswith(pass_prefix):
            continue
        bentry = Bedpe(bLine.rstrip().split('\t'))
        if bentry.af is None:
            sys.stderr.write(
                'No allele frequency for variant found in -b file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n'
            )
            sys.exit(1)
        bList.append(bentry)

    if aFile == "stdin":
        aData = sys.stdin
    elif aFile.endswith('.gz'):
        aData = gzip.open(aFile, 'rb')
    else:
        aData = open(aFile, 'r')
    in_header = True
    header_lines = []
    sample_list = None
    for aLine in aData:
        if pass_prefix is not None and aLine.startswith(pass_prefix):
            if aLine[0] == '#' and aLine[1] != '#':
                sample_list = aLine.rstrip().split('\t', 14)[-1]
            else:
                header_lines.append(aLine)
            continue
        else:
            if in_header == True:
                headerObj.add_header(header_lines)
                headerObj.add_info(
                    cohort_name + '_AF', '.', 'Float',
                    'Allele frequency(ies) for matching variants found in the '
                    + cohort_name + ' vcf' + ' (' +
                    str(str(bFile).split('/')[-1]) + ')')
                headerObj.add_info(
                    cohort_name + '_VarID', '.', 'Integer',
                    'List of Variant ID(s) for matching variants found in the '
                    + cohort_name + ' vcf' + ' (' +
                    str(str(bFile).split('/')[-1]) + ')')

                header = headerObj.get_header()
                bedpe_out.write(header[:header.rfind('\n')] + '\n')
                if len(sample_list) > 0:
                    bedpe_out.write('\t'.join([
                        '#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B',
                        'END_B', 'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE',
                        'FILTER', 'INFO_A', 'INFO_B', sample_list
                    ]) + '\n')
                else:
                    bedpe_out.write('\t'.join([
                        '#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B',
                        'END_B', 'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE',
                        'FILTER', 'INFO_A', 'INFO_B'
                    ]) + '\n')
                in_header = False
            a = Bedpe(aLine.rstrip().split('\t'))
            if a.af is None:
                sys.stderr.write(
                    'No allele frequency for variant found in -a file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n'
                )
                sys.exit(1)
            for b in bList:
                add(a, b, max_distance)
            bedpe_out.write(get_var_string(a, cohort_name) + '\n')
Пример #7
0
def varLookup(aFile, bFile, bedpe_out, max_distance, pass_prefix, cohort_name):
    # FIXME The following code is heavily duplicated with vcftobedpe and bedpetovcf. Harmonize!!!
    bList = list()
    headerObj=Vcf() #co-opt the VCF header object
    if cohort_name is None:
        cohort_name=str(str(bFile).split('/')[-1])
        
    if bFile == "stdin":
        bData = sys.stdin
    elif bFile.endswith('.gz'):
        bData = gzip.open(bFile, 'rb')
    else:
        bData = open(bFile, 'r')
    for bLine in bData:
        if bLine.startswith(pass_prefix):
            continue
        bentry = Bedpe(bLine.rstrip().split('\t'))
        if bentry.af is None:
            sys.stderr.write('No allele frequency for variant found in -b file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
            sys.exit(1)
        bList.append(bentry)
    
    if aFile == "stdin":
        aData = sys.stdin
    elif aFile.endswith('.gz'):
        aData = gzip.open(aFile, 'rb')
    else:
        aData = open(aFile, 'r')
    in_header=True    
    header_lines = []
    sample_list = None
    for aLine in aData:
        if pass_prefix is not None and aLine.startswith(pass_prefix):
            if aLine[0] == '#' and aLine[1] != '#':
                sample_list = aLine.rstrip().split('\t', 14)[-1]
            else:
                header_lines.append(aLine)
            continue
        else:
            if in_header == True:
                headerObj.add_header(header_lines)
                headerObj.add_info(cohort_name + '_AF', '.', 'Float', 'Allele frequency(ies) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + str(str(bFile).split('/')[-1]) + ')' )
                headerObj.add_info(cohort_name + '_VarID', '.', 'Integer', 'List of Variant ID(s) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + str(str(bFile).split('/')[-1]) + ')' )

                header = headerObj.get_header()
                bedpe_out.write(header[:header.rfind('\n')] + '\n')                
                if len(sample_list) > 0:
                    bedpe_out.write('\t'.join(['#CHROM_A',
                                               'START_A',
                                               'END_A',
                                               'CHROM_B',
                                               'START_B',
                                               'END_B',
                                               'ID',
                                               'QUAL',
                                               'STRAND_A',
                                               'STRAND_B',
                                               'TYPE',
                                               'FILTER',
                                               'INFO_A','INFO_B',
                                               sample_list]
                                             ) + '\n')
                else:
                    bedpe_out.write('\t'.join(['#CHROM_A',
                                               'START_A',
                                               'END_A',
                                               'CHROM_B',
                                               'START_B',
                                               'END_B',
                                               'ID',
                                               'QUAL',
                                               'STRAND_A',
                                               'STRAND_B',
                                               'TYPE',
                                               'FILTER',
                                               'INFO_A','INFO_B']
                                              ) + '\n')
                in_header=False
            a = Bedpe(aLine.rstrip().split('\t'))
            if a.af is None:
                sys.stderr.write('No allele frequency for variant found in -a file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
                sys.exit(1)
            for b in bList:
                add(a,b,max_distance)
            bedpe_out.write(get_var_string(a, cohort_name))
Пример #8
0
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):

    vcf = Vcf()
    header = []
    in_header = True
    sex = {}

    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())

    outf = open(diag_outfile, 'w', 4096)
    ct = 1

    for line in vcf_in:
        if in_header:
            if line[0] == "#":
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf.add_info('MEDGQR', '1', 'Float',
                             'Median quality for refined GT')
                vcf.add_info('Q10GQR', '1', 'Float',
                             'Q10 quality for refined GT')
                vcf.add_format('GQR', 1, 'Float',
                               'Quality of refined genotype.')
                vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
                vcf_out.write(vcf.get_header() + '\n')

        v = line.rstrip().split('\t')
        info = v[7].split(';')
        svtype = None
        for x in info:
            if x.startswith('SVTYPE='):
                svtype = x.split('=')[1]
                break
        # bail if not DEL or DUP prior to reclassification
        if svtype not in ['DEL']:
            vcf_out.write(line)
            continue

        var = Variant(v, vcf)
        sys.stderr.write("%s\n" % var.var_id)

        sys.stderr.write("%f\n" % float(var.get_info('AF')))
        if float(var.get_info('AF')) < 0.01:
            vcf_out.write(line)
        else:
            df = load_df(var, exclude, sex)
            recdf = recluster(df)
            if ct == 1:
                recdf.to_csv(outf, header=True)
                ct += 1
            else:
                recdf.to_csv(outf, header=False)
            var.set_info("MEDGQR",
                         '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
            var.set_info("Q10GQR",
                         '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
            recdf.set_index('sample', inplace=True)
            for s in var.sample_list:
                if s in recdf.index:
                    var.genotype(s).set_format("GTR", recdf.loc[s, 'GTR'])
                    var.genotype(s).set_format(
                        "GQR", '{:.2f}'.format(recdf.loc[s, 'gq_re']))
                else:
                    var.genotype(s).set_format("GTR", "./.")
                    var.genotype(s).set_format("GQR", 0)
            vcf_out.write(
                var.get_var_string(use_cached_gt_string=False) + '\n')

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    outf.close()
    if exclude_file is not None:
        exclude_file.close()
    return
Пример #9
0
    def execute(self, output_handle=sys.stdout):
        in_header = True
        header = []
        vcf = Vcf()
        vcf_out = output_handle

        # read input VCF
        for line in self.vcf_stream:
            if in_header:
                if line.startswith('##'):
                    header.append(line) 
                    continue
                elif line.startswith('#CHROM'):
                    v = line.rstrip().split('\t')
                    header.append('\t'.join(v))

                    in_header = False
                    vcf.add_header(header)
                    
                    vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
                    vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
                    vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')

                    # write header
                    vcf_out.write(vcf.get_header() + '\n')
                    #vcf_out.write('\t' + '\t'.join(v[8:]) + '\n')
                continue

            v = line.rstrip().split('\t')
            var = Variant(v, vcf)

            # extract genotypes from VCF
            num_alt = len(var.alt.split(','))
            alleles = [0] * (num_alt + 1)
            num_samp = 0
            sum_sq = 0.0

            for gt in var.genotypes():
                gt_string = gt.get_format('GT')

                if '.' not in gt_string:
                    indexes = self.numeric_alleles(gt_string)

                    for i in indexes:
                        alleles[i] += 1

                    # iterate the number of non-reference samples
                    if sum(indexes) > 0:
                        num_samp += 1
                        try:
                            sum_sq += float(gt.get_format('SQ'))
                        except KeyError:
                            pass

            allele_sum = float(sum(alleles))
            allele_freq = ['.'] * len(alleles)

            # populate AF
            if allele_sum > 0:
                for i in xrange(len(alleles)):
                    allele_freq[i] = alleles[i] / allele_sum
                var.info['AF'] = ','.join(map(str, ['%.4g' % a for a in allele_freq[1:]]))
            else:
                var.info['AF'] = ','.join(map(str, allele_freq[1:]))
            
            # populate NSAMP
            var.info['NSAMP'] = num_samp
            if num_samp > 0:
                msq = '%0.2f' % (sum_sq / num_samp)
            else:
                msq = '.'
            var.info['MSQ'] = msq

            # after all samples have been processed, write
            vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')
        vcf_out.close()
Пример #10
0
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):

    vcf = Vcf()
    header = []
    in_header = True
    sex={}
    
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())

    outf=open(diag_outfile, 'w', 4096)
    ct=1
    
    for line in vcf_in:
        if in_header:
            if line[0] == "#":
               header.append(line)
               continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
                vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
                vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
                vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
                vcf_out.write(vcf.get_header() + '\n')

        v = line.rstrip().split('\t')
        info = v[7].split(';')
        svtype = None
        for x in info:
            if x.startswith('SVTYPE='):
                svtype = x.split('=')[1]
                break
        # bail if not DEL or DUP prior to reclassification
        if svtype not in ['DEL']:
            vcf_out.write(line)
            continue
        
        var = Variant(v, vcf)
        sys.stderr.write("%s\n" % var.var_id)
        
        sys.stderr.write("%f\n" % float(var.get_info('AF')))
        if float(var.get_info('AF'))<0.01:
            vcf_out.write(line)
        else:
            df=load_df(var, exclude, sex)
            recdf=recluster(df)
            if ct==1:
                recdf.to_csv(outf, header=True)
                ct += 1
            else:
              recdf.to_csv(outf, header=False)
            var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0,:].loc['med_gq_re']))
            var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0,:].loc['q10_gq_re']))
            recdf.set_index('sample', inplace=True)
            for s in var.sample_list:
                if s in recdf.index:
                    var.genotype(s).set_format("GTR", recdf.loc[s,'GTR'])
                    var.genotype(s).set_format("GQR", '{:.2f}'.format(recdf.loc[s,'gq_re']))
                else:
                    var.genotype(s).set_format("GTR", "./.")
                    var.genotype(s).set_format("GQR", 0)
            vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    outf.close()
    if exclude_file is not None:
        exclude_file.close()
    return
Пример #11
0
def run_from_args(args):

  vcf = Vcf()
  vcf_out=sys.stdout
  in_header = True
  header_lines = list()
  with su.InputStream(args.manta_vcf) as input_stream:
    for line in input_stream:
      if in_header:
        header_lines.append(line)
        if line[0:6] == '#CHROM':
          in_header=False
          vcf.add_header(header_lines)
          vcf.add_info('PRPOS', '1', 'String', 'Breakpoint probability dist')
          vcf.add_info('PREND', '1', 'String', 'Breakpoint probability dist')
          vcf.add_info('STRANDS', '.', 'String', 'Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--')
          vcf.add_info('SU', '.', 'Integer', 'Number of pieces of evidence supporting the variant across all samples')
          vcf.add_info('PE', '.', 'Integer', 'Number of paired-end reads supporting the variant across all samples')
          vcf.add_info('SR', '.', 'Integer', 'Number of split reads supporting the variant across all samples')
          vcf.add_info('INSLEN_ORIG', '.', 'Integer', 'Original insertion length')
          vcf.add_info('CIPOS95', '2', 'Integer', 'Confidence interval (95%) around POS for imprecise variants')
          vcf.add_info('CIEND95', '2', 'Integer', 'Confidence interval (95%) around END for imprecise variants')
          vcf.add_info('SECONDARY', '0', 'Flag', 'Secondary breakend in a multi-line variant')
          vcf_out.write(vcf.get_header()+'\n')
      else:
        v = Variant(line.rstrip().split('\t'), vcf)
        convert_variant(v, args.max_ins)
        vcf_out.write(v.get_var_string()+"\n")
Пример #12
0
def l_cluster_by_line(file_name, percent_slop=0, fixed_slop=0, use_product=False, include_genotypes=False, weighting_scheme='unweighted'):

    v_id = 0

    in_header = True
    header = []
    vcf = Vcf()
    vcf_out=sys.stdout

    with InputStream(file_name) as vcf_stream:

        BP_l = []
        BP_sv_type = ''
        BP_max_end_l = -1
        BP_chr_l = ''
        sample_order = []

        for line in vcf_stream:

            if in_header:

                if line.startswith('##'):
                    header.append(line)
                    continue

                elif line.startswith('#CHROM'):
                    v=line.rstrip().split('\t')
                    for headline in header:
                        if headline[:8] == '##SAMPLE':
                            sample_order.append(headline.rstrip()[13:-1])
                    hline=''
                    if include_genotypes :
                        v.extend(sample_order)
                        hline='\t'.join(v)
                    else :
                        v=v[:8]
                        hline='\t'.join(v)
                    header.append(hline)
                    in_header=False
                    vcf.add_header(header)
                    vcf.add_info('ALG', '1', 'String', 'Algorithm used to merge this breakpoint')

                    if include_genotypes:
                        vcf_out.write(vcf.get_header()+'\n')
                    else:
                        vcf_out.write(vcf.get_header(False)+'\n')

                continue

            b = Breakpoint(l_bp.parse_vcf_record(line), percent_slop=percent_slop, fixed_slop=fixed_slop)
            if (len(BP_l) == 0) or ((b.left.start <= BP_max_end_l) and (b.left.chrom == BP_chr_l) and (b.sv_type == BP_sv_type)):
                BP_l.append(b)
                BP_max_end_l = max(BP_max_end_l, b.left.end)
                BP_chr_l = b.left.chrom
                BP_sv_type = b.sv_type

            else:
                v_id = r_cluster(BP_l, sample_order, v_id, use_product, vcf, vcf_out, include_genotypes, weighting_scheme)
                BP_l = [b]
                BP_max_end_l = b.left.end
                BP_sv_type = b.sv_type
                BP_chr_l = b.left.chrom

        if len(BP_l) > 0:
            v_id = r_cluster(BP_l, sample_order, v_id, use_product, vcf, vcf_out, include_genotypes, weighting_scheme)
Пример #13
0
    def execute(self, output_handle=sys.stdout):
        in_header = True
        header = []
        vcf = Vcf()
        vcf_out = output_handle

        # read input VCF
        for line in self.vcf_stream:
            if in_header:
                if line.startswith('##'):
                    header.append(line) 
                    continue
                elif line.startswith('#CHROM'):
                    v = line.rstrip().split('\t')
                    header.append('\t'.join(v))

                    in_header = False
                    vcf.add_header(header)
                    
                    vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
                    vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
                    vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')

                    # write header
                    vcf_out.write(vcf.get_header() + '\n')
                    #vcf_out.write('\t' + '\t'.join(v[8:]) + '\n')
                continue

            v = line.rstrip().split('\t')
            var = Variant(v, vcf, fixed_genotypes=True)

            # extract genotypes from VCF
            num_alt = len(var.alt.split(','))
            alleles = [0] * (num_alt + 1)
            num_samp = 0

            gt = [var.genotype(s).get_format('GT') for s in var.sample_list]
            for gt_string in gt:

                if '.' in  gt_string:
                    continue
                gt = gt_string.split('/')
                if len(gt) == 1:
                    gt = gt_string.split('|')
                gt = map(int, gt)

                for i in xrange(len(gt)):
                    alleles[gt[i]] += 1

                # iterate the number of non-reference samples
                if sum(gt) > 0:
                    num_samp += 1

            allele_sum = float(sum(alleles))
            allele_freq = ['.'] * len(alleles)

            # populate AF
            if allele_sum > 0:
                for i in xrange(len(alleles)):
                    allele_freq[i] = alleles[i] / allele_sum
                var.info['AF'] = ','.join(map(str, ['%.4g' % a for a in allele_freq[1:]]))
            else:
                var.info['AF'] = ','.join(map(str, allele_freq[1:]))
            
            # populate NSAMP
            var.info['NSAMP'] = num_samp
            var.info['MSQ'] = self.calc_msq(var)

            # after all samples have been processed, write
            vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')
        vcf_out.close()
Пример #14
0
def l_cluster_by_line(file_name,
                      tempdir,
                      percent_slop=0,
                      fixed_slop=0,
                      use_product=False,
                      include_genotypes=False,
                      weighting_scheme='unweighted'):
    v_id = 0

    in_header = True
    header = []
    vcf = Vcf()
    vcf_out = sys.stdout

    with InputStream(file_name, tempdir) as vcf_stream:
        BP_l = []
        BP_sv_type = ''
        BP_max_end_l = -1
        BP_chr_l = ''
        sample_order = []

        for line in vcf_stream:
            if in_header:
                if line.startswith('##'):
                    header.append(line)
                    continue

                elif line.startswith('#CHROM'):
                    v = line.rstrip().split(
                        '\t')  # #CHROM line split -> list -D
                    for headline in header:
                        if headline[:8] == '##SAMPLE' and headline.rstrip(
                        )[13:-1] != 'VARIOUS':
                            sample_order.append(
                                headline.rstrip()[13:-1]
                            )  # maybe add sample name to samplr_order list. -D
                    hline = ''  # Parsed #CHROM line from 'v' -D
                    if include_genotypes:
                        v = v[:9]  # Remove possible VARIOUS -D
                        v.extend(sample_order)
                        hline = '\t'.join(v)
                    else:
                        v = v[:8]  # No FORMAT field here. -D
                        hline = '\t'.join(v)
                    header.append(hline)
                    in_header = False
                    vcf.add_header(header)
                    vcf.add_info('ALG', '1', 'String',
                                 'Algorithm used to merge this breakpoint')

                    if include_genotypes:
                        vcf_out.write(vcf.get_header() + '\n')
                    else:
                        vcf_out.write(
                            vcf.get_header(include_samples=False) +
                            '\n')  # Not including samples here. -D

                continue

            # Header is now parsed, then the main dish. -D

            b = Breakpoint(
                l_bp.parse_vcf_record(line),
                percent_slop=percent_slop,
                fixed_slop=fixed_slop)  # percent_slop and fixed_slop is 0. -D
            if (len(BP_l) == 0) or (
                (b.left.start <= BP_max_end_l) and
                (b.left.chrom == BP_chr_l) and (b.sv_type == BP_sv_type)
            ):  # Same chrom svtype and start is small than previous end. -D
                BP_l.append(b)
                BP_max_end_l = max(BP_max_end_l, b.left.end)
                BP_chr_l = b.left.chrom
                BP_sv_type = b.sv_type

            else:
                v_id = r_cluster(BP_l, sample_order, v_id, use_product, vcf,
                                 vcf_out, include_genotypes, weighting_scheme)
                BP_l = [b]
                BP_max_end_l = b.left.end
                BP_chr_l = b.left.chrom
                BP_sv_type = b.sv_type

        if len(BP_l) > 0:
            v_id = r_cluster(BP_l, sample_order, v_id, use_product, vcf,
                             vcf_out, include_genotypes, weighting_scheme)
Пример #15
0
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file, batch_file):

    vcf = Vcf()
    header = []
    in_header = True
    sex = {}

    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())

    batch = dict()
    if batch_file is not None:
        for line in batch_file:
            fields = line.rstrip().split('\t')
            if fields[1] == 'None':
                raise RuntimeError('Batch file contains a batch label of None. This label is reserved.')
            batch[fields[0]] = fields[1]

    outf = open(diag_outfile, 'w', 4096)
    ct = 1

    for line in vcf_in:
        if in_header:
            if line[0] == "#":
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
                vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
                vcf.add_format('GQO', 1, 'Integer', 'Quality of original genotype')
                vcf.add_format('GTO', 1, 'String', 'Genotype before refinement')
                vcf_out.write(vcf.get_header() + '\n')

        v = line.rstrip().split('\t')
        info = v[7].split(';')
        svtype = None
        for x in info:
            if x.startswith('SVTYPE='):
                svtype = x.split('=')[1]
                break
        # bail if not DEL prior to reclassification
        # DUPs can be quite complicated in their allelic structure
        # and thus less amenable to refinement by clustering in many cases
        # INV and BNDs are also unclear.
        # See earlier commits for code of previous attempts to refine these.
        if svtype not in ['DEL', 'MEI']:
            vcf_out.write(line)
            continue

        var = Variant(v, vcf)
        sys.stderr.write("%s\n" % var.var_id)

        sys.stderr.write("%f\n" % float(var.get_info('AF')))
        if float(var.get_info('AF')) < 0.01:
            vcf_out.write(line)
        else:
            df = load_df(var, exclude, sex, batch)
            recdf = recluster(df)
            if ct == 1:
                recdf.to_csv(outf, header=True)
                ct += 1
            else:
                recdf.to_csv(outf, header=False)
            var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
            var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
            recdf.set_index('sample', inplace=True)
            for s in var.sample_list:
                g = var.genotype(s)
                g.set_format("GTO", g.get_format("GT"))
                g.set_format("GQO", g.get_format("GQ"))
                if s in recdf.index:
                    var.genotype(s).set_format("GT", recdf.loc[s, 'GTR'])
                    var.genotype(s).set_format("GQ", '{:.0f}'.format(recdf.loc[s, 'gq_re']))
                else:
                    var.genotype(s).set_format("GT", "./.")
                    var.genotype(s).set_format("GQ", 0)
            vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    outf.close()
    if exclude_file is not None:
        exclude_file.close()
    return