Пример #1
0
def strelka_indel_af(vcf_file):
    """Print a tab-separated indel allele-frequency table for a Strelka VCF.

    The first output line is ``CHROM``, ``POS`` and the sample names of the
    VCF. Every following line holds the chromosome, the position and, for
    each sample, ``sum(TIR) / (sum(TIR) + sum(TAR))`` computed from the
    ``TAR`` and ``TIR`` call-data fields.

    Args:
        vcf_file: path of the VCF to read, or ``"-"`` to read from stdin.

    A sample whose TAR and TIR sums are both zero is reported as ``NA``
    instead of raising ``ZeroDivisionError``.
    """

    if vcf_file == "-":
        vcfreader = vcf.VCFReader(sys.stdin)
    else:
        assert os.path.exists(vcf_file)
        vcfreader = vcf.VCFReader(filename=vcf_file)

    # NOTE: pyvcf swallows first line, i.e. expects a header!
    print("CHROM\tPOS\t{}".format('\t'.join(vcfreader.samples)))
    for var in vcfreader:
        assert var.is_indel
        # Build the whole row first and emit it once: chaining
        # ``print ...,`` statements inserts soft spaces between the
        # columns, which corrupts the tab-separated output.
        columns = ["{}".format(var.CHROM), "{}".format(var.POS)]
        for call in var.samples:
            tar = sum(int(x) for x in call.data.TAR)
            tir = sum(int(x) for x in call.data.TIR)
            depth = tir + tar
            if depth:
                columns.append("{}".format(tir / float(depth)))
            else:
                # no supporting reads at all for this sample
                columns.append("NA")
        print('\t'.join(columns))
Пример #2
0
def filter_by_background(in_vcf, full_vcf, background, data):
    """Flag SV calls that are also present in background samples.

    Records of ``in_vcf`` and ``full_vcf`` are walked in lockstep. A record
    of ``in_vcf`` gets the ``InBackground`` filter when the matching
    ``full_vcf`` record has genotype type 0 for the sample of ``data``, or
    the same genotype type as any of the ``background`` samples. All records
    are written to ``<in_vcf base>-filter.vcf``.

    NOTE(review): the previous docstring stated that inversions are skipped,
    but no SV-type check exists in this function -- every record is tested.

    Args:
        in_vcf: VCF whose records are written (and possibly flagged).
        full_vcf: VCF providing the genotypes; read in parallel to in_vcf.
        background: items accepted by ``dd.get_sample_name``.
        data: item describing the sample of interest; also provides
            ``data["config"]`` for compression.

    Returns:
        Whatever ``vcfutils.bgzip_and_index`` returns for the filtered file.
    """
    Filter = collections.namedtuple('Filter', ['id', 'desc'])
    back_filter = Filter(id='InBackground',
                         desc='Rejected due to presence in background sample')
    out_file = "%s-filter.vcf" % utils.splitext_plus(in_vcf)[0]
    # out_file already ends in ".vcf", so the compressed product is
    # out_file + ".gz" (presumably what bgzip_and_index writes -- confirm);
    # checking out_file + ".vcf.gz" could never find an existing result.
    if not utils.file_uptodate(out_file, in_vcf) and not utils.file_uptodate(
            out_file + ".gz", in_vcf):
        # sample names do not change between records: resolve them once
        sample_name = dd.get_sample_name(data)
        background_names = [dd.get_sample_name(x) for x in background]
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                reader = vcf.VCFReader(filename=in_vcf)
                # register the filter so it is declared in the output header
                reader.filters["InBackground"] = back_filter
                full_reader = vcf.VCFReader(filename=full_vcf)
                writer = vcf.VCFWriter(out_handle, template=reader)
                for out_rec, rec in zip(reader, full_reader):
                    rec_type = rec.genotype(sample_name).gt_type
                    if rec_type == 0 or any(
                            rec_type == rec.genotype(bg_name).gt_type
                            for bg_name in background_names):
                        out_rec.add_filter("InBackground")
                    writer.write_record(out_rec)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Пример #3
0
def main_maize(ki11_snps=None, dirs=None):
    """Evaluate IsoPhase SNP calls of every locus directory against Ki11 data.

    Loads (unless supplied) the SNPs of ``B73Ki11.q20.vcf``, the coverage of
    ``Ki11.raw.mpileup`` and the repeat intervals listed in
    ``B73_RefV4.fa.repeat_list.txt``; then runs ``eval_isophase`` for every
    directory that contains ``phased.partial.vcf`` and writes the rows to
    ``evaled.isophase_SNP.txt``.

    Args:
        ki11_snps: optional preloaded mapping chrom -> pos -> VCF record.
        dirs: optional list of locus directories; defaults to
            ``glob.glob('by_loci/*size*/')``.

    Returns:
        The ``ki11_snps`` mapping, so it can be reused by a later call.
    """
    if ki11_snps is None:
        ki11_snps = defaultdict(lambda: {})  # chrom -> pos -> VCF record
        # ``with`` closes the input even if parsing fails halfway
        with open('B73Ki11.q20.vcf') as vcf_handle:
            for r in vcf.VCFReader(vcf_handle):
                ki11_snps[r.CHROM][r.POS] = r

    sys.stderr.write('Finished reading B73Ki11.q20.vcf.\n')

    ki11_shortread_cov = defaultdict(
        lambda: {})  # chrom -> pos -> short read cov
    # read the raw Ki11 pileup to get coverage in places where no SNPs were called
    for r in sp.MPileUpReader('Ki11.raw.mpileup'):
        if r is not None:
            ki11_shortread_cov[r.chr][r.pos] = r.cov
    sys.stderr.write("Fnished reading Ki11.raw.mpileup.\n")

    repeat_by_chrom = {}
    # read the Tandem Repeat Finder summary
    with open('B73_RefV4.fa.repeat_list.txt') as repeat_handle:
        for r in DictReader(repeat_handle, delimiter='\t'):
            if r['chrom'] not in repeat_by_chrom:
                repeat_by_chrom[r['chrom']] = IntervalTree()
            repeat_by_chrom[r['chrom']].add(int(r['start0']), int(r['end1']))

    sys.stderr.write('Finished reading B73_RefV4.fa.repeat_list.txt.\n')

    FIELDS = [
        'dir', 'chrom', 'pos', 'ref', 'alt_Short', 'alt_PB', 'in_Short',
        'in_PB', 'cov_Short', 'cov_PB', 'genomic_HP'
    ]
    if dirs is None:
        dirs = glob.glob('by_loci/*size*/')

    # the report is closed even when one of the loci raises
    with open('evaled.isophase_SNP.txt', 'w') as out_f:
        writer_f = DictWriter(out_f, FIELDS, delimiter='\t')
        writer_f.writeheader()

        for d1 in dirs:
            mpileup = os.path.join(d1, 'ccs.mpileup')
            mapfile = os.path.join(d1, 'fake.mapping.txt')
            vcffile = os.path.join(d1, 'phased.partial.vcf')
            nosnp = os.path.join(d1, 'phased.partial.NO_SNPS_FOUND')
            if not os.path.exists(vcffile):
                # a locus without a VCF must carry the NO_SNPS_FOUND marker
                assert os.path.exists(nosnp)
                sys.stderr.write(
                    ('Skipping {0} because no SNPs found.').format(d1) + '\n')
            else:
                sys.stderr.write(('Evaluating {0}.').format(d1) + '\n')
                # use lower min cov here becuz a few close cases where BQ
                # filtering lowered cov
                good_positions, cov_at_pos = get_positions_to_recover(
                    mapfile, mpileup, ki11_snps, min_cov=30)
                name = d1.split('/')[1]
                eval_isophase(vcffile, ki11_snps, good_positions, cov_at_pos,
                              repeat_by_chrom, ki11_shortread_cov, writer_f,
                              name)

    return ki11_snps
Пример #4
0
    def parse(cls, vcf_path):
        """Yield ``(record, effects)`` for every record of a VCF.

        ``effects`` is the list of ``SnpeffEffect`` objects parsed from the
        record's ``ANN`` INFO entries (empty when the record has no ``ANN``).
        When an effect annotated ``intragenic_variant`` exists and the first
        effect is an upstream/downstream gene variant, the first intragenic
        effect is moved to the front of the list.

        Args:
            vcf_path: a path (``.gz`` paths are opened with ``gzip``) or an
                object with a ``read`` attribute, used as the handle itself.

        The handle is closed when the generator finishes or is discarded,
        including a handle that was passed in by the caller.
        """
        if hasattr(vcf_path, "read"):
            h = vcf_path
        elif vcf_path.endswith(".gz"):
            h = gzip.open(vcf_path)
        else:
            h = open(vcf_path)

        try:
            for v in vcf.VCFReader(h):
                effects = [
                    SnpeffEffect.read(x)
                    for x in (v.INFO["ANN"] if "ANN" in v.INFO else [])
                ]
                intragenic_positions = [
                    i for i, x in enumerate(effects)
                    if "intragenic_variant" in x.annotation
                ]
                if intragenic_positions:
                    i = intragenic_positions[0]
                    if (("upstream_gene_variant" in effects[0].annotation) or
                        ("downstream_gene_variant" in effects[0].annotation)):
                        # Move effect i to the front and keep every other
                        # effect exactly once. The former slices
                        # ``effects[:i - 1] + effects[i:]`` dropped
                        # effects[i - 1] and repeated effects[i].
                        effects = ([effects[i]] + effects[:i] +
                                   effects[i + 1:])
                yield (v, effects)
        finally:
            h.close()
Пример #5
0
def main_maize(ki11_snps=None, dirs=None):
    """Evaluate IsoPhase SNP calls of every locus directory against Ki11 SNPs.

    Loads (unless supplied) the SNPs of ``B73Ki11.q20.vcf``, then runs
    ``eval_isophase`` for every directory containing ``phased.partial.vcf``
    and writes the tab-separated result to ``evaled.isophase_SNP.txt``.

    Args:
        ki11_snps: optional preloaded mapping chrom -> pos -> VCF record.
        dirs: optional list of locus directories; defaults to
            ``glob.glob('by_loci/*size*/')``.

    Returns:
        The ``ki11_snps`` mapping, so it can be reused by a later call.
    """
    if ki11_snps is None:
        ki11_snps = defaultdict(lambda: {})  # chrom -> pos -> VCF record
        # ``with`` closes the input even if parsing fails halfway
        with open('B73Ki11.q20.vcf') as vcf_handle:
            for r in vcf.VCFReader(vcf_handle):
                ki11_snps[r.CHROM][r.POS] = r

    sys.stderr.write('Finished reading B73Ki11.q20.vcf.\n')
    if dirs is None:
        dirs = glob.glob('by_loci/*size*/')

    # the report is closed even when one of the loci raises
    with open('evaled.isophase_SNP.txt', 'w') as out_f:
        out_f.write(
            'dir\tchrom\tpos\tref\talt_Short\talt_PB\tin_Short\tin_PB\n')
        for d1 in dirs:
            mpileup = os.path.join(d1, 'ccs.mpileup')
            mapfile = os.path.join(d1, 'fake.mapping.txt')
            vcffile = os.path.join(d1, 'phased.partial.vcf')
            nosnp = os.path.join(d1, 'phased.partial.NO_SNPS_FOUND')
            if not os.path.exists(vcffile):
                # a locus without a VCF must carry the NO_SNPS_FOUND marker
                assert os.path.exists(nosnp)
                sys.stderr.write(
                    ('Skipping {0} because no SNPs found.').format(d1) + '\n')
            else:
                sys.stderr.write(('Evaluating {0}.').format(d1) + '\n')
                # use lower min cov here becuz a few close cases where BQ
                # filtering lowered cov
                good_positions = get_positions_to_recover(
                    mapfile, mpileup, ki11_snps, min_cov=30)
                name = d1.split('/')[1]
                eval_isophase(vcffile, ki11_snps, good_positions, out_f, name)

    return ki11_snps
def main():
    """Convert the VCF named on the command line into SNP lines on stdout.

    Expects exactly one command-line argument: the path of an existing VCF.
    After the header written by ``print_vcf_header``, every ALT allele whose
    ``TYPE`` is ``snp`` is emitted through ``print_snp``; an ``mnp`` allele
    is split into one ``print_snp`` call per base. Other types are ignored.
    """
    assert len(sys.argv) == 2
    vcf_path = sys.argv[1]
    assert os.path.exists(vcf_path)

    print_vcf_header(sys.stdout)

    for rec in vcf.VCFReader(filename=vcf_path):
        assert len(rec.ALT) == len(rec.INFO['TYPE'])

        for alt_index, alt in enumerate(rec.ALT):
            var_type = rec.INFO['TYPE'][alt_index]
            alt_seq = str(alt)
            if var_type == 'snp':
                print_snp(sys.stdout, rec.CHROM, rec.POS, rec.REF, alt_seq,
                          rec.QUAL, "snp")
                continue
            if var_type != 'mnp':
                # FIXME handle indels
                continue

            # an MNP is reported as a run of single-base substitutions
            assert len(rec.REF) > 1
            assert len(rec.REF) == len(alt_seq)
            for offset, ref_base in enumerate(rec.REF):
                print_snp(sys.stdout, rec.CHROM, rec.POS + offset, ref_base,
                          alt_seq[offset], rec.QUAL, "mnp")
def main_brangus(unzip_snps=None):
    """Evaluate IsoPhase SNP calls of every locus directory for Brangus.

    Loads (unless supplied) the SNPs of ``Brangus.unzip.vcf``, then runs
    ``eval_isophase`` for every ``by_loci/*size*/`` directory containing
    ``phased.partial.vcf`` and writes the tab-separated result to
    ``evaled.isophase.txt``.

    Args:
        unzip_snps: optional preloaded mapping chrom -> pos -> VCF record.
    """
    if unzip_snps is None:
        unzip_snps = defaultdict(lambda : {})
        # ``with`` closes the input even if parsing fails halfway
        with open('Brangus.unzip.vcf') as vcf_handle:
            for r in vcf.VCFReader(vcf_handle):
                unzip_snps[r.CHROM][r.POS] = r

    sys.stderr.write('Finished reading Brangus.unzip.vcf.\n')
    dirs = glob.glob('by_loci/*size*/')

    # the report is closed even when one of the loci raises
    with open('evaled.isophase.txt', 'w') as out_f:
        out_f.write('dir\tchrom\tpos\tref\talt_g\talt_i\tin_g\tin_i\n')
        for d1 in dirs:
            mpileup = os.path.join(d1, 'ccs.mpileup')
            mapfile = os.path.join(d1, 'fake.mapping.txt')
            vcffile = os.path.join(d1, 'phased.partial.vcf')
            nosnp = os.path.join(d1, 'phased.partial.NO_SNPS_FOUND')
            if not os.path.exists(vcffile):
                # a locus without a VCF must carry the NO_SNPS_FOUND marker
                assert os.path.exists(nosnp)
                sys.stderr.write(
                    ('Skipping {0} because no SNPs found.').format(d1) + '\n')
            else:
                sys.stderr.write(('Evaluating {0}.').format(d1) + '\n')
                good_positions = get_positions_to_recover(
                    mapfile, mpileup, unzip_snps, min_cov=40)
                name = d1.split('/')[1]
                eval_isophase(vcffile, unzip_snps, good_positions, out_f,
                              name)
Пример #8
0
def eval_isophase(isophase_vcf, genome_snp, good_positions, out_f, name='NA'):
    """Compare IsoPhase SNP calls with genome SNPs and write one row per site.

    Every record of ``isophase_vcf`` produces a row with ``in_i`` = ``Y``;
    ``in_g`` is ``Y`` when ``(chrom, pos)`` is in ``good_positions`` (the
    genome ALT is then reported) and ``N`` otherwise. Positions left in
    ``good_positions`` afterwards are written as genome-only rows.

    Args:
        isophase_vcf: path of the IsoPhase VCF.
        genome_snp: mapping chrom -> pos -> VCF record.
        good_positions: collection of ``(chrom, pos)``; positions matched by
            the VCF are removed from it in place.
        out_f: writable text handle receiving the tab-separated rows.
        name: value of the first column of every row.
    """
    row_template = (
        '{name}\t{chrom}\t{pos}\t{ref}\t{alt_g}\t{alt_i}\t{in_g}\t{in_i}\n')
    # ``with`` closes the VCF handle, which used to be left open
    with open(isophase_vcf) as vcf_handle:
        for r in vcf.VCFReader(vcf_handle):
            # only the part before the first '|' names the chromosome
            r.CHROM = r.CHROM.split('|')[0]
            if (r.CHROM, r.POS) not in good_positions:
                alt_g = 'NA'
                in_g = 'N'
            else:
                alt_g = genome_snp[r.CHROM][r.POS].ALT[0]
                in_g = 'Y'
                # consumed: whatever remains is reported as genome-only
                good_positions.remove((r.CHROM, r.POS))
            out_f.write(row_template.format(name=name,
                                            chrom=r.CHROM,
                                            pos=r.POS,
                                            ref=r.REF,
                                            alt_g=alt_g,
                                            alt_i=r.ALT[0],
                                            in_g=in_g,
                                            in_i='Y'))

    for chrom, pos in good_positions:
        r = genome_snp[chrom][pos]
        out_f.write(
            ('{name}\t{chrom}\t{pos}\t{ref}\t{alt_g}\tNA\tY\tN\n').format(
                name=name, chrom=chrom, pos=pos, ref=r.REF, alt_g=r.ALT[0]))
Пример #9
0
def _add_reject_flag(in_file, config):
    """Mark every record whose INFO ``SS`` value is not "2" as ``REJECT``.

    The rewritten VCF is compressed and indexed, the input is moved to
    ``<in_file>.orig`` and the new file takes the place of ``in_file``.
    Nothing happens when ``in_file`` does not exist.
    """
    RejectFilter = namedtuple('Filter', ['id', 'desc'])
    reject = RejectFilter(id='REJECT',
                          desc='Rejected as non-SOMATIC or by quality')
    # NOTE: PyVCF will write an uncompressed VCF
    base, _ = utils.splitext_plus(in_file)
    out_file = "{0}-{1}{2}".format(base, "rejectfix", ".vcf")

    if not utils.file_exists(in_file):
        return

    vcf_reader = vcf.VCFReader(filename=in_file)
    # declare the filter in the header that the writer copies
    vcf_reader.filters["REJECT"] = reject
    with file_transaction(config, out_file) as tx_out_file:
        with open(tx_out_file, "wb") as handle:
            vcf_writer = vcf.VCFWriter(handle, template=vcf_reader)
            for rec in vcf_reader:
                # VarScan encodes SS as a string
                # TODO: Set it as integer when cleaning
                if "SS" in rec.INFO and rec.INFO["SS"] != "2":
                    rec.add_filter("REJECT")
                vcf_writer.write_record(rec)

    # Re-compress the file, then swap it in for the original
    out_file = bgzip_and_index(out_file, config)
    move_vcf(in_file, "{0}.orig".format(in_file))
    move_vcf(out_file, in_file)
    # leave a note at the old location of the compressed output
    with open(out_file, "w") as note_handle:
        note_handle.write("Moved to {0}".format(in_file))
Пример #10
0
def fp_validation(vcf_fn):
    """Append one validation row per record of ``vcf_fn`` to ``rows``.

    Each row holds the record's location together with the counts returned
    by ``validate_variant`` for ``pacbio_bam`` and ``ts_bam`` and by
    ``get_phased_counts_variant`` for ``crg_bam``. ``rows`` and the BAM /
    reference objects are names defined outside this function.

    Args:
        vcf_fn: path of the VCF; the third-from-last and last path
            components form the ``vcf.ID`` value of every row.
    """
    path = vcf_fn.split(os.sep)
    vcf_id = path[-3] + ":" + path[-1]

    # ``with`` closes the VCF handle, which used to be left open
    with open(vcf_fn) as vcf_handle:
        for record in vcf.VCFReader(vcf_handle):
            (pb_ao, pb_cov) = validate_variant(record, pacbio_bam,
                                               reference_pyfasta)
            (ts_ao, ts_cov) = validate_variant(record, ts_bam,
                                               reference_pyfasta)
            (unphased, hap_1, hap_2) = get_phased_counts_variant(
                record, crg_bam, reference_pyfasta)

            # see tenkit/bio_io.py for some examples of getting stuff out of vcfs
            rows.append({
                'vcf.ID': vcf_id,
                'chrom': record.CHROM,
                'pos': record.POS,
                'PacBio_ALT': pb_ao,
                'PacBio_COV': pb_cov,
                'TruSeq_ALT': ts_ao,
                'TruSeq_COV': ts_cov,
                'Unphased_ALT': unphased[0],
                'Unphased_COV': unphased[1],
                'Hap1_ALT': hap_1[0],
                'Hap1_COV': hap_1[1],
                'Hap2_ALT': hap_2[0],
                'Hap2_COV': hap_2[1],
            })
def eval_isophase_phaseswitch(isophase_vcf, config_file, out_f, name='NA'):
    """Count genotype changes between consecutive SNPs and write one line.

    The genotype string of every sample is remembered from the first record.
    For each later record, a sample whose genotype is phased (contains
    ``|``) with two different alleles counts as a switch when the genotype
    differs from the one remembered for that sample, and then becomes the
    remembered genotype.

    Args:
        isophase_vcf: path of the IsoPhase VCF; must hold at least one record.
        config_file: path handed to ``read_config`` for chrom/start/end/strand.
        out_f: writable text handle receiving the summary line.
        name: value of the first column of the summary line.

    Raises:
        StopIteration: when the VCF contains no record.
    """
    _chr, _start, _end, _strand = read_config(config_file)

    # ``with`` closes the VCF handle, which used to be left open
    with open(isophase_vcf) as vcf_handle:
        reader = vcf.VCFReader(vcf_handle)
        # record the first SNP for each isoform
        # built-in next() works on Python 2 and 3; reader.next() is Py2-only
        r = next(reader)
        prev = {}  # sample -> CallData.GT (ex: '0|1')
        for c in r.samples:
            prev[c.sample] = c.data.GT

        num_switch = 0

        for r in reader:
            for c in r.samples:
                gt = c.data.GT
                if gt.find('|') == -1:
                    continue  # ignore those with just one allele
                a, b = gt.split('|')
                if a == b:
                    continue  # for now, ignore IsoPhase results that only uses one allele
                if prev[c.sample] != gt:
                    num_switch += 1
                prev[c.sample] = gt

    # ``r`` is the last record read (the first one when there was only one)
    out_f.write("{name}\t{chrom}\t{start}\t{end}\t{strand}\t{num_iso}\t{num_switch}\n".format(
        name=name, chrom=_chr, start=_start, end=_end, strand=_strand,
        num_iso=len(r.samples), num_switch=num_switch))
def main_brangus(vcf_filename, out_filename, unzip_snps=None):
    """Evaluate IsoPhase SNP calls of every locus directory against a VCF.

    Loads (unless supplied) the SNPs of ``vcf_filename``, then runs
    ``eval_isophase`` for every ``by_loci/*size*/`` directory containing
    ``phased.partial.vcf`` and writes the tab-separated table (with header)
    to ``out_filename``.

    Args:
        vcf_filename: path of the genome SNP VCF.
        out_filename: path of the report to write.
        unzip_snps: optional preloaded mapping chrom -> pos -> VCF record.
    """
    if unzip_snps is None:
        unzip_snps = defaultdict(lambda : {})
        # ``with`` closes the input even if parsing fails halfway
        with open(vcf_filename) as vcf_handle:
            for r in vcf.VCFReader(vcf_handle):
                unzip_snps[r.CHROM][r.POS] = r

    print('Finished reading ' + vcf_filename, file=sys.stderr)
    FIELDS = ['dir', 'chrom', 'pos', 'strand', 'ref', 'alt_Short', 'alt_PB', 'in_Short', 'in_PB', 'cov_Short', 'cov_PB', 'genomic_HP']
    dirs = glob.glob('by_loci/*size*/')

    # the report is closed even when one of the loci raises
    with open(out_filename, 'w') as out_f:
        writer = DictWriter(out_f, FIELDS, delimiter='\t')
        writer.writeheader()
        for d1 in dirs:
            mpileup = os.path.join(d1, 'ccs.mpileup')
            mapfile = os.path.join(d1, 'fake.mapping.txt')
            vcffile = os.path.join(d1, 'phased.partial.vcf')
            config = os.path.join(d1, 'config')
            nosnp = os.path.join(d1, 'phased.partial.NO_SNPS_FOUND')
            if not os.path.exists(vcffile):
                # a locus without a VCF must carry the NO_SNPS_FOUND marker
                assert os.path.exists(nosnp)
                print(('Skipping {0} because no SNPs found.').format(d1), file=sys.stderr)
                continue

            print(('Evaluating {0}.').format(d1), file=sys.stderr)
            strand = 'NA'
            if os.path.exists(config):  # find the strand this gene family is on
                # one config per locus: close each instead of leaking it
                with open(config) as config_handle:
                    for line in config_handle:
                        if line.startswith('ref_strand='):
                            # the last matching line wins
                            strand = line.strip().split('=')[1]
            good_positions, cov_at_pos = get_positions_to_recover(mapfile, mpileup, unzip_snps, min_cov=30)
            name = d1.split('/')[1]
            eval_isophase(vcffile, unzip_snps, good_positions, cov_at_pos, {}, {}, writer, name, strand)
Пример #13
0
def detect_vcf_annotation(filepath):
    """Return the name of the annotation parser to be used on the given file

    Called: In the importer and in the project wizard to display the detected
    annotations.

    The decision is taken from the VCF header only: ``"vep"`` needs a ``VEP``
    metadata entry and a ``CSQ`` INFO field, ``"snpeff"`` needs a
    ``SnpEffVersion`` metadata entry and an ``ANN`` INFO field.

    :return: "vep", "snpeff", None
    """
    # Open .gz files in binary mode (See #84)
    mode = "rb" if cm.is_gz_file(filepath) else "r"

    # ``with`` closes the file on every path; previously it stayed open
    # whenever no annotation was detected.
    with open(filepath, mode) as device:
        std_reader = vcf.VCFReader(device)

        if "VEP" in std_reader.metadata and "CSQ" in std_reader.infos:
            return "vep"

        if ("SnpEffVersion" in std_reader.metadata
                and "ANN" in std_reader.infos):
            return "snpeff"

    return None
Пример #14
0
def vcf2csv(vcffile, csvfile):
    """Write one CSV row per VCF record.

    Every record must have exactly one ALT allele. The row holds CHROM, POS,
    REF, ALT, QUAL and the INFO values ``AF``, ``DP`` and ``DP4``; indels are
    typed ``INDEL`` with their ``HRUN`` value, everything else ``SNV`` with
    ``None``. ``csvfile`` may be ``"-"`` to write to ``/dev/stdout``.
    """
    reader = vcf.VCFReader(filename=vcffile)
    columns = ['Seq', 'Pos', 'Ref.Base', 'Var.Base', 'Qual', 'Allele Freq', 'Type', 'Homopolymer Length', 'Depth', 'Depth: ref fw and rev, var fw and rev']
    out_path = '/dev/stdout' if csvfile == "-" else csvfile
    with open(out_path, 'wb') as out_handle:
        writer = csv.DictWriter(out_handle, fieldnames=columns)
        writer.writeheader()
        for rec in reader:
            assert len(rec.ALT) == 1
            values = [rec.CHROM, rec.POS, rec.REF, rec.ALT[0], rec.QUAL, rec.INFO['AF']]
            # the homopolymer length is only reported for indels
            values += ["INDEL", rec.INFO['HRUN']] if rec.is_indel else ["SNV", "None"]
            values += [rec.INFO['DP'], ','.join(str(x) for x in rec.INFO['DP4'])]
            writer.writerow(dict(zip(columns, values)))
Пример #15
0
def parse_vep_annotations_from_vcf(vcf_file_obj):
    """
    Iterate through the variants in a VEP annotated VCF, pull out annotation from CSQ field

    The CSQ sub-field names are taken from the part of the CSQ header
    description following "Format: " and are lower-cased. For each record,
    every CSQ entry becomes a dict of sub-field name -> value, extended with
    ``is_nmd`` / ``is_nc`` flags and with ``consequence`` reduced by
    ``get_worst_vep_annotation``.

    Args:
        vcf_file_obj: open file object of the VCF.

    Yields:
        ``(variant_obj, vep_annotations)`` for every object returned by
        ``vcf_stuff.get_variants_from_vcf_fields`` for a record.

    Raises:
        ValueError: when the header has no CSQ field, or a CSQ entry does not
            have as many values as the header declares.
    """

    r = vcf.VCFReader(vcf_file_obj)
    if "CSQ" not in r.infos:
        raise ValueError("CSQ field not found in %s header" % vcf_file_obj)
    # Must be a real list: it is measured with len() and zipped once per CSQ
    # entry, which a Python 3 map() iterator cannot support.
    csq_field_names = [
        field_name.lower()
        for field_name in r.infos["CSQ"].desc.split("Format: ")[1].split("|")
    ]

    for vcf_row in r:
        vep_annotations = []
        for i, per_transcript_csq_string in enumerate(vcf_row.INFO["CSQ"]):
            csq_values = per_transcript_csq_string.split('|')

            # sanity-check the csq_values
            if len(csq_values) != len(csq_field_names):
                raise ValueError("CSQ per-transcript string %s contains %s values instead of %s:\n%s" % (
                    i, len(csq_values), len(csq_field_names), per_transcript_csq_string))

            vep_annotation = dict(zip(csq_field_names, csq_values))
            vep_annotation['is_nmd'] = "NMD_transcript_variant" in csq_values
            # 2 kinds of 'nc_transcript_variant' label due to name change in Ensembl v77
            vep_annotation['is_nc'] = "nc_transcript_variant" in csq_values or "non_coding_transcript_variant" in csq_values

            variant_consequence_strings = vep_annotation["consequence"].split("&")
            vep_annotation["consequence"] = get_worst_vep_annotation(variant_consequence_strings)
            vep_annotations.append(vep_annotation)

        vcf_fields = [vcf_row.CHROM, vcf_row.POS, vcf_row.ID, vcf_row.REF, ",".join(map(str, vcf_row.ALT))]
        variant_objects = vcf_stuff.get_variants_from_vcf_fields(vcf_fields)
        for variant_obj in variant_objects:
            yield variant_obj, vep_annotations
def load_data(file_name):
    """Return the INFO ``TYPE`` value of every record of a VCF file.

    Args:
        file_name: path of the VCF to read.

    Returns:
        A list with one ``INFO['TYPE']`` value per record, in file order.
    """
    # The INFO column is read by name. The former ``list(record)[7]`` only
    # reached INFO when records are plain tuples of the VCF columns
    # (NOTE(review): presumably true for the PyVCF version this was written
    # against -- confirm); ``record.INFO`` does not depend on that.
    with open(file_name, 'rb') as handle:
        return [record.INFO['TYPE'] for record in vcf.VCFReader(handle)]
Пример #17
0
def parse_vcf(vcf_file_name):
    """Load the genotype types of a VCF into a numeric matrix.

    Args:
        vcf_file_name: path of the VCF to read.

    Returns:
        A tuple ``(matrix, samples_list)``. ``matrix`` is a float32 numpy
        array with one row per record: the position followed by the
        ``gt_type`` of every call, where 100 stands for a call whose
        ``gt_type`` is ``None``. ``samples_list`` holds the sample names
        taken from the calls of the first record.
    """
    import vcf
    import numpy as np
    vcf_sample_matrix = []
    samples_list = []
    # ``with`` closes the VCF handle, which used to be left open
    with open(vcf_file_name, 'rb') as handle:
        for record_index, record in enumerate(vcf.VCFReader(handle)):
            if record_index == 0:
                # sample names are only collected once, from the first record
                samples_list = [call.sample for call in record.samples]
            row = [record.POS]
            for call in record.samples:
                row.append(100 if call.gt_type is None else call.gt_type)
            vcf_sample_matrix.append(row)
    return (np.asarray(vcf_sample_matrix, dtype=np.float32), samples_list)
Пример #18
0
def juliet_json_to_vcf(json_filename,
                       vcf_filename,
                       gene_pos_info,
                       ref_name='NC_045512v2',
                       sample_name='UnknownSample'):
    """Convert the variant codons of a juliet JSON report into a VCF.

    For every variant codon, one VCF record is written per codon position
    whose base differs from the reference codon. The record position is
    ``gene_pos_info[gene name] + 3 * (ref_position - 1) + codon offset``,
    its INFO carries ``AF`` (frequency, six decimals) and ``DP`` (coverage),
    and the single sample gets the genotype ``0|1``.

    Args:
        json_filename: path of the JSON report to read.
        vcf_filename: path of the VCF to write.
        gene_pos_info: mapping gene name -> start position of the gene.
        ref_name: CHROM value of every record.
        sample_name: name of the single sample column.

    Side effect: the VCF header template is (re)written to ``template.vcf``
    in the current working directory.
    """
    with open('template.vcf', 'w') as f:
        f.write(__VCF_EXAMPLE__ + '\n')

    # the call-data type is the same for every record: build it once
    samp_ft = vcf.model.make_calldata_tuple(['GT'])

    # Keep the template open while the writer exists, and close it after.
    with open('template.vcf') as template_handle:
        reader = vcf.VCFReader(template_handle)
        reader.samples = [sample_name]

        f_vcf = vcf.Writer(open(vcf_filename, 'w'), reader)
        try:
            with open(json_filename) as h:
                data = json.load(h)

            for g in data['genes']:
                for v in g['variant_positions']:
                    cov = v['coverage']
                    ref_codon = v['ref_codon']
                    abs_pos = (gene_pos_info[g['name']] +
                               3 * (v['ref_position'] - 1))
                    ind = 0  # running index of variant codons at this site
                    for vac in v['variant_amino_acids']:
                        for cur in vac['variant_codons']:
                            ind += 1
                            _id = "{g}.{r}.{ind}".format(g=g['name'],
                                                         r=v['ref_position'],
                                                         ind=ind)
                            codon = cur['codon']
                            freq = "{0:.6f}".format(cur['frequency'])

                            for codon_offset in range(3):
                                if (ref_codon[codon_offset] ==
                                        codon[codon_offset]):
                                    continue  # base unchanged: no record
                                rec = vcf.model._Record(
                                    CHROM=ref_name,
                                    POS=abs_pos + codon_offset,
                                    ID=_id,
                                    REF=ref_codon[codon_offset],
                                    ALT=[
                                        vcf.model._Substitution(
                                            codon[codon_offset])
                                    ],
                                    QUAL='.',
                                    FILTER='PASS',
                                    INFO={
                                        'AF': freq,
                                        'DP': cov
                                    },
                                    FORMAT="GT",
                                    sample_indexes=None)
                                rec.samples.append(
                                    vcf.model._Call(rec, sample_name,
                                                    samp_ft(*["0|1"])))
                                f_vcf.write_record(rec)
        finally:
            # flush and close the output even when the conversion fails
            f_vcf.close()
Пример #19
0
    def replaceFunc(fn, sourcevcfpaths, sourceext, outpath, outext):
        """Replace the QUAL column of a VCF with the QUAL of a source VCF.

        For every data line of ``fn`` the source VCFs are searched, in the
        order of ``sourcevcfpaths``, for a record with the same CHROM and POS
        and at least one ALT allele in common. The QUAL of the first match
        replaces the line's QUAL and ``;qualSource=<source name>`` is
        appended to its INFO column. The result, with an extra ``qualSource``
        INFO header line, is written to ``outpath + sample + outext``.

        Args:
            fn: path of the VCF to rewrite; also passed to ``getSampleName``.
            sourcevcfpaths: list of source directories; modified in place so
                that every entry ends with ``/``.
            sourceext: suffix appended to ``<directory><sample>`` to form the
                path of each source VCF.
            outpath: prefix of the output path.
            outext: suffix of the output path.

        The process is terminated through ``exit()`` as soon as a data line
        has no match in any source.
        """
        # make every source directory end with '/' (mutates the caller's list)
        for i in range(len(sourcevcfpaths)):
            if sourcevcfpaths[i][-1] != '/':
                sourcevcfpaths[i] = sourcevcfpaths[i] + '/'
        # a source is named after the last component of its directory
        sourcename = map(lambda (l): l.strip('/').split('/')[-1],
                         sourcevcfpaths)

        # source name -> list of [CHROM, POS as text, ALT, QUAL as text]
        sourcedata = {}
        sample = getSampleName(fn)
        for i in range(len(sourcevcfpaths)):
            vcffile = vcf.VCFReader(
                open(sourcevcfpaths[i] + sample + sourceext))
            sourcedata[sourcename[i]] = []
            for r in vcffile:
                sourcedata[sourcename[i]].append(
                    [r.CHROM, '%s' % r.POS, r.ALT,
                     '%s' % r.QUAL])
        # split the input into header lines ('#...') and data lines
        txt = open(fn).readlines()
        header = filter(lambda (l): l[0] == '#', txt)
        txt = filter(lambda (l): l[0] != '#', txt)

        for li in range(len(txt)):
            record = txt[li].split('\t')
            found = False
            # list of [old text, new text] replacements for this line
            repstr = []
            for s in sourcename:
                # same CHROM, same POS and at least one shared ALT allele
                # NOTE(review): the intersection compares the reader's ALT
                # objects with plain strings -- assumes they hash and compare
                # equal; confirm against the PyVCF version in use
                match = filter(
                    lambda
                    (l): l[0] == record[0] and l[1] == record[1] and len(
                        set(tuple(l[2])).intersection(set(record[4].split(','))
                                                      )) > 0, sourcedata[s])
                if len(match) > 0:
                    found = True
                    # A = the first five columns, each followed by a tab
                    A = ''
                    for fieldidx in range(5):
                        A = A + record[fieldidx] + '\t'

                    # swap the QUAL that follows the five leading columns
                    repstr.append([A + record[5], A + match[0][3]])
                    # tag the INFO column with the source of the new QUAL
                    repstr.append(
                        [record[7], record[7] + ';qualSource=%s' % s])
                    # only the first source with a match is used
                    break
            if not found:
                print 'Error, not found record in', fn, 'record', record
                exit()
            # str.replace substitutes every occurrence of A in the line
            for A, B in repstr:
                txt[li] = txt[li].replace(A, B)

        addHeader = False
        fout = open(outpath + sample + outext, 'w')
        for l in header:
            # declare qualSource right before the first INFO header line
            if '##INFO=<ID=' in l and not addHeader:
                fout.write(
                    '##INFO=<ID=qualSource,Number=1,Type=String,Description="QUAL Source">\n'
                )
                addHeader = True
            fout.write('%s' % l)
        for l in txt:
            fout.write('%s' % l)
        fout.close()
Пример #20
0
    def run(self):
        """Add an ``rna_af`` column to ``maf`` and write it out as CSV.

        All records of the input VCF are loaded first; ``get_af`` is then
        called with each ``genome_change`` value and that record list.
        ``maf`` is not defined inside this method.
        """
        with self.input().open() as vcf_handle:
            variants = [record for record in vcf.VCFReader(vcf_handle)]

        def rna_af_for(genome_change):
            return get_af(genome_change, variants)

        maf['rna_af'] = maf['genome_change'].map(rna_af_for)

        with self.output().open('w') as csv_handle:
            maf.to_csv(csv_handle, index=False)
Пример #21
0
def load_project_variants_from_vcf(project_id,
                                   vcf_files,
                                   mark_as_loaded=True,
                                   start_from_chrom=None,
                                   end_with_chrom=None):
    """
    Load the families of this project that belong to the given VCF files

    Every existing file of ``vcf_files`` is registered with the annotator
    first; then the families of each of these files are loaded in batches of
    ``settings.FAMILY_LOAD_BATCH_SIZE``.

    Args:
       project_id: the project id as a string
       vcf_files: a list of one or more vcf file paths
       mark_as_loaded: passed on to ``load_variants_for_family_list``
       start_from_chrom: passed on to the annotator and the family loader
       end_with_chrom: passed on to the annotator and the family loader

    Raises:
       ValueError: when an existing VCF has no CSQ field in its header
    """
    # Only the timestamp goes through strftime(): putting project_id into
    # the format string would mangle any '%' it contains.
    timestamp_format = "%m/%d/%Y %H:%M:%S"

    print("Called load_project_variants_from_vcf on " + str(vcf_files))
    print(datetime.now().strftime(timestamp_format) +
          "  -- loading project: " + project_id + " - db.variants cache")
    project = Project.objects.get(project_id=project_id)

    for vcf_file in vcf_files:
        if not os.path.isfile(vcf_file):
            print("Skipping " + vcf_file)
            continue
        r = vcf.VCFReader(filename=vcf_file)
        if "CSQ" not in r.infos:
            raise ValueError("VEP annotations not found in VCF: " + vcf_file)

        # vcf_file comes from vcf_files, so no membership test is needed
        mall.get_annotator().add_preannotated_vcf_file(
            vcf_file,
            start_from_chrom=start_from_chrom,
            end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    families_by_vcf = project.families_by_vcf()
    print("project.families_by_vcf(): " + str(families_by_vcf))
    for vcf_file, families in families_by_vcf.items():
        if vcf_file not in vcf_files:
            print("Skipping %(vcf_file)s since its not in %(vcf_files)s" %
                  locals())
            continue

        print("Loading families for VCF file: " + vcf_file)
        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for i in range(0, len(families), batch_size):
            load_variants_for_family_list(
                project,
                families[i:i + batch_size],
                vcf_file,
                mark_as_loaded=mark_as_loaded,
                start_from_chrom=start_from_chrom,
                end_with_chrom=end_with_chrom)
            # printed after every batch
            print(datetime.now().strftime(timestamp_format) +
                  "  -- finished loading project: " + project_id)
Пример #22
0
def load_project_variants(project_id,
                          force_load_annotations=False,
                          force_load_variants=False,
                          ignore_csq_in_vcf=False,
                          start_from_chrom=None,
                          end_with_chrom=None):
    """
    Load any families and cohorts in this project that aren't loaded already

    Every existing VCF of the project (sorted by path) is registered with
    the annotator; then the families of each VCF are loaded in batches of
    ``settings.FAMILY_LOAD_BATCH_SIZE``, and finally the cohorts.

    Args:
       project_id: the project id as a string
       force_load_annotations: passed to the annotator as ``force``
       force_load_variants: when true, families whose status is already
           'loaded' are loaded again instead of being skipped
       ignore_csq_in_vcf: when true, a VCF without CSQ header is accepted
       start_from_chrom: passed on to the annotator and the family loader
       end_with_chrom: passed on to the annotator and the family loader

    Raises:
       ValueError: when a VCF has no CSQ field in its header and
           ``ignore_csq_in_vcf`` is false
    """
    # Only the timestamp goes through strftime(): putting project_id into
    # the format string would mangle any '%' it contains.
    timestamp_format = "%m/%d/%Y %H:%M:%S"

    print("Loading project %s" % project_id)
    print(datetime.now().strftime(timestamp_format) +
          "  -- loading project: " + project_id + " - db.variants cache")
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in sorted(project.get_all_vcf_files(), key=lambda v: v.path()):
        vcf_path = vcf_obj.path()
        if not os.path.isfile(vcf_path):
            print("Skipping " + vcf_path)
            continue

        r = vcf.VCFReader(filename=vcf_path)
        if not ignore_csq_in_vcf and "CSQ" not in r.infos:
            raise ValueError("VEP annotations not found in VCF: " + vcf_path)

        mall.get_annotator().add_preannotated_vcf_file(
            vcf_path,
            force=force_load_annotations,
            start_from_chrom=start_from_chrom,
            end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    batch_size = settings.FAMILY_LOAD_BATCH_SIZE
    for vcf_file, families in project.families_by_vcf().items():
        if not force_load_variants:
            # filter out families that have already finished loading
            families = [
                f for f in families
                if get_mall(project.project_id).variant_store.
                get_family_status(project_id, f.family_id) != 'loaded'
            ]

        for i in range(0, len(families), batch_size):
            batch = families[i:i + batch_size]
            print(datetime.now().strftime(timestamp_format) +
                  "  -- loading project: " + project_id +
                  " - families batch %d - %d families" % (i, len(batch)))
            load_variants_for_family_list(
                project,
                batch,
                vcf_file,
                start_from_chrom=start_from_chrom,
                end_with_chrom=end_with_chrom)

    # now load cohorts
    load_cohorts(project_id)
def eval_isophase(isophase_vcf, genome_snp, good_positions, cov_at_pos, repeat_by_chrom, shortread_cov, writer_f, name='NA', strand='NA'):
    """Compare IsoPhase SNP calls with short-read SNPs, one row per site.

    Every record of ``isophase_vcf`` produces a row with ``in_PB`` = ``Y``;
    ``in_Short`` is ``Y`` when ``(chrom, pos)`` is in ``good_positions``
    (the short-read ALT is then reported) and ``N`` otherwise. Positions
    left in ``good_positions`` afterwards are written as short-read-only
    rows.

    Args:
        isophase_vcf: path of the IsoPhase VCF.
        genome_snp: mapping chrom -> pos -> VCF record.
        good_positions: collection of ``(chrom, pos)``; positions matched by
            the VCF are removed from it in place.
        cov_at_pos: mapping ``(chrom, pos - 1)`` -> value for ``cov_PB``.
        repeat_by_chrom: mapping chrom -> object whose ``find(pos, pos)``
            returns the repeats overlapping ``pos``.
        shortread_cov: mapping chrom -> pos -> value for ``cov_Short``;
            missing entries are reported as 0.
        writer_f: object whose ``writerow`` accepts the row dict.
        name: value of the ``dir`` column.
        strand: value of the ``strand`` column.
    """

    def genomic_hp(chrom, pos):
        # 'Y' when the position overlaps at least one known repeat
        if chrom in repeat_by_chrom and len(repeat_by_chrom[chrom].find(pos, pos)) > 0:
            return 'Y'
        return 'N'

    def short_cov(chrom, pos):
        # positions absent from the short-read coverage table count as 0
        try:
            return shortread_cov[chrom][pos]
        except KeyError:
            return 0

    # ``with`` closes the VCF handle, which used to be left open
    with open(isophase_vcf) as vcf_handle:
        for r in vcf.VCFReader(vcf_handle):
            # only the part before the first '|' names the chromosome
            r.CHROM = r.CHROM.split('|')[0]
            out = {'dir': name,
                   'chrom': r.CHROM,
                   'pos': r.POS,
                   'strand': strand,
                   'ref': r.REF,
                   'alt_Short': 'NA',
                   'alt_PB': r.ALT[0],
                   'in_Short': 'N',
                   'in_PB': 'Y',
                   'cov_Short': 'NA',
                   'cov_PB': 'NA',
                   'genomic_HP': genomic_hp(r.CHROM, r.POS)}
            out['cov_Short'] = short_cov(r.CHROM, r.POS)
            out['cov_PB'] = cov_at_pos[r.CHROM, r.POS - 1]
            if (r.CHROM, r.POS) in good_positions:
                out['alt_Short'] = genome_snp[r.CHROM][r.POS].ALT[0]
                out['in_Short'] = 'Y'
                # consumed: whatever remains is reported as short-read only
                good_positions.remove((r.CHROM, r.POS))
            writer_f.writerow(out)

    # now we write out everything that is only in Shortread
    for chrom, pos in good_positions:
        out = {'dir': name,
               'chrom': chrom,
               'pos': pos,
               'strand': strand,
               'ref': genome_snp[chrom][pos].REF,
               'alt_Short': genome_snp[chrom][pos].ALT[0],
               'alt_PB': 'NA',
               'in_Short': 'Y',
               'in_PB': 'N',
               'cov_Short': 'NA',
               'cov_PB': cov_at_pos[chrom, pos - 1],
               'genomic_HP': genomic_hp(chrom, pos)}
        out['cov_Short'] = short_cov(chrom, pos)
        writer_f.writerow(out)
Пример #24
0
def parse_vcf_get_pairwise_n_snp(vcf_file_name, core_list=False):
    """Tally, per sample, how many VCF calls match or differ from the reference.

    Every record is echoed to stdout as it is read (zero-based record index,
    type of ``POS``, the record itself) and one summary line per sample is
    printed at the end.  Nothing is returned.

    Each call is classified by its PyVCF ``gt_type``:

    * ``None``      -> counted under "NA"
    * ``0``         -> counted under "identical"
    * anything else -> counted under "diff", and the record position is
      appended to that sample's "snp" list

    Args:
        vcf_file_name: Path of the VCF file to read.
        core_list: Optional collection of positions.  When truthy, only
            records whose ``POS`` is in it are tallied; when falsy (the
            default) every record is tallied.
    """
    import vcf

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(core_list)

    # Build the lookup once: membership is tested for every record, and a
    # plain list would turn each test into a linear scan.
    core_positions = frozenset(core_list) if core_list else None

    nsnp = {}
    # NOTE(review): binary mode is kept from the original; PyVCF on Python 3
    # appears to expect a text-mode handle for uncompressed files -- confirm.
    # The context manager closes the handle even if parsing raises.
    with open(vcf_file_name, 'rb') as vcf_handle:
        for record_index, record in enumerate(vcf.VCFReader(vcf_handle)):
            print("%s %s %s" % (record_index, type(record.POS), record))
            tallied = (core_positions is None
                       or record.POS in core_positions)
            for call in record.samples:
                if record_index == 0:
                    # The first record introduces every sample, whether or
                    # not that record itself ends up being tallied.
                    nsnp[call.sample] = {
                        "identical": 0,
                        "diff": 0,
                        "NA": 0,
                        "snp": [],
                    }
                if not tallied:
                    continue
                counts = nsnp[call.sample]
                if call.gt_type is None:
                    counts["NA"] += 1
                elif call.gt_type == 0:
                    counts["identical"] += 1
                else:
                    counts["diff"] += 1
                    counts["snp"].append(record.POS)

    for sample in nsnp:
        counts = nsnp[sample]
        total = counts["NA"] + counts["identical"] + counts["diff"]
        print("%s NA: %s idem: %s diff: %s TOTAL: %s" %
              (sample, counts["NA"], counts["identical"], counts["diff"],
               total))
def vcf_to_stats(in_handle, target):
    """Build a pandas DataFrame of per-record statistics from a VCF handle.

    Only the first sample of each record is inspected.  The resulting
    columns are "target", "DP", "QUAL", "GT", "AD" and "QR_QA".

    Args:
        in_handle: Open handle passed straight to ``vcf.VCFReader``.
        target: Value stored under the "target" key.

    Returns:
        ``pd.DataFrame`` built from the collected columns.
    """
    stats = collections.defaultdict(list)
    for record in vcf.VCFReader(in_handle):
        first_sample = record.samples[0].data
        # Assigned inside the loop on purpose: with no records the dict
        # stays empty instead of holding a lone scalar.
        stats["target"] = target
        row = (
            ("DP", first_sample.DP),
            ("QUAL", record.QUAL),
            ("GT", first_sample.GT),
            ("AD", percent_ad_deviation(first_sample)),
            ("QR_QA", strand_bias(first_sample)),
        )
        for column, value in row:
            stats[column].append(value)
    return pd.DataFrame(stats)
Пример #26
0
 def load_gvcf_allele_dict(self, sample_name_fn=lambda x: x):
     """Collect the "POS:ALT" strings called for each sample in ``self.gvcf``.

     All records are read into memory first, then every sample call whose
     GT is not "." contributes one "<POS>:<ALT>" string.  On return,
     ``self.sample_variants`` is a plain dict mapping sample name to a set
     of those strings.

     Args:
         sample_name_fn: Callable applied to each sample name before it is
             used as a key; defaults to the identity function.
     """
     with open(self.gvcf) as gvcf_handle:
         records = list(tqdm(vcf.VCFReader(gvcf_handle)))

     for record in tqdm(records):
         for call in record.samples:
             genotype = call.data.GT
             if genotype == ".":
                 continue
             # presumably self.sample_variants is a defaultdict(list) so
             # unseen sample names get a fresh list -- verify in __init__
             bucket = self.sample_variants[sample_name_fn(call.sample)]
             # int() is applied to the raw GT string, so only single-allele
             # genotypes parse here.
             # NOTE(review): a GT of "0" yields index -1, i.e. the last ALT
             # entry -- confirm this is intended.
             allele = record.ALT[int(genotype) - 1]
             bucket.append(":".join((str(record.POS), str(allele))))

     self.sample_variants = {
         name: set(alleles)
         for name, alleles in self.sample_variants.items()
     }
Пример #27
0
def __main__():
    """Write one ``.snps`` genotype table per gene from a GFF and a VCF.

    Exactly seven command-line arguments are required (``sys.argv[1:]``):

    1. GFF file, parsed with ``gff_parse.gff_parse``
    2. VCF file
    3. scaffold number, used in the output file names
    4. distance, passed to ``gff_parse.gff_parse`` as an int
    5. output directory prefix (concatenated directly with the gene name)
    6. ``all`` or ``noncoding``
    7. annotation file, only read when argument 6 is ``noncoding``

    Each output file starts with the gene name followed by the
    tab-separated sample names, then one line per SNP: its position
    followed by the tab-separated genotype of every sample.
    """

    #check arguments
    if len(sys.argv) != 8:
        print(len(sys.argv))
        # The usage text now lists the seventh argument the check requires.
        print(
            'python make_snps_files.py [gff file] [vcf file] [scaf_number] [distance] [out directory] [all/noncoding] [annotation file]'
        )
        sys.exit()

    noncoding_only = sys.argv[6] == 'noncoding'

    # read in the gff
    parseOut = gff_parse.gff_parse(sys.argv[1], int(sys.argv[4]))
    gffList = parseOut['gffList']
    geneDict = parseOut['geneDict']

    #read in the annotation file
    if noncoding_only:
        # assumes makeAnnotDic fully consumes the handle before returning
        # -- TODO confirm
        with open(sys.argv[7], 'r') as annot:
            annotDic = makeAnnotDic(annot)

    # read in the vcf; the handle is closed once every record is consumed
    with open(sys.argv[2], 'r') as vcf_handle:
        vcf_reader = vcf.VCFReader(vcf_handle)
        nameList = vcf_reader.samples
        for record in vcf_reader:
            # NOTE(review): nameDict is not defined in this function; it
            # must come from module scope -- confirm it was not meant to
            # be nameList.
            entry = vcf_parse.vcfParse(record, nameDict)
            genes = gffList[entry['pos']]
            # see if it's in a gene!
            if len(genes) == 0:
                continue
            # is it in a coding site and are we making noncoding?
            if noncoding_only and annotDic[entry['pos']] == 1:
                continue
            #add the snp to each gene
            for gene in genes:
                geneDict[gene].append(entry)

    # write out; each file is closed (and therefore flushed) before the
    # next one is opened
    for gene in geneDict:
        out_path = sys.argv[5] + gene + ".scaf" + str(sys.argv[3]) + ".snps"
        with open(out_path, "w") as out:
            #write out first line of individual names
            out.write(gene)
            for ind in nameList:
                out.write("\t" + str(ind))

            #write out snp genotypes
            for snp in geneDict[gene]:
                out.write("\n" + str(snp['pos']))
                for ind in nameList:
                    out.write("\t" + str(snp["genotypes"][ind]))
Пример #28
0
def _23andme_exome(path):
    """Yield one ``SNP`` per sample call for every SNP record in a VCF file.

    Records for which ``is_snp`` is false are skipped.  The genotype passed
    to ``SNP`` is the call's ``gt_bases`` with every "/" removed.

    Args:
        path: Path of the VCF file to read.

    Yields:
        ``SNP`` objects built from the record ID, chromosome, position and
        the sample genotype.

    Raises:
        RuntimeError: If the module-level ``vcf`` name is ``None``.  As
            this is a generator, the error surfaces on first iteration.
    """
    if vcf is None:
        raise RuntimeError("PyVCF not available, please 'easy_install' it.")

    # Keep the handle in a context manager so it is closed when the
    # generator is exhausted or discarded, rather than leaked.
    with open(path, "r") as handle:
        for r in vcf.VCFReader(handle):
            if not r.is_snp:
                continue  # XXX Is it even possible?

            for sample in r.samples:
                yield SNP(name=r.ID, chromosome=r.CHROM, position=r.POS,
                          genotype=sample.gt_bases.replace("/", ""))
def read_vcf_metrics(in_handle,
                     metrics,
                     format_metrics,
                     target,
                     use_subset=False):
    """Collect per-record metrics from a VCF handle into a pandas DataFrame.

    Only the first sample of each record is inspected.

    Args:
        in_handle: Open handle passed straight to ``vcf.VCFReader``.
        metrics: List of INFO keys; each becomes a column filled with
            ``rec.INFO.get(key, None)``.
        format_metrics: List of additional column names to create.  The
            loop appends to a "DP" column, so "DP" has to be present in
            ``metrics + format_metrics``.
        target: Value stored under the "target" key once a record is seen.
        use_subset: When true, only the first 10000 records are read.

    Returns:
        ``pandas.DataFrame`` built from the collected columns.

    Raises:
        KeyError: If a genotype string is not in the zygosity table.
    """
    columns = {
        name: []
        for name in ("target", "indel", "zygosity", "QUAL", "AD", "PL")
    }
    zygosity_codes = {
        "0/0": 0,
        "0/1": 1,
        "0|1": 1,
        "1/1": 2,
        "2/1": 2,
        "1/2": 2,
        "0": 0,
        "1": 2
    }
    for name in metrics + format_metrics:
        columns[name] = []

    reader = vcf.VCFReader(in_handle)
    records = itertools.islice(reader, 10000) if use_subset else reader

    for record in records:
        call = record.samples[0].data
        columns["zygosity"].append(zygosity_codes[call.GT])
        for name in metrics:
            columns[name].append(record.INFO.get(name, None))
        # Replaces the initial empty list with the scalar target value.
        columns["target"] = target
        columns["AD"].append(_calc_ad(call))
        columns["PL"].append(_calc_pl(call))
        depth = _calc_dp(call)
        columns["DP"].append(depth)
        columns["QUAL"].append(record.QUAL)
        columns["indel"].append(int(record.is_indel))
    return pandas.DataFrame(columns)
Пример #30
0
def main():
    """Report, for each record of the 'FN' VCF, where else it was called.

    Opens the eight VCF files named on the command line, then prints a
    tab-separated table to stdout: a header line followed by one line per
    'FN' record.  Each of the remaining call sets gets a column holding
    either 0 (no record with the same first ALT allele was fetched at
    that position) or a "Q=..;SB=..;DP=..;AF=.." summary of the match.
    """
    args = cmdline_parser().parse_args()

    sources = [
        ('FN', args.vcf_fn),
        ('normal_rlx', args.vcf_nrlx),
        ('normal_str', args.vcf_nstr),
        ('tumor_rlx', args.vcf_trlx),
        ('tumor_str', args.vcf_tstr),
        ('somatic_raw', args.vcf_sraw),
        ('somatic_final', args.vcf_sfinal),
        ('somatic_final_minus_dbsnp', args.vcf_sfinal_wo_dbsnp),
    ]

    readers = {}
    for label, path in sources:
        try:
            readers[label] = vcf.VCFReader(filename=path)
        except BaseException:
            # Name the offending file, then let the original error propagate.
            sys.stderr.write("Reading %s failed\n" % path)
            raise

    sys.stderr.write("Analyzing FN %s and friends\n" % readers['FN'].filename)

    ORDER = ['normal_rlx', 'normal_str', 'tumor_rlx', 'tumor_str',
             'somatic_raw', 'somatic_final', 'somatic_final_minus_dbsnp']

    print("#CHROM\tPOS\tREF\tALT\t%s" % ('\t'.join(ORDER)))
    for fn in readers['FN']:
        # 0 means "not found" until a matching record says otherwise.
        present_in = dict.fromkeys(ORDER, 0)
        for k in ORDER:
            for t in readers[k].fetch(fn.CHROM, fn.POS-1, fn.POS):
                assert len(fn.REF) == len(t.REF)
                assert len(fn.ALT)==1
                assert len(t.ALT)==1
                if t.ALT[0] != fn.ALT[0]:
                    continue
                q = t.QUAL if t.QUAL else "."
                try:
                    present_in[k] = "Q=%s;SB=%s;DP=%d;AF=%f" % (q, t.INFO['SB'], t.INFO['DP'], t.INFO['AF'])
                except KeyError:
                    sys.stderr.write("Key Error. Dropping to debugger\n")
                    import pdb; pdb.set_trace()
                # Only the first record with a matching ALT is reported.
                break
        fields = [str(fn.CHROM), str(fn.POS), str(fn.REF), str(fn.ALT[0])]
        fields.append('\t'.join(str(present_in[k]) for k in ORDER))
        print('\t'.join(fields))