예제 #1
0
    def test_illumina_writer(self):
        """Check the Illumina-format lines written for a small indexed VCF.

        Two writers are run over the same records: one that is given the
        VCF path (so neighbouring SNVs can be rendered in the flanks) and one
        that is not. Only the output of the first writer is asserted.
        """
        # 01234
        # 1234567890
        # CCTGATTT-A
        # TAACGA
        #   -  C -A
        vcf_lines = (
            '##fileformat=VCFv4.1',
            '#CHROM POS ID REF ALT QUAL FILTER INFO',
            'ref 1 . C T 10 PASS .',
            'ref 2 . CT CA,C 10 PASS .',
            'ref 3 . T A 10 PASS .',
            'ref 4 . G C 10 PASS .',
            'ref 5 . A G 10 PASS .',
            'ref 6 . T A,C 10 PASS .',
            'ref 7 . TT T 10 PASS .',
            'ref 8 . T TA 10 PASS .',
            'ref 10 . A C 10 PASS .',
        )
        # The fixture is typed with spaces for readability; VCF needs tabs.
        vcf = ('\n'.join(vcf_lines) + '\n').replace(' ', '\t')

        vcf_fhand = NamedTemporaryFile(suffix='.vcf')
        vcf_fhand.write(vcf)
        vcf_fhand.flush()
        vcf_compressed = NamedTemporaryFile(suffix='.vcf.gz')
        compress_with_bgzip(vcf_fhand, vcf_compressed)
        index_vcf_with_tabix(vcf_compressed.name)

        ref_fhand = NamedTemporaryFile(suffix='.fasta')
        ref_fhand.write('>ref\nACTGATTTA\n')
        ref_fhand.flush()

        vcf_path = vcf_compressed.name

        # Writer that knows about the surrounding SNVs.
        out_fhand1 = StringIO()
        writer = IlluminaWriter(ref_fhand.name, out_fhand1, min_length=0,
                                vcf_fpath=vcf_path)
        for snp in Reader(filename=vcf_path):
            writer.write(snp)

        # With no SNPs converted to IUPAC around
        out_fhand2 = StringIO()
        writer = IlluminaWriter(ref_fhand.name, out_fhand2, min_length=0)
        for snp in Reader(filename=vcf_path):
            writer.write(snp)

        # The tabix index is not a temporary file object; delete it by hand.
        remove(vcf_path + '.tbi')

        expected = u''.join((
            u'CHROM\tPOS\tID\tseq\n',
            u'ref\t1\t.\t[C/T]*WSRHT-^A\n',
            u'ref\t2\t.\tY[CT/CA/C]SRHT-^A\n',
            u'ref\t3\t.\tYC[T/A]SRHT-^A\n',
            u'ref\t4\t.\tY*W[G/C]RHT-^A\n',
            u'ref\t5\t.\tY*WS[A/G]HT-^A\n',
            u'ref\t6\t.\tY*WSR[T/A/C]T-^A\n',
            u'ref\t7\t.\tY*WSRH[TT/T]A\n',
            u'ref\t8\t.\tY*WSRHT[T/TA]A\n',
            u'ref\t10\t.\tY*WSRHT-^A[A/C]\n',
        ))
        assert expected == out_fhand1.getvalue()

        expected = u'CHROM\tPOS\tID\tseq\nref\t1\t.\t[C/T]'
        assert expected in out_fhand1.getvalue()
    def assertVcfHasVariantWithCall(self, vcf, chrom, pos, sample, call):
        """
        Assert that a call is made for a given sample in a given position. `call` is a dict corresponding to elements
        in the vcf sample field. Example:

        self.assertVcfHasVariantWithCall(my_vcf, 1, 3184885, 'B',
                                         call={'GT': '1/2', 'DP': 10})

        Every key/value pair of `call` must be present in the sample's call
        data; fields of the sample that are not mentioned in `call` are
        ignored. Raises AssertionError when no record at chrom:pos carries
        such a call for `sample`.
        """
        self.assertVcfHasSample(vcf, sample)

        v = Reader(filename=vcf)
        variant_found = False
        for variant in v.fetch(chrom=chrom, start=pos - 1, end=pos):
            if variant.CHROM != str(chrom) or variant.POS != pos:
                continue
            for cc in variant.samples:
                if cc.sample != sample:
                    continue
                fields = cc.data.__dict__
                # Compare key by key instead of intersecting sets of items:
                # building a set requires every value to be hashable, which
                # fails with TypeError as soon as a sample field holds a list.
                if all(key in fields and fields[key] == value
                       for key, value in call.items()):
                    variant_found = True

        if not variant_found:
            raise AssertionError(
                "Call {} not present for sample {} at {}:{} in {}".format(
                    call, sample, chrom, pos, vcf))
    def assertVcfHasVariantAt(self, vcf, chrom, pos):
        """Raise AssertionError unless `vcf` holds a record at chrom:pos.

        The file is queried with Reader.fetch over the window
        (pos - 1, pos); a returned record matches when its CHROM equals
        str(chrom) and its POS equals pos.
        """
        candidates = Reader(filename=vcf).fetch(chrom=chrom, start=pos - 1,
                                                end=pos)
        if any(rec.CHROM == str(chrom) and rec.POS == pos
               for rec in candidates):
            return
        raise AssertionError("Variant at {}:{} not present in {}".format(
            chrom, pos, vcf))
예제 #4
0
def parse_vcf(assembly,
              vcf_infile,
              compressed=True,
              verbose=True,
              by_id=True,
              **tabix_params):
    """Yield one document per parsed VCF record (or per id of a record).

    Parameters:
        assembly: passed to parse_one_rec and used as the key of the
            per-allele entry that is split out when ``by_id`` is true.
        vcf_infile: path of the VCF file handed to Reader.
        compressed: forwarded to Reader; a file name ending in '.gz' is
            always treated as compressed.
        verbose: when true, log "<rsid>\\t<_id>" for every yielded document.
        by_id: when true and a document's '_id' is a list, yield one shallow
            copy per id with the matching 'alt' and ``assembly`` entries;
            otherwise yield the document as parsed.
        **tabix_params: when given, restrict iteration to the region
            returned by the underlying tabix ``fetch``.

    Documents with a falsy '_id' are skipped and only counted. Totals are
    logged once the input is exhausted.
    """
    t0 = time.time()
    # The original line here was the no-op comparison
    # `compressed == vcf_infile.endswith('.gz')`. Keep the caller's flag, but
    # never open a '.gz' file as plain text.
    compressed = compressed or vcf_infile.endswith('.gz')
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)  # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)
    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    for rec in vcf_r:
        doc = parse_one_rec(assembly, rec)
        if not doc['_id']:
            # nothing usable to key the document on
            cnt_3 += 1
        elif by_id and isinstance(doc['_id'], list):
            # one hgvs id, one doc
            for i, _id in enumerate(doc['_id']):
                _doc = copy.copy(doc)
                _doc['alt'] = doc['alt'][i]
                _doc[assembly] = doc[assembly][i]
                _doc['_id'] = _id
                yield _doc
                cnt_2 += 1
                if verbose:
                    logging.info("%s\t%s", _doc['rsid'], _doc['_id'])
        else:
            # one rsid, one doc (also the single-id case of by_id)
            yield doc
            cnt_2 += 1
            if verbose:
                logging.info("%s\t%s", doc['rsid'], doc['_id'])
        cnt_1 += 1
    logging.info("Done. [{}]".format(timesofar(t0)))
    logging.info("Total rs: {}; total docs: {}; skipped rs: {}".format(
        cnt_1, cnt_2, cnt_3))
    def assertVcfHasVariantWithChromPosRefAlt(self, vcf, chrom, pos, ref, alt):
        """Raise AssertionError unless `vcf` has the given variant.

        A record matches when its CHROM equals str(chrom), its POS equals
        pos, its REF equals ref and alt is one of its ALT entries. Records
        are taken from Reader.fetch over the window (pos - 1, pos).
        """
        def _matches(record):
            # Checks run in the same order as a chained `and`, so later
            # attributes are only read when the earlier ones agree.
            if record.CHROM != str(chrom):
                return False
            if record.POS != pos:
                return False
            if record.REF != ref:
                return False
            return alt in record.ALT

        fetched = Reader(filename=vcf).fetch(chrom=chrom, start=pos - 1,
                                             end=pos)
        hits = [record for record in fetched if _matches(record)]
        if not hits:
            raise AssertionError(
                "Variant at {}:{} {}/{} not present in {}".format(
                    chrom, pos, ref, alt, vcf))
예제 #6
0
    def __init__(self, filename, convert_empty=True, **kwargs):
        """Set up the reader and load the file into a DataFrame.

        Parameters
        ----------
        filename : path of the VCF file; passed both to Reader and to the
            DataFrame loader
        convert_empty : when true, VCF empty values are converted to NaN
            after loading
        **kwargs : forwarded unchanged to Reader.__init__

        The attribute ``is_nan_converted`` records whether the conversion
        was applied.
        """
        Reader.__init__(self, filename=filename, **kwargs)
        self._init_df(filename)
        if not convert_empty:
            self.is_nan_converted = False
            return
        self._convert_empty_to_nan()
        self.is_nan_converted = True
예제 #7
0
    def __init__(self,  ref_fpath, out_fhand, length=60, vcf_fpath=None,
                 min_length=None):
        '''Prepare the writer and emit the header line.

        ref_fpath is indexed as a fasta file. When vcf_fpath is given, a
        Reader is opened on it so that the SNPs around the SNP of interest
        can be replaced with IUPAC codes in the reference sequence.

        length is the desired size of each flanking segment and min_length
        the smallest acceptable one; it defaults to length and may not
        exceed it (ValueError otherwise).
        '''
        self._sep = u'\t'
        self._len = length
        if min_length is None:
            min_length = length
        elif min_length > length:
            raise ValueError(
                'Minimum length must be smaller than required length')
        self._min_len = min_length

        self._ref_seqs = SeqIO.index(ref_fpath, format='fasta')
        self._snvs = Reader(filename=vcf_fpath) if vcf_fpath else None

        self._out_fhand = out_fhand
        # The header goes out as soon as the writer exists.
        out_fhand.write(u'CHROM\tPOS\tID\tseq\n')
        self._prev_chrom = None
예제 #8
0
 def test_vcf2sfs(self):
     """Compare vcf2sfs on the YRI rows of the panel with the stored result.

     The three components of the returned tuple are checked one after the
     other against ``self.result``.
     """
     reader = Reader(filename=self.filename, compressed=True, encoding='utf-8')
     all_samples = read_csv(self.panelname, sep=None, engine='python', skipinitialspace=True, index_col=0)
     yri_only = all_samples[all_samples['pop'] == 'YRI']
     observed = vcf2sfs(reader, yri_only, self.chrom, self.start, self.end, select_chr=True)
     failure_messages = (
         "Failed test of vcf2sfs (sfs)",
         "Failed test of vcf2sfs (sample size)",
         "Failed test of vcf2sfs (common variant)",
     )
     for position, message in enumerate(failure_messages):
         assert all(self.result[position] == observed[position]), message
예제 #9
0
def get_haplotype_stats(template_vcf: vcf.Reader, in_vcf: vcf.Reader, out):
    """Accumulate per-contig haplotype statistics and write the total line.

    For every contig listed in ``in_vcf.contigs`` the template and input
    haplotypes are built and compared with get_haplotype_stats_chromo, which
    also receives ``out``. A contig whose processing raises is skipped
    (best effort). Finally one tab-separated "total" line is written to
    ``out`` with AN50, N50, total phased, total spanned, switch error and
    mismatch error.
    """
    import logging  # local import: only needed to report skipped contigs

    hap_stats = HapStats()
    for contig in in_vcf.contigs:
        try:
            # presumably a probe that raises when the template has no such
            # contig — TODO confirm; the returned iterator is not used
            template_vcf.fetch(contig)
            template_chromo = ChromosomoHaplotype(template_vcf, contig)
            in_chromo = ChromosomoHaplotype(in_vcf, contig)
            chromo_hap_stats = get_haplotype_stats_chromo(
                template_chromo, in_chromo, out, contig)
            hap_stats.insert_hap_stats(chromo_hap_stats)
        except Exception:
            # Still best effort, but no longer a bare `except:` — that also
            # swallowed KeyboardInterrupt and SystemExit.
            logging.getLogger(__name__).debug(
                "Skipping contig %s", contig, exc_info=True)
            continue
    out.write("%s\t%d\t%d\t%d\t%d\t%.8f\t%.8f\n" %
              ("total", hap_stats.get_AN50(), hap_stats.get_N50(),
               hap_stats.get_total_phased(), hap_stats.get_total_spanned(),
               hap_stats.get_switch_error(), hap_stats.get_mismatch_error()))
예제 #10
0
    def test_calculate_statistics(self):
        """Compare each VCF with itself and check the reported statistics."""
        # with freebayes
        freebayes_stats = VCFcomparisons(
            FREEBAYES_VCF_PATH).calculate_statistics(
                Reader(filename=FREEBAYES_VCF_PATH))
        for key, value in (('common', 944), ('uncalled', 0),
                           ('different', 0), ('common_snps_prc', 100)):
            assert freebayes_stats[key] == value

        # with varscan
        varscan_reader = Reader(filename=VARSCAN_VCF_PATH)
        varscan_cmp = VCFcomparisons(VARSCAN_VCF_PATH, samples=['mu16'])
        varscan_stats = varscan_cmp.calculate_statistics(varscan_reader,
                                                         samples=['mu16'])
        for key, value in (('common', 107), ('uncalled', 69),
                           ('different', 0), ('common_snps_prc', 100)):
            assert varscan_stats[key] == value
예제 #11
0
def parse_vcf(vcf_infile, compressed=True, verbose=True, by_id=True, **tabix_params):
    """Yield one document per parsed VCF record (or per id of a record).

    Parameters:
        vcf_infile: path of the VCF file handed to Reader.
        compressed: forwarded to Reader; a file name ending in '.gz' is
            always treated as compressed.
        verbose: when true, print the rsid and _id of every yielded document.
        by_id: when true and a document's '_id' is a list, yield one shallow
            copy per id with the matching 'alt' and POS_KEY entries;
            otherwise yield the document as parsed.
        **tabix_params: when given, restrict iteration to the region
            returned by the underlying tabix ``fetch``.

    Documents with a falsy '_id' are skipped and only counted. Totals are
    printed once the input is exhausted.
    """
    t0 = time.time()
    # The original line here was the no-op comparison
    # `compressed == vcf_infile.endswith('.gz')`. Keep the caller's flag, but
    # never open a '.gz' file as plain text.
    compressed = compressed or vcf_infile.endswith('.gz')
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)   # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)
    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    for rec in vcf_r:
        doc = parse_one_rec(rec)
        if not doc['_id']:
            # nothing usable to key the document on
            cnt_3 += 1
        elif by_id and isinstance(doc['_id'], list):
            # one hgvs id, one doc
            for i, _id in enumerate(doc['_id']):
                _doc = copy.copy(doc)
                _doc['alt'] = doc['alt'][i]
                _doc[POS_KEY] = doc[POS_KEY][i]
                _doc['_id'] = _id
                yield _doc
                cnt_2 += 1
                if verbose:
                    print(_doc['rsid'], '\t', _doc['_id'])
        else:
            # one rsid, one doc (also the single-id case of by_id)
            yield doc
            cnt_2 += 1
            if verbose:
                print(doc['rsid'], '\t', doc['_id'])
        cnt_1 += 1
    print("Done. [{}]".format(timesofar(t0)))
    print("Total rs: {}; total docs: {}; skipped rs: {}".format(cnt_1, cnt_2, cnt_3))
예제 #12
0
def write_chromosome(in_vcf: vcf.Reader, out_vcf: vcf.Writer,
                     chromo_haplotype: ChromosomoHaplotype, contig: str):
    """Copy the records of one contig to ``out_vcf``.

    Every record fetched for ``contig`` is written. Records whose first
    sample has ``gt_type == 1`` (heterozygous) are first passed to the
    ``finalize_record`` method of the entry stored under their position in
    ``chromo_haplotype.chromo_record``.
    """
    het_gt_type = 1
    for rec in in_vcf.fetch(contig):
        if rec.samples[0].gt_type == het_gt_type:
            phased = chromo_haplotype.chromo_record[rec.POS]
            phased.finalize_record(rec)
        # het or not, the record always reaches the output
        out_vcf.write_record(rec)
예제 #13
0
    def test_errors(self):
        """Writing a SNV without enough flanking sequence must fail.

        The writer is built with its default lengths, so the first record of
        the fixture is expected to raise NotEnoughAdjacentSequenceError.
        """
        # 01234
        # 1234567890
        # CCTGATTT-A
        # TAACGA
        #   -  C -A
        vcf_lines = (
            '##fileformat=VCFv4.1',
            '#CHROM POS ID REF ALT QUAL FILTER INFO',
            'ref 1 . C T 10 PASS .',
            'ref 2 . CT CA,C 10 PASS .',
            'ref 3 . T A 10 PASS .',
            'ref 4 . G C 10 PASS .',
            'ref 5 . A G 10 PASS .',
            'ref 6 . T A,C 10 PASS .',
            'ref 7 . TT T 10 PASS .',
            'ref 8 . T TA 10 PASS .',
            'ref 10 . A C 10 PASS .',
        )
        # The fixture is typed with spaces for readability; VCF needs tabs.
        vcf = ('\n'.join(vcf_lines) + '\n').replace(' ', '\t')

        vcf_fhand = NamedTemporaryFile(suffix='.vcf')
        vcf_fhand.write(vcf)
        vcf_fhand.flush()
        vcf_compressed = NamedTemporaryFile(suffix='.vcf.gz')
        compress_with_bgzip(vcf_fhand, vcf_compressed)
        index_vcf_with_tabix(vcf_compressed.name)

        ref_fhand = NamedTemporaryFile(suffix='.fasta')
        ref_fhand.write('>ref\nACTGATTTA\n')
        ref_fhand.flush()

        out_fhand = StringIO()
        writer = IlluminaWriter(ref_fhand.name, out_fhand,
                                vcf_fpath=vcf_compressed.name)
        first_snp = Reader(filename=vcf_compressed.name).next()
        try:
            writer.write(first_snp)
        except IlluminaWriter.NotEnoughAdjacentSequenceError:
            pass
        else:
            self.fail('NotEnoughAdjacentSequenceError expected')
예제 #14
0
    def test_errors(self):
        """Writing a SNV without enough flanking sequence must fail.

        The writer is built with its default lengths, so the first record of
        the fixture is expected to raise NotEnoughAdjacentSequenceError.
        """
        # 01234
        # 1234567890
        # CCTGATTT-A
        # TAACGA
        #   -  C -A
        header_lines = ['##fileformat=VCFv4.1',
                        '#CHROM POS ID REF ALT QUAL FILTER INFO']
        record_lines = ['ref 1 . C T 10 PASS .',
                        'ref 2 . CT CA,C 10 PASS .',
                        'ref 3 . T A 10 PASS .',
                        'ref 4 . G C 10 PASS .',
                        'ref 5 . A G 10 PASS .',
                        'ref 6 . T A,C 10 PASS .',
                        'ref 7 . TT T 10 PASS .',
                        'ref 8 . T TA 10 PASS .',
                        'ref 10 . A C 10 PASS .']
        vcf = ''.join(line + '\n' for line in header_lines + record_lines)

        # Columns are typed with spaces above; VCF needs tabs.
        vcf = vcf.replace(' ', '\t')
        vcf_fhand = NamedTemporaryFile(suffix='.vcf')
        vcf_fhand.write(vcf)
        vcf_fhand.flush()
        vcf_compressed = NamedTemporaryFile(suffix='.vcf.gz')
        compress_with_bgzip(vcf_fhand, vcf_compressed)
        index_vcf_with_tabix(vcf_compressed.name)

        ref_fhand = NamedTemporaryFile(suffix='.fasta')
        ref_fhand.write('>ref\nACTGATTTA\n')
        ref_fhand.flush()

        out_fhand = StringIO()
        writer = IlluminaWriter(ref_fhand.name, out_fhand,
                                vcf_fpath=vcf_compressed.name)
        snps = Reader(filename=vcf_compressed.name)
        snp = snps.next()
        error_cls = IlluminaWriter.NotEnoughAdjacentSequenceError
        try:
            writer.write(snp)
        except error_cls:
            return
        self.fail('NotEnoughAdjacentSequenceError expected')
예제 #15
0
파일: cli.py 프로젝트: rmar1478/client
def vcf_to_hgvs(build_name, input_handle, output_handle):
    """
    Write one HGVS description per alternate allele of a VCF file.

    Each (record, ALT) pair is converted with ``Mutalyzer.vcf_to_hgvs`` and
    the result is written to the output as its own line.

    :arg str build_name: Build name.
    :arg stream input_handle: Open readable handle to a VCF file.
    :arg stream output_handle: Open writeable handle to a text file.
    """
    converter = Mutalyzer(build_name)
    records = Reader(input_handle)

    for rec in records:
        for allele in rec.ALT:
            description = converter.vcf_to_hgvs(
                rec.CHROM, rec.POS, rec.REF, allele)
            output_handle.write('{}\n'.format(description))
예제 #16
0
 def __init__(self):
     """Load the chr1 test VCF and panel and store the expected results."""
     vcf_path = projdir + '/chr1vcftest.gz'
     panel_path = projdir + '/testpanel.panel'
     self.filename = vcf_path
     self.panelname = panel_path
     self.vcf_file = Reader(filename=vcf_path, compressed=True, encoding='utf-8')
     self.panel = read_csv(panel_path, sep=None, engine='python', skipinitialspace=True, index_col=0)
     # Region queried by the tests.
     self.chrom = '1'
     self.start = 159173097
     self.end = 159176290
     # Expected tuple: (array, 108, ['rs2814778']).
     expected_counts = array([3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
     self.result = (expected_counts, 108, ['rs2814778'])
     self.rho = 0.9864313429651763
예제 #17
0
파일: cli.py 프로젝트: rmar1478/client
def vcf_to_db(build_name, input_handle, output_handle):
    """
    Write every alternate allele of a VCF file in database format.

    Each (record, ALT) pair is converted with ``Mutalyzer.vcf_to_db`` and the
    resulting values are passed on, unpacked, to ``_write_db``.

    :arg str build_name: Build name.
    :arg stream input_handle: Open readable handle to a VCF file.
    :arg stream output_handle: Open writeable handle to a text file.
    """
    converter = Mutalyzer(build_name)
    records = Reader(input_handle)

    for rec in records:
        for allele in rec.ALT:
            db_fields = converter.vcf_to_db(
                rec.CHROM, rec.POS, rec.REF, allele)
            _write_db(output_handle, *db_fields)
예제 #18
0
 def _extract(cls, file_path: Union[str, Path]) -> EvaluationData:
     """Read a VCF file into an EvaluationData object.

     One EvaluationDataEntry is built per record from CHROM, POS, REF, the
     ALT sequence, the first CLNSIG value (lower-cased), the record's
     variant type and the reference genome named in the file metadata.

     NOTE(review): looks like a classmethod whose decorator is outside this
     view — confirm.
     """
     records = []
     # Open through a context manager so the handle is closed even when a
     # record fails to convert; it was previously left open.
     with open(file_path, "r") as vcf_handle:
         vcf_reader = Reader(vcf_handle)
         for vcf_record in vcf_reader:
             chrom = str(vcf_record.CHROM)
             pos = vcf_record.POS
             ref = vcf_record.REF
             # NOTE(review): with more than one ALT this reads `.sequence`
             # from the ALT list itself and raises AttributeError — confirm
             # whether multi-allelic records can reach this code.
             alt = (vcf_record.ALT[0]
                    if len(vcf_record.ALT) == 1 else vcf_record.ALT).sequence
             clnsig = PathogencityClass(vcf_record.INFO["CLNSIG"][0].lower())
             variation_type = VariationType(vcf_record.var_type)
             rg = ReferenceGenome.resolve(vcf_reader.metadata["reference"])
             records.append(
                 EvaluationDataEntry(chrom, pos, ref, alt, clnsig,
                                     variation_type, rg))
     return EvaluationData.from_records(records)
예제 #19
0
def main():
    """Rewrite fully uncalled sample groups of a VCF as "0/0" calls.

    Command line arguments:
        1. input VCF path, or "-" for standard input
        2. output VCF path, or "-" for standard output
        3. comma-separated group sizes; samples are split into consecutive
           groups of these sizes

    For every record, each group in which no sample is called has all of its
    calls replaced by GT "0/0", keeping the existing DP, GQ and PL values.
    Every record is then written to the output.
    """
    infile = sys.argv[1]
    outfile = sys.argv[2]
    # Parse the group layout before opening anything, so a malformed
    # argument does not truncate the output file.
    group_sizes = tuple(int(size) for size in sys.argv[3].split(","))
    group_ixs = (0,) + tuple(cumsum(group_sizes))

    # "r" replaces the removed "rU" mode (ValueError on Python 3.11+); text
    # mode already translates newlines on Python 3.
    in_handle = sys.stdin if infile == "-" else open(infile, "r")
    try:
        out_handle = sys.stdout if outfile == "-" else open(outfile, "w")
        try:
            reader = Reader(in_handle)
            writer = Writer(out_handle, reader)
            for rec in reader:
                fix = []

                # The group index gets its own name: the original loop
                # reused the input handle's name and so broke its close().
                for group in range(len(group_sizes)):
                    calls = rec.samples[group_ixs[group]:group_ixs[group + 1]]
                    if not any(c.called for c in calls):
                        fix.extend(calls)

                for c in fix:
                    # This is a hack because PyVCF _Call objects are not mutable
                    c.data = vcf.model.make_calldata_tuple(c.data._fields)(GT="0/0", DP=c.data.DP, GQ=c.data.GQ, PL=c.data.PL)

                writer.write_record(rec)
        finally:
            # Only close what this function opened; the original check
            # misspelled its flag name and raised NameError.
            if out_handle is not sys.stdout:
                out_handle.close()
    finally:
        if in_handle is not sys.stdin:
            in_handle.close()
def BisSNP_vcf2bed(INfile, OUTfile, mextDir, my_session, logobject):
    """Flatten a single-sample VCF into a tab-separated table.

    Parameters:
        INfile: path of the input VCF.
        OUTfile: path of the tab-separated output ('NA' for missing values,
            no index column).
        mextDir, my_session: not used by this function.
        logobject: logger; receives one info message on completion.

    One output row is produced per record, with the columns listed in
    ``colN``.
    """
    # Count the data rows in Python. The original ran
    # `grep -v "#" <INfile> | wc -l` through a shell with the path pasted
    # into the command string, which breaks on (or executes) shell
    # metacharacters in the path. As with grep, any line containing '#' is
    # excluded.
    with open(INfile, 'r') as vcf_lines:
        rowNum = sum(1 for line in vcf_lines if '#' not in line)
    #create empty numpy array of desired size
    colN = [
        'CHROM', 'POS', 'STRAND', 'REF', 'QUAL', 'MQ0', 'DP', 'DP_FREF',
        'DP_RREF', 'DP_FALT', 'DP_RALT', 'C_Cstrand', 'T_Cstrand',
        'Illegal_Cstrand', 'G_Gstrand', 'A_Gstrand', 'Illegal_Gstrand'
    ]
    dummyA = np.empty(shape=(rowNum, 17), dtype='object')
    CpG_df = pd.DataFrame(dummyA, index=range(0, rowNum), columns=colN)
    #read in vcf
    # The handle is closed once all rows are filled; it used to be leaked.
    with open(INfile, 'r') as vcf_handle:
        vcf_reader = Reader(vcf_handle)
        for record, i in zip(vcf_reader, range(0, rowNum)):
            CHROM = pd.Series(record.CHROM, name='CHROM')
            POS = pd.Series(record.POS, name='POS')
            #skip  ALT for CpG calls; skip FILTER
            STRAND = pd.Series(record.INFO['CS'], name='STRAND')
            REF = pd.Series(record.REF, name='REF')
            QUAL = pd.Series(record.QUAL, name='QUAL')
            MQ0 = pd.Series(record.INFO['MQ0'], name='MQ0')
            #Context=pd.Series(record.INFO['REF'],name='Context')
            #Sample=pd.Series(record.samples[0].sample,name='Sample') #don't need for single sample vcf
            # The unpacking requires exactly these eleven sample fields, in
            # this order.
            GT, BQ, BRC6, CM, CP, CU, DP, DP4, GP, GQ, SS = record.samples[0].data
            #skip GT as non-homozygous CGs have been filtered out
            DP = pd.Series(DP, name='DP')
            DP4_dict = cll.OrderedDict(
                zip(['DP_FREF', 'DP_RREF', 'DP_FALT', 'DP_RALT'], DP4))
            DP4_df = pd.DataFrame.from_dict(DP4_dict, orient='index').transpose()
            BRC6_dict = cll.OrderedDict(
                zip([
                    'C_Cstrand', 'T_Cstrand', 'Illegal_Cstrand', 'G_Gstrand',
                    'A_Gstrand', 'Illegal_Gstrand'
                ], BRC6))
            BRC6_df = pd.DataFrame.from_dict(BRC6_dict, orient='index').transpose()
            comb_df = pd.concat(
                [CHROM, POS, STRAND, REF, QUAL, MQ0, DP, DP4_df, BRC6_df],
                axis=1,
                join='inner')
            CpG_df.iloc[i, ] = comb_df.iloc[0, ]
    CpG_df.to_csv(OUTfile, sep='\t', na_rep='NA', index=False)
    logobject.info('CpG vcf to txt conversion complete')
    return
예제 #21
0
def test_neutrality_from_vcf(vcf_name, panel_name, coord, start, end, sel,
                             reps, select_chr):
    """Echo the neutrality statistic for a region of a compressed VCF.

    The panel file PANEL_NAME is read with pandas and, unless SEL is
    (None, None), restricted to rows whose column SEL[0] equals SEL[1]. The
    site frequency spectrum for COORD, START and END is computed with
    selectiontest.vcf2sfs and passed to selectiontest.test_neutrality with
    REPS; the result is written with click.echo.
    """
    reader = Reader(filename=vcf_name, compressed=True, encoding='utf-8')
    samples = pd.read_csv(panel_name, sep=None, engine='python',
                          skipinitialspace=True, index_col=0)
    if sel != (None, None):
        column, wanted = sel[0], sel[1]
        samples = samples[samples[column] == wanted]
    spectrum = selectiontest.vcf2sfs(reader, samples, coord, start, end,
                                     select_chr)
    # Only the spectrum itself (first element) is needed below.
    sfs = spectrum[0]
    _, n, non_seg_snps = spectrum
    rho = selectiontest.test_neutrality(sfs, variates0=None, variates1=None,
                                        reps=reps)
    click.echo(rho)
예제 #22
0
def _grid_export_vcf(filename, genome_build, colmodels, items, sample_ids, sample_names_by_id):
    """Build a streaming HTTP response that serves `items` as a VCF download.

    A header-only VCF template is generated from the column models and the
    sample names; a Writer based on it emits one record per item into a
    stash buffer whose current value is streamed after each record.
    """
    samples = [sample_names_by_id[sample_id] for sample_id in sample_ids]

    info_dict = _get_colmodel_info_dict(colmodels)
    template = _colmodels_to_vcf_header(genome_build, info_dict, samples)
    template_reader = Reader(template, strict_whitespace=True)

    stash = StashFile()
    record_writer = Writer(stash, template_reader)

    def stream_rows():
        # Runs lazily, as the response is consumed.
        for item in items:
            record_writer.write_record(
                _grid_item_to_vcf_record(info_dict, item, sample_ids, samples))
            yield stash.value

    response = StreamingHttpResponse(stream_rows(), content_type="text/csv")
    response['Content-Disposition'] = f'attachment; filename="{filename}.vcf"'
    return response
예제 #23
0
def parse(json_filename, vcf_file):
    """Export the VCF records whose first AF value exceeds 0.01 as JSON.

    The selected records are converted with ``to_json(record, index)``,
    grouped per chromosome and dumped to ``json_filename``. A summary with
    the number of selected records and the number of records read is written
    to ``Output/data_split_<second path component of json_filename>.txt``.
    Records that raise KeyError (e.g. no "AF" in INFO) are skipped.
    """
    amount = 0
    # Start with an empty errors file.
    with open("errors.txt", "w"):
        pass

    json_chr = {}
    # Stays 0 for a VCF without records; the original read the loop index
    # after the loop and failed with an unbound name in that case.
    total_data = 0

    # Close the VCF handle when done; it used to be leaked.
    with open(vcf_file) as vcf_handle:
        vcf_reader = Reader(vcf_handle)

        for index, record in enumerate(vcf_reader):
            total_data = index + 1
            try:
                if record.INFO["AF"][
                        0] > 0.01:  # and record.INFO["non_cancer_AF_popmax"] == 0: #  or not "e" in str(x.INFO[keys[0]][0])

                    amount += 1

                    # Convert first, so a failing conversion does not leave
                    # an empty chromosome entry behind.
                    entry = to_json(record, index)
                    json_chr.setdefault(str(record.CHROM), []).append(entry)

            # Just skip errors.
            except KeyError:
                pass

            # Testing only
            # if amount > 6:
            # break

    # Write the file used for MongoDB
    with open(json_filename, "w") as outjson:
        json.dump(json_chr, outjson)

    # Extra information about how much data is stored.
    with open("Output/data_split_{}.txt".format(json_filename.split("/")[1]),
              "w") as dp:
        dp.write("Amount of data: {}\nTotal data in file: {}".format(
            amount, total_data))
예제 #24
0
    def __init__(self, in_vcf: vcf.Reader, chromo: str):
        """Collect the heterozygous records of one chromosome.

        Only records whose first sample has ``gt_type == 1`` are kept. Each
        one is copied into a Record and stored in ``chromo_record`` under
        its position. Records with a non-zero phase set are also registered
        in ``chromo_record2phaseset_map`` and added to the matching PhaseSet
        in ``chromo_phase_set`` (created on first use).
        """
        self.chromo_record = dict()
        self.chromo_phase_set = dict()
        self.chromo_record2phaseset_map = dict()
        self.graph_struct = graph.Graph()
        # PS label -> position of the first record seen with that label
        first_pos_of_ps = dict()
        het_count = 0

        for rec in in_vcf.fetch(chromo):
            call = rec.samples[0]
            if call.gt_type != 1:  # not het loci
                continue

            if not call.phased:
                ps_value = 0
            elif 'PS' in rec.FORMAT.split(':'):
                # Relabel the phase set by the position where it first shows up.
                ps_value = first_pos_of_ps.setdefault(call['PS'], rec.POS)
            else:
                # phased but without a PS field
                ps_value = 1

            record = Record()
            record.copy_from_rec(rec, ps_value, het_count)
            het_count += 1
            self.chromo_record[record.pos] = record

            if record.ps != 0:
                ps_key = record.ps
                self.chromo_record2phaseset_map[record.pos] = ps_key
                try:
                    phase_set = self.chromo_phase_set[ps_key]
                except KeyError:
                    phase_set = PhaseSet(record.ps)
                    self.chromo_phase_set[ps_key] = phase_set
                phase_set.insert_record(record)
예제 #25
0
import sys
from vcf import Reader
import gzip

# Count the records of the VCF named on the command line that have at most
# one ALT allele, a QUAL that is not below 20 and a first alternate-allele
# frequency that is not above 0.05.
vcf = Reader(open(sys.argv[1], 'r'))
n = 0

for v in vcf:
    # Tests run left to right and stop at the first one that rejects.
    rejected = len(v.ALT) > 1 or v.QUAL < 20 or v.aaf[0] > 0.05
    if not rejected:
        n += 1
print(n)
예제 #26
0
def overwrite_reader_samples(vcf_reader: vcf.Reader, samples):
    """Replace the reader's sample list and rebuild its name -> index map.

    ``vcf_reader.samples`` is set to ``samples`` and
    ``vcf_reader._sample_indexes`` to a dict giving each sample name its
    position in that list (the last position wins for repeated names).
    """
    vcf_reader.samples = samples
    vcf_reader._sample_indexes = {
        name: position for position, name in enumerate(vcf_reader.samples)
    }
예제 #27
0
class IlluminaWriter(object):
    '''It writes the SNPs in Illumina format

    Each SNV is written as one tab-separated line: CHROM, POS, ID and the
    sequence made of the flanking segment before the SNV, the SNV section
    and the flanking segment after it.

    ref_fpath should be in fasta format. When a VCF path is given, the SNVs
    found in the flanking segments are reported as IUPAC codes.
    '''

    # TODO add extra error classes
    # TODO include the error classes inside this class to easy access
    class NotEnoughAdjacentSequenceError(Exception):
        '''Raised when a flanking segment is shorter than the minimum.'''
        pass

    def __init__(self,  ref_fpath, out_fhand, length=60, vcf_fpath=None,
                 min_length=None):
        '''It inits.

        The vcf will be used to replace in the reference sequence the SNPs
        around the SNP of interest with IUPAC codes

        length is the desired size of each flanking segment and min_length
        the smallest acceptable one; it defaults to length and may not
        exceed it (ValueError otherwise). The header line is written to
        out_fhand right away.
        '''
        self._sep = u'\t'
        self._len = length
        if min_length is None:
            min_length = length
        if min_length > length:
            msg = 'Minimum length must be smaller than required length'
            raise ValueError(msg)
        self._min_len = min_length

        self._ref_seqs = SeqIO.index(ref_fpath, format='fasta')

        if vcf_fpath:
            self._snvs = Reader(filename=vcf_fpath)
        else:
            self._snvs = None
        self._out_fhand = out_fhand
        out_fhand.write(u'CHROM\tPOS\tID\tseq\n')
        self._prev_chrom = None

    def write(self, snv):
        '''It writes one line for the given SNV.

        Raises NotEnoughAdjacentSequenceError when either flanking segment
        is shorter than the minimum length; nothing is written in that case.
        '''
        chrom_name = snv.CHROM

        # Keep the last chromosome around so that consecutive SNVs of the
        # same chromosome do not look it up again.
        prev_chrom = self._prev_chrom
        if prev_chrom is None or prev_chrom.name != chrom_name:
            chrom = self._ref_seqs[chrom_name]
            self._prev_chrom = chrom
        else:
            chrom = prev_chrom

        length = self._len
        min_len = self._min_len

        snv_start = snv.start   # 0 based
        snv_end = snv.end       # 1 based
        # Clamp at 0: a negative slice start counts from the END of the
        # chromosome, so an SNV closer than `length` to the start used to
        # get an empty or wrongly truncated first segment whenever the
        # chromosome was longer than the distance asked for.
        desired_start = max(0, snv_start - length)  # desired segment start
        end = snv_end + length      # desired segment end
        chrom_seq = chrom.seq
        first_segment = unicode(chrom_seq[desired_start:snv_start])

        # NOTE(review): this is the segment BEFORE the SNV, yet the message
        # says 3' (and the one below says 5') — confirm the intended labels.
        if len(first_segment) < min_len:
            msg = "Not enough sequence in 3'. ID: %s, POS: %d, CHROM: %s"
            msg %= (snv.ID, snv.POS, snv.CHROM)
            raise self.NotEnoughAdjacentSequenceError(msg)

        if self._snvs:
            real_start = snv_start - len(first_segment)
            close_snvs = self._snvs.fetch(chrom.name, start=real_start,
                                          end=snv_start)
            first_segment = _replace_snvs_with_iupac(first_segment, close_snvs,
                                                     seq_offset=real_start)

        snv_segment = _build_snv_section(snv)
        second_segment = unicode(chrom_seq[snv_end:end])
        if len(second_segment) < min_len:
            msg = "Not enough sequence in 5'. ID: %s, POS: %d, CHROM: %s"
            msg %= (snv.ID, snv.POS, snv.CHROM)
            raise self.NotEnoughAdjacentSequenceError(msg)

        if self._snvs:
            real_end = snv_end + len(second_segment)
            close_snvs = self._snvs.fetch(chrom.name, start=snv_end,
                                          end=real_end)
            second_segment = _replace_snvs_with_iupac(second_segment,
                                                      close_snvs,
                                                      seq_offset=snv_end)

        out_fhand = self._out_fhand
        sep = self._sep
        out_fhand.write(unicode(snv.CHROM))
        out_fhand.write(sep)
        out_fhand.write(unicode(snv.POS))
        out_fhand.write(sep)
        snp_id = snv.ID
        if snp_id is None:
            snp_id = u'.'
        out_fhand.write(snp_id)
        out_fhand.write(sep)
        out_fhand.write(first_segment)
        out_fhand.write(snv_segment)
        out_fhand.write(second_segment)
        out_fhand.write(u'\n')

    def flush(self):
        '''It flushes the output file.'''
        self._out_fhand.flush()

    def close(self):
        '''It closes the output file.'''
        self._out_fhand.close()
예제 #28
0
 def assertVcfHasSample(self, vcf, sample):
     """Raise AssertionError unless `sample` is one of the samples of `vcf`."""
     known_samples = Reader(filename=vcf).samples
     if sample in known_samples:
         return
     raise AssertionError("Sample {} not present in {}".format(
         sample, vcf))
예제 #29
0
def records_from_vcf(vcf_file: str) -> list:
    """Creates a list of VCF Record objects from a VCF file

    Records are read one by one. A record is kept (as a VCFRecordObj with
    its chromosome, position and genotypes) when the number of alternates
    reported by get_genotypes divided by the number of samples reaches
    MAF_THRESHOLD. Records that raise RuntimeError or ValueError are counted
    and skipped. A summary of all counters is printed before returning.
    """

    # Create the list of records
    records: list = []
    total: int = 0
    n_failed: int = 0
    n_with_too_many_unknown_genotypes: int = 0
    n_below_threshold: int = 0
    # Taken from the first record; must be 1 or 2.
    data_set_ploidity = None

    # Load the VCF. The context manager closes the handle, which was
    # previously left open.
    with open(vcf_file, 'r') as vcf_handle:
        vcf_reader = Reader(vcf_handle)

        while True:
            try:
                # Counted up front so that a record failing inside next()
                # is still part of the total.
                total += 1

                r = next(vcf_reader)
                chromosome: str = r.CHROM
                position: int = r.POS
                ref_allele: str = r.REF
                alt_alleles: list = r.ALT

                if data_set_ploidity is None:
                    data_set_ploidity = r.samples[0].ploidity
                    assert data_set_ploidity == 1 or data_set_ploidity == 2

                genotypes, n_alternates = get_genotypes(
                    data_set_ploidity=data_set_ploidity, r=r, ref_allele=ref_allele, alt_alleles=alt_alleles
                )

                assert len(genotypes) == len(r.samples)

                # Only add records if the ratio of alternate alleles to total alleles exceeds the (MAF) filter threshold
                if n_alternates / len(r.samples) >= MAF_THRESHOLD:
                    records.append(VCFRecordObj(chromosome=chromosome, position=position, genotypes=genotypes))
                else:
                    n_below_threshold += 1
            except RuntimeError:
                # There was a missing genotype in one of the samples of the current record
                n_with_too_many_unknown_genotypes += 1
                continue
            except ValueError:
                # The current record failed because the call to next above raised an exception
                n_failed += 1
                continue
            except StopIteration:
                # Decrement the total number of records since the latest increment occurred after the last iteration
                total -= 1
                break

    assert len(records) + n_with_too_many_unknown_genotypes + n_below_threshold + n_failed == total

    # The format string has a single placeholder; the stray second argument
    # was dropped (same output).
    print('For {}:'.format(vcf_file))
    print('Minor Allele Frequency Threshold: {0:.2f}'.format(MAF_THRESHOLD))

    # Print the percentages of the records that will actually be used, the records with missing genotypes, the records
    # that didn't have enough alternate alleles, and the records that flat out failed
    print_percentage(msg='records successfully added', amount=len(records), total=total)
    print_percentage(
        msg='records with too many missing genotypes', amount=n_with_too_many_unknown_genotypes, total=total
    )
    print_percentage(msg='records below the minor allele frequency threshold', amount=n_below_threshold, total=total)
    print_percentage(msg='records that failed to parse', amount=n_failed, total=total)

    print('Total number of records:', total)

    return records
#! /usr/bin/python

from vcf import Reader
from sys import argv

# Print one tab-separated line per record of the VCF given on the command
# line. Only CHROM, POS, REF, ALT and SAMPLE are filled in; the remaining
# columns are emitted empty.
path = argv[1]
reader = Reader(filename=path)

COLUMNS = ('CHROM', 'POS', 'REF', 'ALT', 'SAMPLE', 'INDEL', 'HW', 'MASKED',
           'INFORM', 'REP', 'A_INDEL', 'A_SNP', 'ANNO')

print('\t'.join(COLUMNS))
calls = len(reader.samples)
# The original wrote this line to `output`, a name that is never defined, so
# the script stopped with NameError right after the header. It now goes to
# stdout like everything else.
# NOTE(review): confirm stdout is the intended destination.
print('calls:' + str(calls))

for record in reader:
    info = {column: '' for column in COLUMNS}
    info.update({'CHROM': record.CHROM, 'POS': record.POS,
                 'REF': record.REF, 'ALT': record.ALT, 'SAMPLE': []})
    for sample in record.samples:
        # (sample name, first data field, second data field or None)
        second = sample.data[1] if len(sample.data) > 1 else None
        info['SAMPLE'].append((sample.sample, sample.data[0], second))
    print('\t'.join(str(info[column]) for column in COLUMNS))