def __init__(self, vgz, chm=None, bp0=None, bp1=None, wnd=1024, dsg='012',
             dtp=int):
    """
    Initialize the sampler state for one chromosome of a bgzipped VCF file.

    vgz: name of the bgzipped VCF file (*.vcf.gz).
         NOTE(review): earlier documentation said None is accepted, but
         this constructor always opens the file -- confirm.
    chm: chromosome to be sampled, given as a key of the module level CKY
         table; when None, the chromosome of the first record in the file
         is used.
    bp0: initial position in the chromosome, zero based. None or 0 selects
         the start of the first record in the file; a negative value is
         added to that start.
    bp1: final position in the chromosome. None or 0 selects the contig
         length; a negative value is counted back from the contig length.
    wnd: sample window size (default=1024).
    dsg: encoding for alternative allele count 0, 1, and 2; use '012' for
         additive, '022' for dominant, and '002' for recessive encoding.
         The default scheme is additive, '012'.
    dtp: stored as given (default=int); presumably the output data type,
         it is not used inside this constructor -- confirm against the
         sampling code.
    """
    self.VGZ = vgz

    # peek at the first locus through a throw-away reader, so the reader
    # kept on the instance has not consumed any record yet
    first = next(vcfR(filename=vgz))

    # the reader that stays attached to this object
    reader = vcfR(filename=vgz)

    # resolve the chromosome, then its length from the reader's contigs
    if chm is None:
        chm = first.CHROM
    else:
        chm = CKY[chm]
    length = reader.contigs[chm].length
    self.CHR = int(chm)

    # starting position: falsy -> first locus, negative -> offset from it
    if bp0:
        if bp0 < 0:
            bp0 = first.start + bp0
    else:
        bp0 = first.start
    self.BP0 = bp0

    # ending position: falsy -> contig length, negative -> counted back
    if bp1:
        if bp1 < 0:
            bp1 = length + bp1
    else:
        bp1 = length
    self.BP1 = bp1

    # private members
    self.__vgz__ = reader   # vcf reader
    self.__gvr__ = first    # first variant of the file
    self.__chm__ = chm
    self.__bp0__ = bp0
    self.__bp1__ = bp1
    self.__wnd__ = wnd
    self.__dsg__ = dsg
    self.__dtp__ = dtp
    self.__mem__ = []
    self.__pos__ = []
    self.__vid__ = []
def __init__(self, vgz, chm=None, bp0=None, bp1=None, sbj=None):
    """
    Open a region of a bgzipped VCF file and position on its first variant.

    vgz: bgzipped VCF file, with tabix index.
    chm: the chromosome, given as a key of the module level CKY table;
         when None, the chromosome of the first record in the file is used.
    bp0: the starting basepair, 0 based, inclusive. None or 0 selects the
         start of the first record in the file; a negative value is added
         to that start.
    bp1: the ending basepair, 0 based, exclusive. None or 0 selects the
         contig length; a negative value is counted back from the contig
         length.
    sbj: accepted but not used by this constructor.
    """
    # record the parameters
    self.VGZ = vgz

    # peek at the first locus of the file through a throw-away reader
    first = next(vcfR(filename=vgz))

    # the reader that will serve the requested region
    reader = vcfR(filename=vgz)

    # resolve the chromosome, then its length from the reader's contigs
    if chm is None:
        chm = first.CHROM
    else:
        chm = CKY[chm]
    length = reader.contigs[chm].length

    # starting position: falsy -> first locus, negative -> offset from it
    if bp0:
        if bp0 < 0:
            bp0 = first.start + bp0
    else:
        bp0 = first.start

    # ending position: falsy -> contig length, negative -> counted back
    if bp1:
        if bp1 < 0:
            bp1 = length + bp1
    else:
        bp1 = length

    # restrict the VCF to the requested range
    region = reader.fetch(chm, bp0, bp1)

    # advance to the first record starting at or after bp0; records that
    # start earlier are consumed and dropped, and the variant stays None
    # when the region runs out first
    gvr = None
    while True:
        try:
            candidate = next(region)
        except StopIteration:
            break
        if candidate.start >= bp0:
            gvr = candidate
            break

    # private members
    self.__pos__ = bp0      # the pointer
    self.__gvr__ = gvr      # the variant
    self.__vgz__ = region   # vcf reader
    self.__bp0__ = bp0      # starting position
    self.__bp1__ = bp1      # ending position
def loadVCF(vcf):
    """
    Read genomic VCF file by name. Fix MAF and allele orders.

    Every allele is reduced to 0 (reference) or 1 (any alternative). A
    missing allele, or a call without a genotype, is encoded as 0 instead
    of raising on the comparison with '0'.

    The function loads the entire VCF into memory, so it is not recommended
    for huge VCF.

    vcf: name of the VCF file, handed to the PyVCF reader.

    return: (gmx, sbj)
        gmx: uint8 array, dim_0: sample index; dim_1: copy number;
             dim_2: variant index. A file without variant records gives
             an array with zero variants rather than an error.
        sbj: subject IDs as reported by the reader.
    """
    # Python VCF reader: pip install pyvcf
    from vcf import Reader as vcfR

    def is_alt(call, copy):
        """ 1 if the requested copy carries a non-reference allele. """
        alleles = call.gt_alleles
        if alleles is None:         # no genotype recorded for this call
            return 0
        allele = alleles[copy]
        # a missing allele is None, which can not be ordered against '0'
        return int(allele is not None and allele > '0')

    reader = vcfR(filename=vcf)
    sbj = reader.samples        # subject IDs

    # the two homologous chromosomes, one row per variant
    A, B = [], []
    for v in reader:
        A.append([is_alt(g, 0) for g in v.samples])     # copy #1
        B.append([is_alt(g, 1) for g in v.samples])     # copy #2

    # compile genomic matrix: (copy, variant, sample); the explicit shape
    # keeps all three axes even when there are no variants
    gmx = np.array([A, B], dtype='uint8').reshape(2, len(A), len(sbj))

    # MAF fixing: flip variants whose alternative allele count exceeds the
    # number of samples, i.e. more than half of the 2 * samples alleles
    flip = gmx.sum((0, 2)) > gmx.shape[2]
    gmx[:, flip, :] = 1 - gmx[:, flip, :]

    # Allele order fix, make sure copy(a) >= copy(b)
    swap = gmx[0] < gmx[1]
    gmx[0][swap] = 1
    gmx[1][swap] = 0

    # dim_0: sample index; dim_1: copy number; dim_2: variant index
    gmx = gmx.transpose(2, 0, 1)
    return gmx, sbj
def loadVCF(vcf):
    """
    Read genomic VCF file by name. Fix MAF and allele orders.

    Every allele is reduced to 0 (reference) or 1 (any alternative). A
    missing allele, or a call without a genotype, is encoded as 0 instead
    of raising on the comparison with '0'.

    vcf: name of the VCF file, handed to the PyVCF reader.

    return: (gmx, sbj)
        gmx: uint8 array, dim_0: sample index; dim_1: copy number;
             dim_2: variant index. A file without variant records gives
             an array with zero variants rather than an error.
        sbj: subject IDs as reported by the reader.
    """
    # Python VCF reader: pip install pyvcf
    from vcf import Reader as vcfR

    def is_alt(call, copy):
        """ 1 if the requested copy carries a non-reference allele. """
        alleles = call.gt_alleles
        if alleles is None:         # no genotype recorded for this call
            return 0
        allele = alleles[copy]
        # a missing allele is None, which can not be ordered against '0'
        return int(allele is not None and allele > '0')

    reader = vcfR(filename=vcf)
    sbj = reader.samples        # subject IDs

    # the two homologous chromosomes, one row per variant
    A, B = [], []
    for v in reader:
        A.append([is_alt(g, 0) for g in v.samples])     # copy #1
        B.append([is_alt(g, 1) for g in v.samples])     # copy #2

    # compile genomic matrix: (copy, variant, sample); the explicit shape
    # keeps all three axes even when there are no variants
    gmx = np.array([A, B], dtype='uint8').reshape(2, len(A), len(sbj))

    # MAF fixing: flip variants whose alternative allele count exceeds the
    # number of samples, i.e. more than half of the 2 * samples alleles
    flip = gmx.sum((0, 2)) > gmx.shape[2]
    gmx[:, flip, :] = 1 - gmx[:, flip, :]

    # Allele order fix, make sure copy(a) >= copy(b)
    swap = gmx[0] < gmx[1]
    gmx[0][swap] = 1
    gmx[1][swap] = 0

    # dim_0: sample index; dim_1: copy number; dim_2: variant index
    gmx = gmx.transpose(2, 0, 1)
    return gmx, sbj
def __init__(self, vgz, chm=None, bp0=0, bp1=None, wnd=1024, dsg='012'):
    """
    Open a region of a bgzipped VCF file for windowed reading.

    vgz: bgzipped VCF file, with tabix index.
    chm: the chromosome, given as a key of the module level CKY table;
         when None, the chromosome of the first record in the file is used.
    bp0: the starting basepair, 0 based, inclusive; it is taken modulo the
         contig length.
    bp1: the ending basepair, 0 based, exclusive. None or 0 selects the
         contig length; a negative value is counted back from the contig
         length.
    wnd: stored as given (default=1024); presumably the window size --
         not used inside this constructor.
    dsg: stored as given (default='012'); presumably the allele count
         encoding -- not used inside this constructor.
    """
    # record the parameters
    self.VGZ = vgz

    # peek at the first locus of the file through a throw-away reader
    first = next(vcfR(filename=vgz))

    # the reader that will serve the requested region
    reader = vcfR(filename=vgz)

    # resolve the chromosome, then its length from the reader's contigs
    if chm is None:
        chm = first.CHROM
    else:
        chm = CKY[chm]
    length = reader.contigs[chm].length
    self.CHR = int(chm)

    # starting position, wrapped into the contig
    bp0 = bp0 % length

    # ending position: falsy -> contig length, then negative -> counted back
    if not bp1:
        bp1 = length
    if bp1 < 0:
        bp1 = length + bp1

    # restrict the VCF to the requested range
    region = reader.fetch(chm, bp0, bp1)

    # private members
    self.__chm__ = chm
    self.__vgz__ = region   # vcf reader
    self.__bp0__ = bp0      # starting position
    self.__bp1__ = bp1      # ending position
    self.__wnd__ = wnd
    self.__dsg__ = dsg
    # NOTE(review): the original first stored bp0 in __pos__ ("the pointer")
    # and then overwrote it with an empty list; only the effective, final
    # value is assigned here -- confirm which of the two was intended.
    self.__pos__ = []
    self.__vid__ = []