Exemplo n.º 1
0
    def __init__(self,
                 vgz,
                 chm=None,
                 bp0=None,
                 bp1=None,
                 wnd=1024,
                 dsg='012',
                 dtp=int):
        """
        Prepare a windowed sampler over a bgzipped VCF file.

        vgz: name of the bgzipped VCF file (*.vcf.gz).
        chm: chromosome to be sampled. When given, it is translated through
        the CKY lookup table; when None, the chromosome of the first record
        in the file is used.
        bp0: initial position in the chromosome. None or 0 selects the start
        of the first record in the file; a negative value is added to that
        start as an offset.
        bp1: final position in the chromosome. None or 0 selects the length
        of the contig; a negative value is counted back from that length.
        wnd: sample window size (default=1024).
        dsg: encoding for alternative allele count 0, 1, and 2; use '012' for
        additive, '022' for dominant, and '002' for recessive encoding. The
        default scheme is additive, '012'.
        dtp: data type, kept as given (default=int).
        """
        self.VGZ = vgz

        # a separate, throw-away reader is used to peek at the first locus,
        # so the reader kept below still starts at the top of the file
        first = next(vcfR(filename=vgz))

        # the reader this object holds on to
        reader = vcfR(filename=vgz)

        # resolve the chromosome, then look up its length
        if chm is None:
            chm = first.CHROM
        else:
            chm = CKY[chm]
        contig_len = reader.contigs[chm].length
        self.CHR = int(chm)

        # starting position: falsy -> first locus, negative -> offset from it
        if bp0:
            if bp0 < 0:
                bp0 = first.start + bp0
        else:
            bp0 = first.start
        self.BP0 = bp0

        # ending position: falsy -> contig length, negative -> from the end
        if bp1:
            if bp1 < 0:
                bp1 = contig_len + bp1
        else:
            bp1 = contig_len
        self.BP1 = bp1

        # private members: reader state and sampling configuration
        self.__vgz__ = reader
        self.__gvr__ = first
        self.__chm__ = chm
        self.__bp0__ = bp0
        self.__bp1__ = bp1
        self.__wnd__ = wnd
        self.__dsg__ = dsg
        self.__dtp__ = dtp

        # private members: buffers, empty until filled elsewhere
        self.__mem__ = []
        self.__pos__ = []
        self.__vid__ = []
Exemplo n.º 2
0
    def __init__(self, vgz, chm=None, bp0=None, bp1=None, sbj=None):
        """
        Open a bgzipped VCF file and position it on a genomic region.

        vgz: bgzipped VCF file, with tabix index.
        chm: the chromosome. When given, it is translated through the CKY
        lookup table; when None, the chromosome of the first record in the
        file is used.
        bp0: the starting basepair, 0 based, inclusive. None or 0 selects the
        start of the first record in the file; a negative value is added to
        that start as an offset.
        bp1: the ending basepair, 0 based, exclusive. None or 0 selects the
        length of the contig; a negative value is counted back from it.
        sbj: accepted, but not used by this constructor.
        """
        # record the parameters
        self.VGZ = vgz

        # peek at the first locus of the file with a throw-away reader
        first = next(vcfR(filename=vgz))

        # the reader that will actually be kept
        reader = vcfR(filename=vgz)

        # resolve the chromosome, then look up its length
        if chm is None:
            chm = first.CHROM
        else:
            chm = CKY[chm]
        contig_len = reader.contigs[chm].length

        # starting position: falsy -> first locus, negative -> offset from it
        if bp0:
            if bp0 < 0:
                bp0 = first.start + bp0
        else:
            bp0 = first.start

        # ending position: falsy -> contig length, negative -> from the end
        if bp1:
            if bp1 < 0:
                bp1 = contig_len + bp1
        else:
            bp1 = contig_len

        # restrict VCF range
        region = reader.fetch(chm, bp0, bp1)

        # advance to the first locus that does not start before bp0;
        # gvr stays None when the region runs out of records first
        while True:
            gvr = next(region, None)
            if gvr is None or not gvr.start < bp0:
                break

        # private members
        self.__pos__ = bp0     # the pointer
        self.__gvr__ = gvr     # the variant
        self.__vgz__ = region  # vcf reader
        self.__bp0__ = bp0     # starting position
        self.__bp1__ = bp1     # ending position
Exemplo n.º 3
0
def loadVCF(vcf):
    """ Read genomic VCF file by name. Fix MAF and allele orders.

    The function loads the entire VCF into memory, so it is not
    recommended for huge VCF.

    vcf: name of the VCF file, handed to the PyVCF reader.

    return: the tuple (gmx, sbj).
    gmx: uint8 array of 0/1 allele indicators, indexed as
    [sample, copy, variant]; a file without variant records gives an
    array with a zero-length variant axis.
    sbj: the subject IDs reported by the reader.
    """
    # Python VCF reader: pip install pyvcf
    from vcf import Reader as vcfR

    reader = vcfR(filename=vcf)
    sbj = reader.samples        # subject IDs

    # the two homologous chromosome copies, one row per variant
    A, B = [], []
    for v in reader:
        # copy #1 and #2; any allele string greater than '0' is coded as 1
        # NOTE(review): assumes every call carries two allele strings
        # (diploid, nothing missing) -- confirm against the input files
        A.append([int(g.gt_alleles[0] > '0') for g in v.samples])
        B.append([int(g.gt_alleles[1] > '0') for g in v.samples])

    # compile genomic matrix -- dim_0: copy; dim_1: variant; dim_2: sample.
    # The explicit reshape keeps all three axes when no variant was read;
    # without it the empty array is 2-D and the axis-2 operations below fail.
    gmx = np.array([A, B], dtype='uint8').reshape(2, len(A), len(sbj))

    # MAF fixing: where the coded allele is carried by more than half of
    # the 2 * n_sample chromosome copies, flip the coding of that variant
    i = np.where(gmx.sum((0, 2)) > gmx.shape[2])[0]
    gmx[:, i, :] = 1 - gmx[:, i, :]

    # Allele order fix, make sure copy(a) >= copy(b)
    i = np.where(gmx[0, :, :] < gmx[1, :, :])
    gmx[0, :, :][i] = 1
    gmx[1, :, :][i] = 0

    # dim_0: sample index; dim_1: copy number; dim_2: variant index
    gmx = gmx.transpose(2, 0, 1)
    return gmx, sbj
Exemplo n.º 4
0
def loadVCF(vcf):
    """ Read genomic VCF file by name.
    Fix MAF and allele orders.
    return genomic matrix and subject IDs.

    vcf: name of the VCF file, handed to the PyVCF reader.

    The genomic matrix is a uint8 array of 0/1 allele indicators indexed
    as [sample, copy, variant]. A file without variant records gives an
    array whose variant axis has length zero.
    """
    # Python VCF reader: pip install pyvcf
    from vcf import Reader as vcfR

    reader = vcfR(filename=vcf)
    sbj = reader.samples  # subject IDs

    # the two homologous chromosome copies, one row per variant
    A, B = [], []
    for v in reader:
        # copy #1 and #2; any allele string greater than '0' is coded as 1
        # NOTE(review): assumes every call carries two allele strings
        # (diploid, nothing missing) -- confirm against the input files
        A.append([int(g.gt_alleles[0] > '0') for g in v.samples])
        B.append([int(g.gt_alleles[1] > '0') for g in v.samples])

    # compile genomic matrix, laid out as (copy, variant, sample); forcing
    # the shape keeps it 3-D for a file with no variants, where the plain
    # array would be 2-D and break the axis-2 operations below
    gmx = np.array([A, B], dtype='uint8').reshape(2, len(A), len(sbj))

    # MAF fixing: flip the coding of every variant whose coded allele shows
    # up on more than half of the 2 * n_sample chromosome copies
    flip = gmx.sum((0, 2)) > gmx.shape[2]
    gmx[:, flip, :] = 1 - gmx[:, flip, :]

    # Allele order fix, make sure copy(a) >= copy(b)
    swap = gmx[0, :, :] < gmx[1, :, :]
    gmx[0, :, :][swap] = 1
    gmx[1, :, :][swap] = 0

    # dim_0: sample index; dim_1: copy number; dim_2: variant index
    gmx = gmx.transpose(2, 0, 1)
    return gmx, sbj
Exemplo n.º 5
0
    def __init__(self, vgz, chm=None, bp0=0, bp1=None, wnd=1024, dsg='012'):
        """
        Open a bgzipped VCF file, restricted to a region of one chromosome.

        vgz: bgzipped VCF file, with tabix index.
        chm: the chromosome. When given, it is translated through the CKY
        lookup table; when None, the chromosome of the first record in the
        file is used.
        bp0: the starting basepair, 0 based, inclusive. It is taken modulo
        the contig length.
        bp1: the ending basepair, 0 based, exclusive. None or 0 selects the
        contig length; a negative value is counted back from it.
        wnd: stored as given (default=1024).
        dsg: stored as given (default='012').
        """
        # record the parameters
        self.VGZ = vgz

        # find the first locus in the VCF file, using a throw-away reader so
        # the reader kept below is not advanced
        gv1 = next(vcfR(filename=vgz))

        # VGZ reader point to the *.vcf.gz
        vgz = vcfR(filename=vgz)

        # the chromosome and its length
        chm = gv1.CHROM if chm is None else CKY[chm]
        cln = vgz.contigs[chm].length
        self.CHR = int(chm)

        # starting position, wrapped by the contig length (for a positive
        # length, a negative bp0 therefore counts back from the end)
        bp0 = bp0 % cln

        # ending position
        if not bp1:
            bp1 = cln
        if bp1 < 0:
            bp1 = cln + bp1

        # restrict VCF range
        vgz = vgz.fetch(chm, bp0, bp1)

        # private members
        self.__chm__ = chm
        self.__vgz__ = vgz  # vcf reader
        self.__bp0__ = bp0  # starting position
        self.__bp1__ = bp1  # ending position
        self.__wnd__ = wnd
        self.__dsg__ = dsg
        # NOTE(review): __pos__ used to be set to bp0 ("the pointer") and
        # then immediately replaced by this list; only the value that
        # survived is assigned. The start position remains in __bp0__.
        self.__pos__ = []
        self.__vid__ = []