Пример #1
0
    def read_with_specification(snpset_withbbed,
                                order="F",
                                dtype=SP.float64,
                                force_python_only=False):
        # doesn't need to self.run_once() because it is static
        decide_once_on_plink_reader()
        global WRAPPED_PLINK_PARSER_PRESENT

        bed = snpset_withbbed.bed
        iid_count_in, iid_count_out, iid_index_out, snp_count_in, snp_count_out, snp_index_out = bed.counts_and_indexes(
            snpset_withbbed)

        if WRAPPED_PLINK_PARSER_PRESENT and not force_python_only:
            from pysnptools.snpreader import wrap_plink_parser
            SNPs = SP.zeros((iid_count_out, snp_count_out),
                            order=order,
                            dtype=dtype)
            bed_fn = bed.basefilename + ".bed"

            if dtype == SP.float64:
                if order == "F":
                    wrap_plink_parser.readPlinkBedFiledoubleFAAA(
                        bed_fn, iid_count_in, snp_count_in, iid_index_out,
                        snp_index_out, SNPs)
                elif order == "C":
                    wrap_plink_parser.readPlinkBedFiledoubleCAAA(
                        bed_fn, iid_count_in, snp_count_in, iid_index_out,
                        snp_index_out, SNPs)
                else:
                    raise Exception(
                        "order '{0}' not known, only 'F' and 'C'".format(
                            order))
            elif dtype == SP.float32:
                if order == "F":
                    wrap_plink_parser.readPlinkBedFilefloatFAAA(
                        bed_fn, iid_count_in, snp_count_in, iid_index_out,
                        snp_index_out, SNPs)
                elif order == "C":
                    wrap_plink_parser.readPlinkBedFilefloatCAAA(
                        bed_fn, iid_count_in, snp_count_in, iid_index_out,
                        snp_index_out, SNPs)
                else:
                    raise Exception(
                        "dtype '{0}' not known, only float64 and float32".
                        format(dtype))

        else:
            # An earlier version of this code had a way to read consecutive SNPs of code in one read. May want
            # to add that ability back to the code.
            # Also, note that reading with python will often result in non-contigious memory, so the python standardizers will automatically be used, too.
            logging.warn(
                "using pure python plink parser (might be much slower!!)")
            SNPs = SP.zeros(
                ((int(SP.ceil(0.25 * iid_count_in)) * 4), snp_count_out),
                order=order,
                dtype=dtype)  #allocate it a little big
            for SNPsIndex, bimIndex in enumerate(snpset_withbbed):

                startbit = int(SP.ceil(0.25 * iid_count_in) * bimIndex + 3)
                bed._filepointer.seek(startbit)
                nbyte = int(SP.ceil(0.25 * iid_count_in))
                bytes = SP.array(bytearray(
                    bed._filepointer.read(nbyte))).reshape(
                        (int(SP.ceil(0.25 * iid_count_in)), 1), order='F')

                SNPs[3::4, SNPsIndex:SNPsIndex + 1][bytes >= 64] = SP.nan
                SNPs[3::4, SNPsIndex:SNPsIndex + 1][bytes >= 128] = 1
                SNPs[3::4, SNPsIndex:SNPsIndex + 1][bytes >= 192] = 2
                bytes = SP.mod(bytes, 64)
                SNPs[2::4, SNPsIndex:SNPsIndex + 1][bytes >= 16] = SP.nan
                SNPs[2::4, SNPsIndex:SNPsIndex + 1][bytes >= 32] = 1
                SNPs[2::4, SNPsIndex:SNPsIndex + 1][bytes >= 48] = 2
                bytes = SP.mod(bytes, 16)
                SNPs[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 4] = SP.nan
                SNPs[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 8] = 1
                SNPs[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 12] = 2
                bytes = SP.mod(bytes, 4)
                SNPs[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 1] = SP.nan
                SNPs[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 2] = 1
                SNPs[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 3] = 2
            SNPs = SNPs[
                iid_index_out, :]  #reorder or trim any extra allocation

        ret = {
            'rs': bed.rs[snp_index_out],
            'pos': bed.pos[snp_index_out, :],
            'snps': SNPs,
            'iid': bed.original_iids[iid_index_out, :]
        }
        return ret
Пример #2
0
    def read_with_specification(snpset_withbbed, order="F", dtype=SP.float64, force_python_only=False):
        # doesn't need to self.run_once() because it is static
        decide_once_on_plink_reader()
        global WRAPPED_PLINK_PARSER_PRESENT

        bed = snpset_withbbed.bed
        iid_count_in, iid_count_out, iid_index_out, snp_count_in, snp_count_out, snp_index_out = bed.counts_and_indexes(snpset_withbbed)

        if WRAPPED_PLINK_PARSER_PRESENT and not force_python_only:
            from pysnptools.snpreader import wrap_plink_parser
            SNPs = SP.zeros((iid_count_out, snp_count_out), order=order, dtype=dtype)
            bed_fn = bed.basefilename + ".bed"

            if dtype == SP.float64:
                if order=="F":
                    wrap_plink_parser.readPlinkBedFiledoubleFAAA(bed_fn, iid_count_in, snp_count_in, iid_index_out, snp_index_out, SNPs)
                elif order=="C":
                    wrap_plink_parser.readPlinkBedFiledoubleCAAA(bed_fn, iid_count_in, snp_count_in, iid_index_out, snp_index_out, SNPs)
                else:
                    raise Exception("order '{0}' not known, only 'F' and 'C'".format(order));
            elif dtype == SP.float32:
                if order=="F":
                    wrap_plink_parser.readPlinkBedFilefloatFAAA(bed_fn, iid_count_in, snp_count_in, iid_index_out, snp_index_out, SNPs)
                elif order=="C":
                    wrap_plink_parser.readPlinkBedFilefloatCAAA(bed_fn, iid_count_in, snp_count_in, iid_index_out, snp_index_out, SNPs)
                else:
                    raise Exception("dtype '{0}' not known, only float64 and float32".format(dtype))
            
        else:
            # An earlier version of this code had a way to read consecutive SNPs of code in one read. May want
            # to add that ability back to the code. 
            # Also, note that reading with python will often result in non-contigious memory, so the python standardizers will automatically be used, too.       
            logging.warn("using pure python plink parser (might be much slower!!)")
            SNPs = SP.zeros(((int(SP.ceil(0.25*iid_count_in))*4),snp_count_out),order=order, dtype=dtype) #allocate it a little big
            for SNPsIndex, bimIndex in enumerate(snpset_withbbed):

                startbit = int(SP.ceil(0.25*iid_count_in)*bimIndex+3)
                bed._filepointer.seek(startbit)
                nbyte = int(SP.ceil(0.25*iid_count_in))
                bytes = SP.array(bytearray(bed._filepointer.read(nbyte))).reshape((int(SP.ceil(0.25*iid_count_in)),1),order='F')

                SNPs[3::4,SNPsIndex:SNPsIndex+1][bytes>=64]=SP.nan
                SNPs[3::4,SNPsIndex:SNPsIndex+1][bytes>=128]=1
                SNPs[3::4,SNPsIndex:SNPsIndex+1][bytes>=192]=2
                bytes=SP.mod(bytes,64)
                SNPs[2::4,SNPsIndex:SNPsIndex+1][bytes>=16]=SP.nan
                SNPs[2::4,SNPsIndex:SNPsIndex+1][bytes>=32]=1
                SNPs[2::4,SNPsIndex:SNPsIndex+1][bytes>=48]=2
                bytes=SP.mod(bytes,16)
                SNPs[1::4,SNPsIndex:SNPsIndex+1][bytes>=4]=SP.nan
                SNPs[1::4,SNPsIndex:SNPsIndex+1][bytes>=8]=1
                SNPs[1::4,SNPsIndex:SNPsIndex+1][bytes>=12]=2
                bytes=SP.mod(bytes,4)
                SNPs[0::4,SNPsIndex:SNPsIndex+1][bytes>=1]=SP.nan
                SNPs[0::4,SNPsIndex:SNPsIndex+1][bytes>=2]=1
                SNPs[0::4,SNPsIndex:SNPsIndex+1][bytes>=3]=2
            SNPs = SNPs[iid_index_out,:] #reorder or trim any extra allocation

        ret = {
                'rs'     :bed.rs[snp_index_out],
                'pos'    :bed.pos[snp_index_out,:],
                'snps'   :SNPs,
                'iid'    :bed.original_iids[iid_index_out,:]
                }
        return ret
Пример #3
0
    def _read(self, iid_index_or_none, sid_index_or_none, order, dtype, force_python_only, view_ok):
        self._run_once()
        
        if order=='A':
            order='F'

        assert not hasattr(self, 'ind_used'), "A SnpReader should not have a 'ind_used' attribute"

        iid_count_in = self.iid_count
        sid_count_in = self.sid_count

        if iid_index_or_none is not None:
            iid_count_out = len(iid_index_or_none)
            iid_index_out = iid_index_or_none
        else:
            iid_count_out = iid_count_in
            iid_index_out = range(iid_count_in)

        if sid_index_or_none is not None:
            sid_count_out = len(sid_index_or_none)
            sid_index_out = sid_index_or_none
        else:
            sid_count_out = sid_count_in
            sid_index_out = range(sid_count_in)

        if not force_python_only:
            from pysnptools.snpreader import wrap_plink_parser
            val = np.zeros((iid_count_out, sid_count_out), order=order, dtype=dtype)
            bed_fn = SnpReader._name_of_other_file(self.filename,"bed","bed")

            if dtype == np.float64:
                if order=="F":
                    wrap_plink_parser.readPlinkBedFiledoubleFAAA(bed_fn, iid_count_in, sid_count_in, iid_index_out, sid_index_out, val)
                elif order=="C":
                    wrap_plink_parser.readPlinkBedFiledoubleCAAA(bed_fn, iid_count_in, sid_count_in, iid_index_out, sid_index_out, val)
                else:
                    raise Exception("order '{0}' not known, only 'F' and 'C'".format(order));
            elif dtype == np.float32:
                if order=="F":
                    wrap_plink_parser.readPlinkBedFilefloatFAAA(bed_fn, iid_count_in, sid_count_in, iid_index_out, sid_index_out, val)
                elif order=="C":
                    wrap_plink_parser.readPlinkBedFilefloatCAAA(bed_fn, iid_count_in, sid_count_in, iid_index_out, sid_index_out, val)
                else:
                    raise Exception("order '{0}' not known, only 'F' and 'C'".format(order));
            else:
                raise Exception("dtype '{0}' not known, only float64 and float32".format(dtype))
            
        else:
            # An earlier version of this code had a way to read consecutive SNPs of code in one read. May want
            # to add that ability back to the code. 
            # Also, note that reading with python will often result in non-contiguous memory, so the python standardizers will automatically be used, too.       
            logging.warn("using pure python plink parser (might be much slower!!)")
            val = np.zeros(((int(np.ceil(0.25*iid_count_in))*4),sid_count_out),order=order, dtype=dtype) #allocate it a little big
            for SNPsIndex, bimIndex in enumerate(sid_index_out):

                startbit = int(np.ceil(0.25*iid_count_in)*bimIndex+3)
                self._filepointer.seek(startbit)
                nbyte = int(np.ceil(0.25*iid_count_in))
                bytes = np.array(bytearray(self._filepointer.read(nbyte))).reshape((int(np.ceil(0.25*iid_count_in)),1),order='F')

                val[3::4,SNPsIndex:SNPsIndex+1][bytes>=64]=np.nan
                val[3::4,SNPsIndex:SNPsIndex+1][bytes>=128]=1
                val[3::4,SNPsIndex:SNPsIndex+1][bytes>=192]=2
                bytes=np.mod(bytes,64)
                val[2::4,SNPsIndex:SNPsIndex+1][bytes>=16]=np.nan
                val[2::4,SNPsIndex:SNPsIndex+1][bytes>=32]=1
                val[2::4,SNPsIndex:SNPsIndex+1][bytes>=48]=2
                bytes=np.mod(bytes,16)
                val[1::4,SNPsIndex:SNPsIndex+1][bytes>=4]=np.nan
                val[1::4,SNPsIndex:SNPsIndex+1][bytes>=8]=1
                val[1::4,SNPsIndex:SNPsIndex+1][bytes>=12]=2
                bytes=np.mod(bytes,4)
                val[0::4,SNPsIndex:SNPsIndex+1][bytes>=1]=np.nan
                val[0::4,SNPsIndex:SNPsIndex+1][bytes>=2]=1
                val[0::4,SNPsIndex:SNPsIndex+1][bytes>=3]=2
            val = val[iid_index_out,:] #reorder or trim any extra allocation


            #!!LATER this can fail because the trim statement above messes up the order
            #assert(SnpReader._array_properties_are_ok(val, order, dtype)) #!!

        return val