Example #1
File: data.py Project: kawasakin/foo
    def asbatches(self, batchsize=64, reshuffle=False):
        n = len(self)
        assert n > 0
        nbatch = (n + batchsize - 1) // batchsize
        nseq = len(self.sequencenames)
        padding = self.requirements.get('padding', 0)
        batches = []
        for i in range(nbatch):

            # Slice our data attributes row-wise, according to the batch index
            batch = self[np.arange(i * batchsize, min(n, (i + 1) * batchsize))]
            batch._insert_reversecomplements()

            # Convert each sequence attribute from a list of strings ("GATC") to a
            # single contiguous numpy array X (0..3), along with a list of
            # regions R that identify the batch-relative offsets to the start/end
            # of each individual sequence
            for j in range(nseq):
                Xname, Rname = self._seqattrnames(j)
                batchX = getattr(batch, Xname)
                batchR = np.asarray(
                    np.cumsum([0] + [padding + len(x) for x in batchX]),
                    np.uint32).reshape((-1, 1))
                batchR = np.hstack([batchR[:-1], batchR[1:]])

                # Convert list of strings to giant contiguous array of integers 0..3,
                # with padding values of 255 put between the individual sequences
                batchX = acgt2ord(
                    ("." * padding).join([""] + [x for x in batchX] +
                                         [""])).reshape((-1, 1))

                # Convert each batch from numpy array to sarray,
                # and then quickly forget about the numpy batch
                batchX = sm.asarray(batchX)
                batchR = sm.asarray(batchR)

                setattr(batch, Xname, batchX)
                setattr(batch, Rname, batchR)
                setattr(batch, "regions", batchR)
                batch._data_attrs = batch._data_attrs + ("regions", )

            if hasattr(batch, "F") and batch.F is not None:
                batch.F = sm.asarray(batch.F, sm.get_default_dtype())
            if hasattr(batch, "Y") and batch.Y is not None:
                batch.Y = sm.asarray(batch.Y, sm.get_default_dtype())
                if isinstance(batch.Ymask, np.ndarray):
                    batch.Ymask = sm.asarray(batch.Ymask)

            batches.append(batch)
        return deepity.shuffled_repeat_iter(batches, reshuffle)
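The subtle part of asbatches is the bookkeeping: each batch stores one contiguous ordinal array X plus a regions array R of (start, end) offsets. Below is a minimal numpy sketch of that encoding, with a hypothetical stand-in for acgt2ord (the real one lives elsewhere in this project):

    import numpy as np

    def acgt2ord_demo(s):
        # Stand-in for acgt2ord: maps A,C,G,T -> 0..3 and the '.' padding -> 255
        lut = {'A': 0, 'C': 1, 'G': 2, 'T': 3, '.': 255}
        return np.asarray([lut[c] for c in s], dtype=np.uint8)

    seqs = ["GATC", "ACG", "TTTTT"]
    padding = 2

    # Boundaries: R[k] = (start, end) of sequence k, built as in asbatches above
    bounds = np.asarray(np.cumsum([0] + [padding + len(s) for s in seqs]),
                        np.uint32).reshape((-1, 1))
    R = np.hstack([bounds[:-1], bounds[1:]])

    # One contiguous byte array, '.'-padded before and between the sequences
    X = acgt2ord_demo(("." * padding).join([""] + seqs + [""])).reshape((-1, 1))

    # Each sequence k sits at X[a+padding : b], where (a, b) = R[k]
    for (a, b), s in zip(R, seqs):
        assert (X[a + padding:b].ravel() == acgt2ord_demo(s)).all()

This matches how the prediction code later recovers each sequence via X[a+pad:b].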
Example #2
File: elemwise.py Project: kawasakin/foo
    def _fprop(self, X):
        if X is None:
            return None
        if np.isscalar(self.rate):
            if self.rate == 0:
                return X
            # Promote a scalar rate to a per-instance sarray once, in X's dtype
            self.rate = sm.asarray([self.rate for i in range(self.ninst)],
                                   dtype=X.dtype)
        elif not isinstance(self.rate, sm.sarray):
            self.rate = sm.asarray(self.rate, dtype=X.dtype)

        if "train_mode" in globals.flags:
            # Training path: returns dropped-out Z and the mask M for bprop
            Z, self.M = _ext.dropout_fp_train(X, self.rate,
                                              "reverse_complement" in globals.flags)
        else:
            # Test path: no mask is kept
            Z = _ext.dropout_fp_test(X, self.rate)
        return Z
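_ext.dropout_fp_train / dropout_fp_test are compiled kernels, so their exact convention is not visible here. As a hedged reference, the standard inverted-dropout forward pass they plausibly correspond to looks like this in plain numpy:

    import numpy as np

    def dropout_fp_train(X, rate, rng=None):
        # Zero each unit with probability `rate`, scaling survivors by
        # 1/(1-rate) so expected activations match the test-time path
        rng = rng or np.random.default_rng()
        M = rng.random(X.shape) >= rate
        return X * M / (1.0 - rate), M

    def dropout_fp_test(X, rate):
        # With inverted dropout, the test-time path is the identity
        return X

The mask M is returned alongside Z so that a backward pass can zero the gradients of exactly the units that were dropped.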
Example #3
    def _fprop(self, X, R):
        if deepity.globals.flags.get("collect_featuremaps", False):
            old = deepity.globals.flags.pop("collect_featuremaps")
            fmaps = old if isinstance(old, list) else []
            fmaps += self._collect_featuremaps(X, R)
            deepity.globals.flags.push("collect_featuremaps", fmaps)

        # Max-pool X within each region; keep the argmax indices I when a
        # backward pass will need them
        Z, self.I = kangaroo_smat.poolrgn(X, R, ptype="max",
                                          want_argmax="bprop_mode" in deepity.globals.flags)

        if "collect_argmax" in deepity.globals.flags:
            deepity.globals.flags.pop("collect_argmax")
            deepity.globals.flags.push("collect_argmax", self.I.asnumpy())
        elif "force_argmax" in deepity.globals.flags:
            # Re-use externally supplied argmax indices instead of our own pooling
            _I = deepity.globals.flags.get("force_argmax")
            _X = X.asnumpy()
            _Z = _X.ravel()[_I.ravel()].reshape(Z.shape)
            Z = sm.asarray(_Z)

        return Z
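kangaroo_smat.poolrgn runs on the GPU; as a reference for what the max variant computes, here is a numpy sketch, assuming X is (total_len, nfilters), R holds (start, end) rows, and I holds flat indices into X.ravel() (an assumption, but one consistent with the force_argmax branch above):

    import numpy as np

    def poolrgn_max(X, R, want_argmax=False):
        ncol = X.shape[1]
        Z = np.empty((len(R), ncol), dtype=X.dtype)
        I = np.empty((len(R), ncol), dtype=np.int64)
        for k, (a, b) in enumerate(R):
            rows = a + X[a:b].argmax(axis=0)      # winning row per filter
            Z[k] = X[a:b].max(axis=0)
            I[k] = rows * ncol + np.arange(ncol)  # flat indices into X.ravel()
        return Z, (I if want_argmax else None)

    # Round trip matching the force_argmax branch above
    X = np.random.randn(10, 3)
    Z, I = poolrgn_max(X, [(0, 4), (4, 10)], want_argmax=True)
    assert (X.ravel()[I.ravel()].reshape(Z.shape) == Z).all()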
Example #4
File: predict.py Project: kawasakin/foo
def gen_convnet_predictions(model, data, want_gmaps=False):
    # We must feed each sequence through the model several times, sliding a
    # window along the sequence and applying the model to each window.
    # That generates a prediction map, from which we can take max, sum, etc.
    predictions = []
    gmaps = {}
    batches = data.asbatches(batchsize=2048, reshuffle=False)
    for batch in batches:
        args = batch.input_data()
        args["want_bprop_inputs"] = bool(want_gmaps)
        if isinstance(model.Z.origin().node,deepity.std.softmaxnode):
            args["bprop_inputs_loss"] = deepity.std.nll()
        else:
            args["bprop_inputs_loss"] = deepity.std.mse()
        globals.flags.push("collect_argmax",None)
        outputs = model.eval(**args)
        I = globals.flags.pop("collect_argmax")
        Z = outputs['Z'].asnumpy()
        Z, Zmask = maskout_revcomp(Z)
        if Zmask is not None:
            if "collect_Zmask" in globals.flags:
                global_Zmask = globals.flags.pop("collect_Zmask")
                if not isinstance(global_Zmask,np.ndarray):
                    global_Zmask = Zmask
                else:
                    global_Zmask = np.vstack([global_Zmask, Zmask])
                globals.flags.push("collect_Zmask", global_Zmask)
        predictions.append(Z)

        # If the user wants gradient maps, we need one for every sequence
        if want_gmaps:
            for key in args:
                dkey = "d"+key
                if outputs.get(dkey,None) is not None:
                    X = args[key].asnumpy()
                    dX = outputs[dkey].asnumpy()
                    if X.dtype == np.uint8: # Is it a sequence of ordinals (bytes)?
                        pad = data.requirements.get("padding",0)
                        R = args["R"+key[1:]].asnumpy() # regions associated with X
                        
                        if want_gmaps == "finite_diff":
                            is_rc = "reverse_complement" in globals.flags
                            #globals.flags.push("force_argmax",I)
                            rcindex = [3,2,1,0]
                            oldF = args['F']
                            # If user specifically asked for finite differences, not instantaneous gradient,
                            # then we need to explicitly mutate every position, generate predictions, and
                            # subtract the result from Z to find the actual delta for each base
                            Xlen = R[:,1]-R[:,0]
                            nbase = dX.shape[1]
                            for i in range(Xlen.max()):
                                for j in range(nbase):
                                    mtX = X.copy()
                                    mtF = args['F'].asnumpy().copy()
                                    for k in range(len(R)):
                                        a,b = R[k]
                                        if i < b-a:
                                            if (k % 2 == 0) or not is_rc:
                                                mtX[pad+a+i] = j  # mutate position i in sequence k (which starts at byte index a) to base j
                                            else:
                                                mtX[b-i-1] = rcindex[j]
                                        mtF[k] = data._generate_dinuc_featurevec(mtX[pad+a:b])
                                        
                                    args[key] = sm.asarray(mtX) # This time use the mutated X instead of the original
                                    args['F'] = sm.asarray(mtF)
                                    mtoutputs = model.eval(**args)
                                    mtZ = mtoutputs['Z'].asnumpy()
                                    mtZ, mtZmask = maskout_revcomp(mtZ)
                                    dZ = mtZ-Z # change in output caused by this mutation
                                    dZ *= np.maximum(0,np.sign(np.maximum(Z,mtZ)))
                                    for k in range(len(R)):
                                        if (k % 2 == 0) or not is_rc:
                                            a,b = R[k]
                                            if i < b-a:
                                                dX[pad+a+i,j] = dZ[(k//2) if is_rc else k]
                            #globals.flags.pop("force_argmax")
                            args['F'] = oldF
                            
                            # Only include forward strand in finite_diff results
                            if is_rc:
                                dX = [(util.ord2acgt(X[a+pad:b]), dX[a+pad:b]) for a,b in R[np.arange(0,len(R),2)]]
                            else:
                                dX = [(util.ord2acgt(X[a+pad:b]), dX[a+pad:b]) for a,b in R]
                        else:
                            dX = [(util.ord2acgt(X[a+pad:b]), dX[a+pad:b]) for a,b in R]
                            if Zmask is not None:
                                dX = [dX[i] for i in range(len(dX)) if Zmask[i]]

                    else:
                        if Zmask is not None:
                            X = X[Zmask.ravel()]
                            dX = dX[Zmask.ravel()]
                        dX *= np.maximum(0,Z)
                        dX = [(X[i], dX[i]) for i in range(len(dX))]

                    if dkey not in gmaps:
                        gmaps[dkey] = []
                    gmaps[dkey] += dX

    # Stack all per-batch prediction arrays into a single matrix
    predictions = np.vstack(predictions)

    return (predictions, gmaps) if want_gmaps else (predictions, None)
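The finite_diff branch above is an in-silico mutagenesis loop: mutate each position to each base, re-run the model, and record the change in output. Stripped of the reverse-complement pairing, padding, and dinucleotide-feature updates the real code performs, the core idea reduces to this sketch:

    import numpy as np

    def finite_diff_map(predict, X, nbase=4):
        # predict: any callable mapping a 1-D uint8 ordinal sequence to a scalar.
        # Returns dX with dX[i, j] = predict(X mutated at i to base j) - predict(X)
        z = predict(X)
        dX = np.zeros((len(X), nbase))
        for i in range(len(X)):
            for j in range(nbase):
                mtX = X.copy()
                mtX[i] = j
                dX[i, j] = predict(mtX) - z
        return dX

    # Toy scorer that counts G's (ordinal 2), just to exercise the loop
    toy = lambda x: float((x == 2).sum())
    print(finite_diff_map(np.array([2, 0, 3, 1], dtype=np.uint8), toy))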
Example #5
File: data.py Project: yynst2/DeepBind
    def convert_to_sarray(self):
        # Upload each data attribute to the GPU device as an sarray
        for name in self._data_attrs:
            oldattr = getattr(self, name)
            setattr(self, name, sm.asarray(oldattr))
Example #6
    def _train_setup(self, trainable_plugs, cost):

        # For each trainable plug, figure out how many weights it needs.
        sizes = [np.prod(p.shape) * cost.ninst for p in trainable_plugs]
        offsets = np.asarray(np.cumsum([0] + sizes), np.uint32)

        # Allocate giant contiguous arrays for P, dP, and mP
        P = sm.zeros((offsets[-1], 1))
        dP = sm.zeros_like(P)
        mP = sm.zeros_like(P)

        # Per-instance learn rates / momentum rates go here.
        # Giant contiguous array maps to the same indices as P, dP, mP
        drate = sm.zeros_like(P)
        mrate = sm.zeros_like(P)

        trnodes = []

        # For each plug, create a trainable node that is bound to a chunk of
        # our P (parameter) and dP (gradient) vectors, where the node can
        # read its parameters and accumulate its gradient in place
        for i, tplug in enumerate(trainable_plugs):

            # Grow the actual shape of the trainable parameters, using the
            # axis specified by the trainable plug.
            shape = list(tplug.shape)
            shape[tplug.inst_axis] *= tplug.node.ninst

            # Allocate a new trainable node, and connect it to the plug
            trnode = trainable(P[offsets[i]:offsets[i+1]].reshape(tuple(shape)),
                               dP[offsets[i]:offsets[i+1]].reshape(tuple(shape)))
            trnode >> tplug
            trnodes.append(trnode)

            # Assign instance-specific learning rates and momentum rates
            # to each corresponding element in the giant drate/mrate vectors
            if tplug.inst_axis == 0:
                k = np.prod(tplug.shape)
            else:
                k = tplug.shape[1]
            dratevec = drate[offsets[i]:offsets[i+1]]
            mratevec = mrate[offsets[i]:offsets[i+1]]
            _ext.madd_bcast(sm.ones_like(dratevec), self.rate, k, dratevec)
            _ext.madd_bcast(sm.ones_like(mratevec), self.momentum, k, mratevec)

            # Also initialize elements of P based on the trainable plug's initialization scale,
            # which can be different for each individual instance
            Pvec = P[offsets[i]:offsets[i+1]]
            initval = tplug.origin().node.init
            if isinstance(initval, np.ndarray) and initval.ndim == 3:
                # Specific initialization of individual filters
                Pvec[:] = sm.asarray(np.require(np.rollaxis(initval, 1),
                                                requirements="C").reshape((-1, 1)))
            else:
                # Random initialization
                _ext.madd_bcast(sm.randn(Pvec.shape[0], Pvec.shape[1]),
                                initval, k, Pvec)

            if hasattr(tplug.origin().node, 'init_mu'):
                initmu_val = tplug.origin().node.init_mu
                if isinstance(initmu_val, list):
                    # Specific initialization of individual bias elements
                    initmu_val = np.tile(initmu_val, tplug.origin().node.ninst)
                    Pvec[:] = sm.asarray(initmu_val).reshape(Pvec.shape)
                else:
                    _ext.madd_bcast(sm.ones_like(Pvec),
                                    tplug.origin().node.init_mu, k, Pvec)  # Add shift

        return (P, dP, mP, drate, mrate, trnodes)
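The layout trick in _train_setup is that every trainable node's weights are views into one contiguous vector, so a single vectorized update touches all parameters at once. A numpy stand-in for the GPU-side P/dP slicing (sm.zeros and trainable are smat/deepity objects in the real code):

    import numpy as np

    shapes = [(4, 3), (3, 1), (3, 2)]        # per-plug parameter shapes
    sizes = [int(np.prod(s)) for s in shapes]
    offsets = np.cumsum([0] + sizes)

    P = np.zeros((offsets[-1], 1))           # one contiguous parameter vector
    views = [P[offsets[i]:offsets[i + 1]].reshape(shapes[i])
             for i in range(len(shapes))]

    views[0][:] = 1.0                        # a node writes through its view...
    P -= 0.1                                 # ...and one global SGD-style step
    assert np.isclose(views[0][0, 0], 0.9)   # both touch the same memory

Because each reshape of a contiguous slice is a view, no copies are made: the optimizer updates P (and reads dP) wholesale, while each node sees only its own chunk in its natural shape.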