示例#1
0
    def make_H(self,a_active_H=10,b_active_H=0.1,n=100):
        """
        Make the matrix H (self.K times self.N) together with the sample labels.

        a_active_H: scalar, shape parameter of Gamma distribution.
        b_active_H: scalar, rate parameter of Gamma distribution.
        n: scalar, tuple, list or numpy.ndarray. A scalar is the number of
           examples for each of the self.V classes; a sequence gives the
           number of examples of every class individually.

        Returns the tuple (self.H, self.n), where self.n is the label vector
        produced by cl.factor_sizes_to_factor_labels.
        """
        self.a_active_H=a_active_H
        self.b_active_H=b_active_H
        # A numpy vector of sizes is handled like a list instead of being
        # replicated as if it were a scalar.
        if isinstance(n,numpy.ndarray):
            n=n.tolist()
        if isinstance(n,(tuple,list)):
            self.n_list=n # e.g. (2,3,4,3) to [0,0,1,1,1,2,2,2,2,3,3,3]
        else:
            self.n_list=[n]*(self.V) # e.g. 3 to [0,0,0,1,1,1,2,2,2,3,3,3]
        self.n=cl.factor_sizes_to_factor_labels(self.n_list,start=0)
        self.N=len(self.n) # number of samples
        self.C,_=cl.membership_vector_to_indicator_matrix(self.n) # class membership matrix, N times V
        # Prepend a row of ones, then use the result as a boolean mask, V+1 times N.
        self.C=numpy.vstack((numpy.ones(shape=(1,self.N),dtype=int),numpy.transpose(self.C)))
        self.C=numpy.asarray(self.C,dtype=bool)
        self.Lambda_H=numpy.zeros(shape=(self.V+1,self.N),dtype=float)
        # 1.0/... keeps the scale a float even if an integer rate is passed
        # and the module runs with Python 2 integer division.
        ls=self.rng.gamma(shape=self.a_active_H, scale=1.0/self.b_active_H, size=(self.V+1,self.N))
        self.Lambda_H[self.C]=ls[self.C]
        self.Lambda_H_ext=numpy.dot(self.Z,self.Lambda_H) # K times N
        KN=numpy.dot(self.Z,self.C)
        self.H=numpy.zeros(shape=(self.K,self.N),dtype=float)
        # numpy.nonzero lists the active entries in row-major order, i.e. the
        # same order in which a nested k/sample loop would draw them, so the
        # random stream is consumed identically.
        for k,j in zip(*numpy.nonzero(numpy.asarray(KN))):
            self.H[k,j]=self.rng.exponential(scale=1.0/self.Lambda_H_ext[k,j],size=None)

        self.classes=self.n
        return self.H,self.n
示例#2
0
    def make_W(self,a_active_W=10,b_active_W=1000):
        """
        Make the real basis matrix W.

        a_active_W: scalar, shape parameter of Gamma distribution.
        b_active_W: scalar, rate parameter of Gamma distribution.

        Returns the tuple (self.W, self.features): W has self.M rows and
        self.K columns, and self.features holds one name per row built as
        "<binary pattern>_<index within pattern>".
        """
        self.a_active_W=a_active_W
        self.b_active_W=b_active_W
        self.ubc=unique_binary_code.unique_binary_code(self.V+1)
        self.ubc.generate_binary_code()
        self.s_str=utility.convert_each_row_of_matrix_to_a_string(self.ubc.s,sep="")
        self.num_patterns=len(self.s_str)
        self.M=self.m*self.num_patterns
        self.W=numpy.zeros(shape=(self.M,self.K),dtype=float)
        self.Lambda_W=numpy.zeros(shape=(self.M,self.V+1),dtype=float)
        # 1.0/... avoids a zero scale: with the integer default b_active_W=1000,
        # 1/b_active_W is 0 under Python 2 integer division.
        ls=self.rng.gamma(shape=self.a_active_W, scale=1.0/self.b_active_W, size=(self.M,self.V+1))
        # One label per feature row, self.m consecutive rows per pattern
        # (presumably labels start at -1 by default -- confirm in cl).
        mp=cl.factor_sizes_to_factor_labels([self.m]*self.num_patterns)
        MP,_=cl.membership_vector_to_indicator_matrix(mp)
        self.S=numpy.dot(MP,self.ubc.s) # extend binary codes, M times V+1
        self.S=numpy.asarray(self.S,dtype=bool)
        self.Lambda_W[self.S]=ls[self.S]
        self.features=["features"]*self.M # names of features
        self.feature_patterns=["feature_patterns"]*self.M # pattern of features
        self.feature_patterns_matrix=numpy.zeros(shape=(self.M,self.V+1),dtype=int)
        # list(...) keeps the repetition valid where range() is not a list.
        fs=list(range(self.m))*self.num_patterns # [0,1,2,0,1,2,0,1,2,...,0,1,2]
        # Loop-invariant conversion, done once before the loop.
        self.Z=numpy.asarray(self.Z,dtype=int)
        for i in range(self.M):
            code=numpy.asarray(self.S[i,:],dtype=int) # V+1 entries
            code_ext=numpy.asarray(self.Z.dot(code),dtype=bool) # K entries
            pattern="".join(str(bit) for bit in code)
            self.features[i]=pattern+"_"+str(fs[i])
            self.feature_patterns[i]=pattern
            self.feature_patterns_matrix[i,:]=code
            # One exponential draw per active view, in view order.
            parts=[self.rng.exponential(scale=1.0/self.Lambda_W[i,v],size=self.z_list[v])
                   for v in range(self.V+1) if self.S[i,v]]
            # A row without any active view keeps its zeros.
            if parts:
                self.W[i,code_ext]=numpy.concatenate(parts)

        return self.W,self.features
示例#3
0
    def make_H(self, a_active_H=10, b_active_H=0.1, n=100):
        """
        Make the matrix H (self.K times self.N) together with the sample labels.

        a_active_H: scalar, shape parameter of Gamma distribution.
        b_active_H: scalar, rate parameter of Gamma distribution.
        n: scalar, tuple, list or numpy.ndarray. A scalar is the number of
           examples for each of the self.V classes; a sequence gives the
           number of examples of every class individually.

        Returns the tuple (self.H, self.n), where self.n is the label vector
        produced by cl.factor_sizes_to_factor_labels.
        """
        self.a_active_H = a_active_H
        self.b_active_H = b_active_H
        # A numpy vector of sizes is handled like a list instead of being
        # replicated as if it were a scalar.
        if isinstance(n, numpy.ndarray):
            n = n.tolist()
        if isinstance(n, (tuple, list)):
            self.n_list = n  # e.g. (2,3,4,3) to [0,0,1,1,1,2,2,2,2,3,3,3]
        else:
            self.n_list = [n] * (self.V)  # e.g. 3 to [0,0,0,1,1,1,2,2,2,3,3,3]
        self.n = cl.factor_sizes_to_factor_labels(self.n_list, start=0)
        self.N = len(self.n)  # number of samples
        self.C, _ = cl.membership_vector_to_indicator_matrix(
            self.n)  # class membership matrix, N times V
        # Prepend a row of ones, then use the result as a boolean mask,
        # V+1 times N.
        self.C = numpy.vstack(
            (numpy.ones(shape=(1, self.N), dtype=int),
             numpy.transpose(self.C)))
        self.C = numpy.asarray(self.C, dtype=bool)
        self.Lambda_H = numpy.zeros(shape=(self.V + 1, self.N), dtype=float)
        # 1.0 / ... keeps the scale a float even if an integer rate is passed
        # and the module runs with Python 2 integer division.
        ls = self.rng.gamma(shape=self.a_active_H,
                            scale=1.0 / self.b_active_H,
                            size=(self.V + 1, self.N))
        self.Lambda_H[self.C] = ls[self.C]
        self.Lambda_H_ext = numpy.dot(self.Z, self.Lambda_H)  # K times N
        KN = numpy.dot(self.Z, self.C)
        self.H = numpy.zeros(shape=(self.K, self.N), dtype=float)
        # numpy.nonzero lists the active entries in row-major order, i.e. the
        # same order in which a nested k/sample loop would draw them, so the
        # random stream is consumed identically.
        for k, j in zip(*numpy.nonzero(numpy.asarray(KN))):
            self.H[k, j] = self.rng.exponential(
                scale=1.0 / self.Lambda_H_ext[k, j], size=None)

        self.classes = self.n
        return self.H, self.n
示例#4
0
 def __init__(self,z=3,V=3,m=3,rng=numpy.random.RandomState(1000)):
     """
     z: integer,tuple, list, or numpy.ndarray, the number of hidden factors for each view;
        an integer gives every one of the V+1 label groups the same number of factors.
     V: integer, number of views.
     m: integer, number of features for each pattern.
     rng: random state.
          NOTE: the default RandomState is created once, when the function is
          defined, so every instance built without an explicit rng shares it.
     """
     self.V=V
     self.m=m
     self.rng=rng

     # The docstring allows a numpy vector of sizes; treat it like a list
     # instead of replicating it as if it were a scalar.
     if isinstance(z,numpy.ndarray):
         z=z.tolist()
     if isinstance(z,(tuple,list)):
         self.z_list=z # e.g. (3,3,3,3) to [-1,-1,-1,0,0,0,1,1,1,2,2,2]
     else:
         self.z_list=[z]*(self.V+1) # e.g. 3 to [-1,-1,-1,0,0,0,1,1,1,2,2,2]
     self.z=cl.factor_sizes_to_factor_labels(self.z_list)
     self.K=len(self.z) # number of latent factors
     self.Z,self.z_unique=cl.membership_vector_to_indicator_matrix(self.z) # binary, size K by V+1, self.Z[k,u]=1 indicates the k-th factor in class u.
示例#5
0
    def __init__(self, z=3, V=3, m=3, rng=numpy.random.RandomState(1000)):
        """
        z: integer,tuple, list, or numpy.ndarray, the number of hidden factors for each view;
           an integer gives every one of the V+1 label groups the same number of factors.
        V: integer, number of views.
        m: integer, number of features for each pattern.
        rng: random state.
             NOTE: the default RandomState is created once, when the function
             is defined, so every instance built without an explicit rng
             shares it.
        """
        self.V = V
        self.m = m
        self.rng = rng

        # The docstring allows a numpy vector of sizes; treat it like a list
        # instead of replicating it as if it were a scalar.
        if isinstance(z, numpy.ndarray):
            z = z.tolist()
        if isinstance(z, (tuple, list)):
            # e.g. (3,3,3,3) to [-1,-1,-1,0,0,0,1,1,1,2,2,2]
            self.z_list = z
        else:
            # e.g. 3 to [-1,-1,-1,0,0,0,1,1,1,2,2,2]
            self.z_list = [z] * (self.V + 1)
        self.z = cl.factor_sizes_to_factor_labels(self.z_list)
        self.K = len(self.z)  # number of latent factors
        # binary, size K by V+1, self.Z[k,u]=1 indicates the k-th factor in class u.
        self.Z, self.z_unique = cl.membership_vector_to_indicator_matrix(
            self.z)
示例#6
0
文件: ssmcnmf.py 项目: yifeng-li/mvmf
    def stability_selection(self,z=3,a_0s=[1e2,1e1,1,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,1e-1,1e-2,1e-3],b_0s=[1e2,1e1,1,1e-1,1e-2,1e-3,1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10],a_larges=[1e2,1e1,1,1e-1,1e-2],b_larges=[1e2,1e1,1,1e-1,1e-2],a_small=1e2,b_small=1e-30,ab_tied=True,mean_W=None,mean_H_large=1,mean_H_small=1e-32,num_samplings=1000,max_iter=200,
                            threshold_F=0.1,rank_method="Wilcoxon_rank_sum_test", maxbc=12, key_feature_mean_feature_value_threshold=1,key_feature_neglog10pval_threshold=10,max_num_feature_each_pattern=3,
                         compute_variational_lower_bound=False,variational_lower_bound_min_rate=1e-4,if_plot_lower_bound=False,if_plot_heatmap=False,
                         a_H_test=0.1,b_H_test=1e-10,dir_save="./",prefix="MCNMF_stability_selection",verb=False,rng=numpy.random.RandomState(1000)):
        """
        SS-MV-NMF to obtain the empirical probability matrix.

        INPUTS:
        z: integer, tuple of length V+1, list/numpy vector of size (K,), the labels of the factors (columns) in W. If z is a scalar, it means each view (including the ubi view) has z factors. If z is a tuple, e.g. (3,3,3,3), z[u] means the u-th view has z[u] factors. If z is a list or numpy vector, e.g. [-1,-1,-1,0,0,1,1,1,2,2,2] where -1 means ubi view, z[k] means the k-th factor has label z[k].
        a_0s,b_0s: list or numpy vector, the predefined sets of the shape and rate parameters of Gamma for generating lambda of the exponential distribution of W.
        a_larges,b_larges: list or numpy vector, the predefined sets of the shape and rate parameters of Gamma for generating lambda of the exponential distribution of non-zero H blocks.
        a_small,b_small: float scalar, shape and rate parameters of Gamma for generating lambda of the exponential distribution of zero H blocks.
        ab_tied: bool, whether tie rate b to a. If True, b given in the input of this function is disregarded, and set b_0=mean_W*a_0, b_large=self.mean_H_large*a_large, b_small=self.mean_H_small*a_small.
        mean_W: float scalar, the estimated mean value of W; If None, set it to mean(X).
        mean_H_large: float scalar, the estimated mean value of non-zero blocks in H; If None, set it to 1.
        mean_H_small: float scalar, the estimated mean value of zero blocks in H; If None, set it to 1e-32.
        num_samplings: positive integer, the number of samplings, i.e. the number of independent runs of MV-NMF.
        max_iter: integer, the maximal number of iterations allowed in MV-NMF.
        threshold_F: positive float scalar, used to generate the feature activity indicator matrix: F= F_mean >= (threshold_F * mean(F_mean)).
        rank_method: string, the method to rank the features within a pattern, can be one of {"mean_basis_value","mean_feature_value","Wilcoxon_rank_sum_test"}.
        maxbc: positive integer, the maximal number of views allowed to generate all possible binary codes as feature patterns.
        key_feature_mean_feature_value_threshold: positive integer, the lowest limit of the mean feature values when selecting key features.
        key_feature_neglog10pval_threshold: positive integer, the lower limit of the negative log10(pval) when selecting key features.
        max_num_feature_each_pattern: positive integer, the maximal number of key features allowed to be selected in each pattern.
        compute_variational_lower_bound: bool, whether compute variational lower bounds.
        variational_lower_bound_min_rate: float, a tiny positive number, the threshold of the local mean change rate, below which the algorithm will terminate.
        if_plot_lower_bound: bool, whether plot the variational lower bound plot.
        if_plot_heatmap: bool, whether call plot_heatmap of the fitted model after each run.
        a_H_test,b_H_test: float scalars, forwarded to learn_H_given_X_test_and_E_W for the final update of E_H.
        dir_save: string, path, e.g. "/home/yifeng/research/mf/mvmf_v1_1/results/", path to save the lower bound plot.
        prefix: string, prefix of the saved file name.
        verb: bool, whether plot the information for each iteration, including lower bound.
        rng: random number generator.
             NOTE: the default RandomState and the default lists above are
             created once at definition time and shared between calls; the
             lists are only read here.

        OUTPUTS:
        E_W: numpy matrix, the expected basis matrix W.
        E_H: numpy matrix, the expected coefficient matrix H.
        training_time: total time spent including time of computing lower bound.
        training_time_L: time spent only for computing lower bound.

        RAISES:
        ValueError: if num_samplings is smaller than 1.
        """

        # Without at least one run there is no model to average; fail early
        # with a clear message instead of a division by zero further down.
        if num_samplings<1:
            raise ValueError("num_samplings must be a positive integer.")

        if isinstance(z,(list,tuple,numpy.ndarray)):
            self.z_str="".join(numpy.asarray(z,dtype=str))
        else:
            self.z_str=str(z)
        self.z=z

        # keep the settings as supplied by the caller
        self.a_0s=a_0s
        self.b_0s=b_0s
        self.a_larges=a_larges
        self.b_larges=b_larges
        self.a_small=a_small
        self.b_small=b_small
        self.max_iter=max_iter
        self.threshold_F=threshold_F
        self.compute_variational_lower_bound=compute_variational_lower_bound
        self.variational_lower_bound_min_rate=variational_lower_bound_min_rate
        self.if_plot_lower_bound=if_plot_lower_bound
        self.verb=verb

        self.training_times=[] # factorization time
        self.fs_times=[] # feature selection time
        self.training_times_L=[] # time computing lower bounds
        self.settings=[] # parameter settings
        # running sums over all runs, averaged after the loop
        self.E_W=0
        self.L_W=0
        self.F_mean=0

        # candidate settings (a_0,b_0,a_large,b_large); one is drawn per run
        all_combinations=[]
        if ab_tied:
            if mean_W is None:
                self.mean_X=numpy.mean(self.X)
                self.mean_W=self.mean_X
            else:
                self.mean_W=mean_W
            if mean_H_large is None:
                self.mean_H_large=1
            else:
                self.mean_H_large=mean_H_large
            if mean_H_small is None:
                self.mean_H_small=1e-32
            else:
                self.mean_H_small=mean_H_small
            # Tie b_small to a_small as documented. This used to be assigned
            # to a misspelled, never-read name, so the tie had no effect.
            b_small=self.mean_H_small*a_small
            for a_0 in a_0s:
                for a_large in a_larges:
                    all_combinations.append((a_0,self.mean_W*a_0,a_large,self.mean_H_large*a_large))
        else:
            for a_0 in a_0s:
                for b_0 in b_0s:
                    for a_large in a_larges:
                        for b_large in b_larges:
                            all_combinations.append((a_0,b_0,a_large,b_large))

        for ns in range(num_samplings):
            print("The {0}-th run of MV-NMF...".format(ns))
            rng_ns=numpy.random.RandomState(ns)
            # sample data
            ind_cv=cl.kfold_cross_validation(self.y,k=2,shuffle=True,rng=rng_ns)
            ind_subsample=ind_cv==1
            X=self.X[:,ind_subsample]
            y=self.y[ind_subsample]
            # reorder the sampled data
            X,y,_=cl.sort_classes(numpy.transpose(X),y)
            X=numpy.transpose(X)
            # sample setting; the draw itself is unchanged, but the size-1
            # array is unpacked to a plain int because a list cannot be
            # indexed by an array on current NumPy.
            ind_setting=int(rng_ns.choice(len(all_combinations), size=1, replace=False)[0])
            a_0,b_0,a_large,b_large=all_combinations[ind_setting]

            #a_small=a_small # when a_small>1, larger a_small, more symmetric lambda
            A=(a_large,a_small)
            #b_large=0.01 # when a>1 fixed, control rate of lambda, smaller b_large, larger lambda, wider lambda, narrower w
            #b_small=b_small # when a_small>1 fixed, control rate of lambda, smaller b_small, larger lambda, wider lambda, smaller h, narrower h
            B=(b_large,b_small)

            # current setting
            setting="a_0="+str(a_0)+"_b_0="+str(b_0)+"_a_large="+str(a_large)+"_b_large="+str(b_large)
            self.settings.append(setting)
            print("The current setting: " + setting)

            # run the model
            self.model_fs=mcnmf.mcnmf(X,y,self.features)
            _,_,training_time,training_time_L=self.model_fs.factorize(
                z=z,a_0=a_0,b_0=b_0,A=A,B=B,max_iter=max_iter,
                compute_variational_lower_bound=compute_variational_lower_bound,
                variational_lower_bound_min_rate=variational_lower_bound_min_rate,
                if_plot_lower_bound=if_plot_lower_bound,
                dir_save=dir_save,prefix=prefix+"_"+setting,verb=verb,rng=rng)
            # feature selection, this procedure only need to have F, so no need to run the scoring procedures
            _,fs_time=self.model_fs.sel_feat(
                called_by_ssmcnmf_loop=True,threshold_F=threshold_F,
                rank_method=rank_method,maxbc=maxbc,
                key_feature_mean_feature_value_threshold=key_feature_mean_feature_value_threshold,
                key_feature_neglog10pval_threshold=key_feature_neglog10pval_threshold,
                max_num_feature_each_pattern=max_num_feature_each_pattern,
                header=numpy.unique(self.y),rng=rng)

            # record time
            self.training_times.append(round(training_time,2))
            self.fs_times.append(round(fs_time,2))
            self.training_times_L.append(round(training_time_L,2))

            # update E_W and F
            self.E_W=self.E_W + self.model_fs.E_W
            self.L_W=self.L_W + self.model_fs.L_W
            self.F_mean=self.F_mean + self.model_fs.F

            if if_plot_heatmap:
                self.model_fs.plot_heatmap(dir_save, prefix+"_"+setting, pattern="All",rank_method="mean_basis_value", unique_class_names=numpy.unique(self.y), width=10, height=10, fontsize=6, fmt="png",colormap="hot")

        # get average result; dividing by a float guards against floor
        # division in case an accumulated array has an integer dtype
        # (e.g. if F is a 0/1 indicator matrix -- not visible from here).
        num_runs=float(num_samplings)
        self.E_W=self.E_W/num_runs
        self.L_W=self.L_W/num_runs
        # get empirical probability
        self.F_mean=self.F_mean/num_runs
        # update the corresponding variables in model_fs (the model of the last run)
        self.model_fs.E_W=self.E_W
        self.model_fs.L_W=self.L_W
        self.model_fs.X=self.X
        self.model_fs.y=self.y
        self.model_fs.Y,self.model_fs.y_unique=cl.membership_vector_to_indicator_matrix(self.y)
        self.model_fs.N=len(self.y)
        # final update E_H
        self.model_fs.learn_H_given_X_test_and_E_W(
            self.X,a_H_test=a_H_test,b_H_test=b_H_test,feature_selection=False,
            max_iter=max_iter,
            compute_variational_lower_bound=compute_variational_lower_bound,
            variational_lower_bound_min_rate=variational_lower_bound_min_rate,
            if_plot_lower_bound=if_plot_lower_bound,
            dir_save=dir_save,prefix=prefix+"_final_update_E_H",verb=verb,rng=rng)
        self.model_fs.E_H=self.model_fs.E_H_test
        self.model_fs.E_H_test=None
        self.E_H=self.model_fs.E_H
        # update time record
        self.model_fs.training_time=numpy.sum(self.training_times)+self.model_fs.test_time
        self.model_fs.training_time_L=numpy.sum(self.training_times_L)+self.model_fs.test_time_L
        self.model_fs.test_time=0
        self.model_fs.test_time_L=0

        #self.trim(trim_nonzero_portion=0.01,alpha=0.05,threshold_E_W=None,threshold_E_H=None) # trim if necessary

        # store settings
        self.settings=numpy.array(self.settings)
        print("Finished stability selection :)")
        return self.E_W,self.E_H,self.model_fs.training_time,self.model_fs.training_time_L
示例#7
0
                           dtype=int,
                           delimiter=",",
                           skiprows=1)
# Column 0 of the loaded test table is split off as the label vector; the
# remaining columns are transposed so that rows index features and columns
# index samples (see num_feat / num_test below).
test_set_y = test_set_x[:, 0]
test_set_x = test_set_x[:, 1:].transpose()

# Uncomment to limit the size of the training set:
#train_set_x=train_set_x[:,0:10000]
#train_set_y=train_set_y[0:10000]

num_train, num_test = train_set_x.shape[1], test_set_x.shape[1]

# Turn both label vectors into indicator matrices over the labels 0..9 and
# transpose the results.
train_set_y01, z_unique = cl.membership_vector_to_indicator_matrix(
    z=train_set_y, z_unique=range(10))
train_set_y01 = train_set_y01.transpose()
test_set_y01, _ = cl.membership_vector_to_indicator_matrix(
    z=test_set_y, z_unique=range(10))
test_set_y01 = test_set_y01.transpose()

# Model configuration.
num_feat = train_set_x.shape[0]
M = num_feat
visible_type = "Bernoulli"
hidden_type = "Gaussian"
hidden_type_fixed_param = 1
rng = numpy.random.RandomState(100)

# Bernoulli visible units use the "scale" normalization; anything else
# keeps the string "None".
normalization_method = "scale" if visible_type == "Bernoulli" else "None"
示例#8
0
    def make_W(self, a_active_W=10, b_active_W=1000):
        """
        Make the real basis matrix W.

        a_active_W: scalar, shape parameter of Gamma distribution.
        b_active_W: scalar, rate parameter of Gamma distribution.

        Returns the tuple (self.W, self.features): W has self.M rows and
        self.K columns, and self.features holds one name per row built as
        "<binary pattern>_<index within pattern>".
        """
        self.a_active_W = a_active_W
        self.b_active_W = b_active_W
        self.ubc = unique_binary_code.unique_binary_code(self.V + 1)
        self.ubc.generate_binary_code()
        self.s_str = utility.convert_each_row_of_matrix_to_a_string(self.ubc.s,
                                                                    sep="")
        self.num_patterns = len(self.s_str)
        self.M = self.m * self.num_patterns
        self.W = numpy.zeros(shape=(self.M, self.K), dtype=float)
        self.Lambda_W = numpy.zeros(shape=(self.M, self.V + 1), dtype=float)
        # 1.0 / ... avoids a zero scale: with the integer default
        # b_active_W=1000, 1 / b_active_W is 0 under Python 2 integer division.
        ls = self.rng.gamma(shape=self.a_active_W,
                            scale=1.0 / self.b_active_W,
                            size=(self.M, self.V + 1))
        # One label per feature row, self.m consecutive rows per pattern
        # (presumably labels start at -1 by default -- confirm in cl).
        mp = cl.factor_sizes_to_factor_labels([self.m] * self.num_patterns)
        MP, _ = cl.membership_vector_to_indicator_matrix(mp)
        self.S = numpy.dot(MP, self.ubc.s)  # extend binary codes, M times V+1
        self.S = numpy.asarray(self.S, dtype=bool)
        self.Lambda_W[self.S] = ls[self.S]
        self.features = ["features"] * self.M  # names of features
        self.feature_patterns = ["feature_patterns"
                                 ] * self.M  # pattern of features
        self.feature_patterns_matrix = numpy.zeros(shape=(self.M, self.V + 1),
                                                   dtype=int)
        # list(...) keeps the repetition valid where range() is not a list.
        fs = list(range(self.m)) * self.num_patterns  # [0,1,2,0,1,2,...,0,1,2]
        # Loop-invariant conversion, done once before the loop.
        self.Z = numpy.asarray(self.Z, dtype=int)
        for i in range(self.M):
            code = numpy.asarray(self.S[i, :], dtype=int)  # V+1 entries
            code_ext = numpy.asarray(self.Z.dot(code), dtype=bool)  # K entries
            pattern = "".join(str(bit) for bit in code)
            self.features[i] = pattern + "_" + str(fs[i])
            self.feature_patterns[i] = pattern
            self.feature_patterns_matrix[i, :] = code
            # One exponential draw per active view, in view order.
            parts = [
                self.rng.exponential(scale=1.0 / self.Lambda_W[i, v],
                                     size=self.z_list[v])
                for v in range(self.V + 1) if self.S[i, v]
            ]
            # A row without any active view keeps its zeros.
            if parts:
                self.W[i, code_ext] = numpy.concatenate(parts)

        return self.W, self.features