Example #1
def load_train_matrix(*,train_dv=None,train_coll_cls,stack_per_sample=3000):

    train_bows=None
    train_labels=[]

    matrix_cache=[]
    for count,train_bow_obj in enumerate(train_coll_cls.objects):
        if count %1000==0:
            print("Train load curr at:  {}".format(count))

        curr_bow_matrix=train_dv.transform(train_bow_obj.attr_map)[0]
        matrix_cache.append(curr_bow_matrix)
        train_labels.append(train_bow_obj.short_genre)

        if len(matrix_cache)>stack_per_sample:
            train_bows=sp.vstack(matrix_cache)
            matrix_cache=[train_bows]
            print("stacked, train bow size:{},labels size: {}".format(train_bows.shape[0],len(train_labels)))



    if len(matrix_cache) > 0:
        print("stacking")
        train_bows=sp.vstack(matrix_cache)
        matrix_cache=[]

    print("Final training size: {}".format(train_bows.shape[0]))
    return train_bows,np.asarray(train_labels)
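The pattern above caps memory by stacking the cached rows every `stack_per_sample` iterations and carrying the partial result forward as the first cache entry. A minimal standalone sketch of the same batching idea, with hypothetical random rows standing in for the vectorized documents:

# Minimal sketch of the batched-stacking pattern above (hypothetical data).
import scipy.sparse as sp

rows = (sp.random(1, 50, density=0.1, format='csr') for _ in range(10000))

batch, stacked = [], None
for row in rows:
    batch.append(row)
    if len(batch) >= 3000:
        stacked = sp.vstack(batch)
        batch = [stacked]   # carry the running result as the first cache entry

if batch:
    stacked = sp.vstack(batch)

print(stacked.shape)  # (10000, 50)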
Example #2
 def update(self, rclient, new):
     texts = []
     urls = self.load_urls("global")
     url_length = len(urls)
     sources = {}
     vectorizer = self.load_vectorizer()
     dst = os.path.join(self.pickles_directory, self.prefix)
     tmp = os.path.join(self.pickles_directory, self.tmp_prefix)
     with TmpDirectory(dst, tmp):
         logger.info("extrating data")
         for i, (text, url, source) in enumerate(
                 self.extract_data(rclient, new)):
             texts.append(text)
             urls.append(url)
             sources.setdefault(source, array('I')).append(i)
         logger.info("update global vectorizer")
         tfidf = vectorizer.transform(texts)
         tfidf = sp.vstack((self.load_tfidf("global"), tfidf), format='csr')
         self.save_tfidf(tfidf, "global")
         self.save_urls(urls, "global")
         for source, indices in sources.iteritems():
             logger.info("update %s vectorizer", source)
             source_texts = (texts[i] for i in indices)
             source_urls = self.load_urls(source) + \
                 [urls[i + url_length] for i in indices]
             tfidf = vectorizer.transform(source_texts)
             tfidf = sp.vstack((self.load_tfidf(source), tfidf),
                               format='csr')
             self.save_tfidf(tfidf, source)
             self.save_urls(source_urls, source)
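The core of the update is stacking the previously stored tf-idf rows with the newly transformed rows in CSR format. A minimal sketch of that step, assuming an already-fitted TfidfVectorizer (the names are illustrative stand-ins, not the class above):

# Minimal sketch of appending newly vectorized documents to a stored matrix.
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer

old_docs = ["spam spam eggs", "ham and eggs"]
new_docs = ["more spam", "green eggs and ham"]

vectorizer = TfidfVectorizer().fit(old_docs)   # stands in for load_vectorizer()
tfidf_old = vectorizer.transform(old_docs)     # stands in for load_tfidf("global")
tfidf_new = vectorizer.transform(new_docs)

tfidf_all = sp.vstack((tfidf_old, tfidf_new), format='csr')
print(tfidf_all.shape)  # (4, vocabulary size)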
Example #3
def crawl(addr_in):
    print "Processing {}".format(addr_in)
    path_in = os.path.join(addr_in, "output_bin.npy")
    with open(path_in, "r") as file_in:
        X = smio.load_sparse_csr(file_in)

    path_out_pos = os.path.join(addr_in, "output_bin_pos.npy")
    path_out_neg = os.path.join(addr_in, "output_bin_neg.npy")

    list_pos = []
    list_neg = []
    for line in X:
        res = line[np.size(X, 1)-1]
        if res == 1:
            list_pos.append(csr_matrix(line))
        else:
            list_neg.append(csr_matrix(line))
    X_pos = vstack(list_pos)
    X_neg = vstack(list_neg)

    file_pos = open(path_out_pos, "w")
    smio.save_sparse_csr(file_pos, X_pos)
    file_pos.close()
    file_neg = open(path_out_neg, "w")
    smio.save_sparse_csr(file_neg, X_neg)
    file_neg.close()
Example #4
 def evalDeriv(self, prob, v=None, adjoint=False):
     if prob._formulation == 'HJ':
         if adjoint:
             if self.modelType == "Head":
                 srcDeriv = - prob.hDeriv.T * prob.Grad.T * self.MfLiI.T * (prob.Div.T * v)
             elif self.modelType == "CurrentSource":
                 srcDeriv = prob.qDeriv.T * v
             elif self.modelType == "CurrentDensity":
                 jsDeriv = sp.vstack((prob.jsxDeriv, prob.jsyDeriv, prob.jszDeriv))
                 srcDeriv = - jsDeriv.T * prob.mesh.aveF2CCV * (prob.Div.T*v)
             else:
                 raise NotImplementedError()
         else:
             if self.modelType == "Head":
                 srcDeriv = -prob.Div*self.MfLiI*prob.Grad*(prob.hDeriv*v)
             elif self.modelType == "CurrentSource":
                 srcDeriv = prob.qDeriv * v
             elif self.modelType == "CurrentDensity":
                 jsDeriv = sp.vstack((prob.jsxDeriv, prob.jsyDeriv, prob.jszDeriv))
                 srcDeriv = -prob.Div*prob.mesh.aveF2CCV.T*(jsDeriv*v)
             else:
                 raise NotImplementedError()
     elif prob._formulation == 'EB':
         raise NotImplementedError()
     return srcDeriv
Example #5
    def fit_simplified(self, x_train, y_train):
        c_training_examples = []
        c_training_scores = []
        h_training_examples = []
        h_training_scores = []

        start_time = time.clock()
        print "Number of examples in training set: " + str(len(x_train))
        for i in xrange(len(x_train)):
            flipbit = FlipBit(x_train[i], self.number_of_labels, self.scoring_function, true_output=y_train[i])
            outputs = flipbit.greedy_search(self.depth_of_search)
            h_training_examples.extend(flipbit.get_training_examples())
            h_training_scores.extend(flipbit.get_training_scores())

            for j in xrange(len(outputs)):
                example = construct_sparse_attributes(x_train[i], outputs[j])
                score = calculate_loss(self.scoring_function, outputs[j], y_train[i], self.number_of_labels)
                c_training_examples.append(example)
                c_training_scores.append(score)

        generating_end_time = time.clock()

        self.h_regressor.fit(vstack(h_training_examples, format='csr'), h_training_scores)
        print "Number of H regression learning examples: " + str(len(h_training_examples))

        self.c_regressor.fit(vstack(c_training_examples, format='csr'), c_training_scores)
        print "Number of C regression learning examples: " + str(len(c_training_examples))

        fit_time = time.clock()

        construction_time = (generating_end_time - start_time)
        learning_time = (fit_time - generating_end_time)
        print("Construction time: {0:.4f}, Learning HC time: {1:.4f}".format(construction_time, learning_time))
Example #6
def concatNpysIntoOneBigMatrix(npy_list):

    print "Start reading eval images..."

    result = None
    tmpArr = None
    count = 0

    for npy in npy_list:
        if( (count+1) % 1000 == 0):
            sys.stderr.write(str(count+1) + " eval images loaded\n")

        # convert to sparse format
        # csr means row major
        f = csr_matrix(np.load(npy), dtype='float64')
        # concat into one big sparse matrix
        if result is None:
            result = f
        else:
            if tmpArr is None:
                tmpArr = f
            # merge into main array every 1000 npy loaded (hope it could make the IO less)
            else:
                tmpArr = vstack([tmpArr, f])
            if( count % 1000 == 0 ):
                result = vstack([result, tmpArr])             
                tmpArr = None
        count += 1
    if tmpArr is not None:
        result = vstack([result, tmpArr])
    print result.shape
    return result
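Unless the file count is very large, collecting all CSR blocks and stacking once at the end is simpler than the intermediate merges above. A minimal sketch, assuming npy_list holds paths to dense .npy arrays as in the example:

# Minimal sketch: collect CSR blocks, stack once at the end (hypothetical file list).
import numpy as np
from scipy.sparse import csr_matrix, vstack

def load_as_one_sparse_matrix(npy_list):
    blocks = [csr_matrix(np.load(path), dtype='float64') for path in npy_list]
    return vstack(blocks)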
Example #7
def maketrain():
	for idx, im in enumerate(trainimages):
		if (idx % 100) == 0:
			print idx, im
		for j in jab[int(im[21:27])]:
			cap_train.append((j, idx))	
	with open(path+'/coco_align.train.pkl', 'wb') as f:
		cPickle.dump(cap_train, f)


	sp = []
	for idx, im in enumerate(trainimages):
		data = loadmat('../coco_cnn4/'+im)
		sp.append(csr_matrix(numpy.asarray(data['o24'])))
		if (idx % 10000) == 9999:
			print idx
			with open(path+'/train.pkl'+str(idx+1), 'wb') as f:
				cPickle.dump(vstack(sp), f, protocol=cPickle.HIGHEST_PROTOCOL)
			sp = []
	
	with open(path+'/train.pkl'+str(idx+1), 'wb') as f:
	    cPickle.dump(vstack(sp), f, protocol=cPickle.HIGHEST_PROTOCOL)

	#COCO_train2014_000000286899.jpg
	return 0
Example #8
  def blockLiftingAnalysis(self):
    """
    Build, lift and analyze block matrix form of the collected LPs.
    """
      
    # Extract lists of matrices for each LP element from the collected LPs.
    # The coo_matrix calls are needed because vectors are delivered in dense format
    # by the (block) grounder and the stacking below needs a sparse representation.
    am = [ x["a"] for x in self.ground]
    bm = [ sp.coo_matrix(x["b"]) for x in self.ground]
    cm = [ sp.coo_matrix(x["c"]) for x in self.ground]
    gm = [ x["g"] for x in self.ground]
    hm = [ sp.coo_matrix(x["h"]) for x in self.ground]

    # stack it
    block_a = sp.block_diag(am)
    block_b = sp.vstack(bm)
    block_c = sp.vstack(cm)
    block_g = sp.block_diag(gm)
    block_h = sp.vstack(hm)

    # lift it
    ground = mdict(block_a,block_b,block_c,block_g,block_h)
    lifted = lift(ground, self.sparse, self.orbits)

    # say it
    print >> self.report, "BLOCK LP LIFTING"
    reportToFile(self.report,ground,lifted, self.dumpBlockMatrix)
Example #9
	def __init__(self, Y, a, b, c, d, H, q=None):
		'''
		Q_{i,j} = a*(y_i*y_j)^3+b*(y_i*y_j)^2+c*(y_i+y_j) + d - h_i*h_j
		q = q
		'''
		super(AMF_deg3_BQP, self).__init__()
		n, l =  Y.shape
		self.a = a
		self.b = b
		self.c = c
		self.d = d
		self.Y1 = Y
		self.H = H # mostly updated
		if q is None:
			self.q = np.zeros(n)
		else:
			self.q = q
		# consider 2nd and 3rd powers of Y
		Y2_tmp = []
		Y3_tmp = []
		for i in xrange(n):
			y = Y.getrow(i)
			Y2_tmp.append(kron(y, y))
			Y3_tmp.append(kron(kron(y, y), y))
		self.Y2 = vstack(Y2_tmp).tocsr()
		self.Y3 = vstack(Y3_tmp).tocsr()
Example #10
    def getInterpolationMatCartMesh(self, Mrect, locType="CC", locTypeTo=None):
        """
            Takes a cartesian mesh and returns a projection to translate onto
            the cartesian grid.
        """

        assert self.isSymmetric, (
            "Currently we have not taken into account " "other projections for more complicated " "CylMeshes"
        )

        if locTypeTo is None:
            locTypeTo = locType

        if locType == "F":
            # do this three times for each component
            X = self.getInterpolationMatCartMesh(Mrect, locType="Fx", locTypeTo=locTypeTo + "x")
            Y = self.getInterpolationMatCartMesh(Mrect, locType="Fy", locTypeTo=locTypeTo + "y")
            Z = self.getInterpolationMatCartMesh(Mrect, locType="Fz", locTypeTo=locTypeTo + "z")
            return sp.vstack((X, Y, Z))
        if locType == "E":
            X = self.getInterpolationMatCartMesh(Mrect, locType="Ex", locTypeTo=locTypeTo + "x")
            Y = self.getInterpolationMatCartMesh(Mrect, locType="Ey", locTypeTo=locTypeTo + "y")
            Z = Utils.spzeros(getattr(Mrect, "n" + locTypeTo + "z"), self.nE)
            return sp.vstack((X, Y, Z))

        grid = getattr(Mrect, "grid" + locTypeTo)
        # This is unit circle stuff, 0 to 2*pi, starting at x-axis, rotating
        # counter clockwise in an x-y slice
        theta = -np.arctan2(grid[:, 0] - self.cartesianOrigin[0], grid[:, 1] - self.cartesianOrigin[1]) + np.pi / 2
        theta[theta < 0] += np.pi * 2.0
        r = ((grid[:, 0] - self.cartesianOrigin[0]) ** 2 + (grid[:, 1] - self.cartesianOrigin[1]) ** 2) ** 0.5

        if locType in ["CC", "N", "Fz", "Ez"]:
            G, proj = np.c_[r, theta, grid[:, 2]], np.ones(r.size)
        else:
            dotMe = {
                "Fx": Mrect.normals[: Mrect.nFx, :],
                "Fy": Mrect.normals[Mrect.nFx : (Mrect.nFx + Mrect.nFy), :],
                "Fz": Mrect.normals[-Mrect.nFz :, :],
                "Ex": Mrect.tangents[: Mrect.nEx, :],
                "Ey": Mrect.tangents[Mrect.nEx : (Mrect.nEx + Mrect.nEy), :],
                "Ez": Mrect.tangents[-Mrect.nEz :, :],
            }[locTypeTo]
            if "F" in locType:
                normals = np.c_[np.cos(theta), np.sin(theta), np.zeros(theta.size)]
                proj = (normals * dotMe).sum(axis=1)
            if "E" in locType:
                tangents = np.c_[-np.sin(theta), np.cos(theta), np.zeros(theta.size)]
                proj = (tangents * dotMe).sum(axis=1)
            G = np.c_[r, theta, grid[:, 2]]

        interpType = locType
        if interpType == "Fy":
            interpType = "Fx"
        elif interpType == "Ex":
            interpType = "Ey"

        Pc2r = self.getInterpolationMat(G, interpType)
        Proj = Utils.sdiag(proj)
        return Proj * Pc2r
Example #11
    def build_file_data(self, class_names, features_file):

        for c in class_names:
            t0 = time.time()
            train_p, train_n, test_p, test_n = self.data_obj.split_by_class(c)
            t1 = time.time()
            print("Split by class time: ", t1 - t0,"s")
            
            numntrain = train_n.shape[0]
            numptrain = train_p.shape[0]
            X = sparse.vstack([train_p, train_n])

            y = [1]*numptrain + [0]*numntrain
            if self.split_type == "CROSSVALIDATION":
                ftr_file_name = 'features/data_%s_%s' % (c, features_file)
            else:
                ftr_file_name = 'features/train_%s_%s' % (c, features_file)

            print('Writing %s...' % ftr_file_name)
            self.write_data(ftr_file_name, X, y)

            numntest = test_n.shape[0]
            numptest = test_p.shape[0]
            X =  sparse.vstack([test_p, test_n])
            y = [1]*numptest + [0]*numntest
            if self.split_type == "CROSSVALIDATION":
                ftr_file_name = 'features/data_%s_%s' % (c, features_file)
            else:
                ftr_file_name = 'features/test_%s_%s' % (c, features_file)
            print('Writing %s...' % ftr_file_name)
            self.write_data(ftr_file_name, X, y)
Example #12
def eval_jac_g(x, flag, user_data=None):
    """Calculates the Jacobi matrix.

    If the flag is true, returns a tuple (row, col) to indicate the
    sparse Jacobi matrix's structure.
    If the flag is false, returns the values of the Jacobi matrix
    with length nnzj.
    """
    Js = user_data['Js']
    if flag:
        return (Js.row, Js.col)
    else:
        om    = user_data['om']
        Ybus  = user_data['Ybus']
        Yf    = user_data['Yf']
        Yt    = user_data['Yt']
        ppopt = user_data['ppopt']
        il    = user_data['il']
        A     = user_data['A']

        _, _, dhn, dgn = opf_consfcn(x, om, Ybus, Yf, Yt, ppopt, il)

        if A is not None and issparse(A):
            J = vstack([dgn.T, dhn.T, A], 'coo')
        else:
            J = vstack([dgn.T, dhn.T], 'coo')

        ## FIXME: Extend PyIPOPT to handle changes in sparsity structure
        nnzj = Js.nnz
        Jd = zeros(nnzj)
        Jc = J.tocsc()
        for i in range(nnzj):
            Jd[i] = Jc[Js.row[i], Js.col[i]]

        return Jd
Example #13
 def generate_offspring(self,vectorpopulation,parameterpopulation,parameter_options,fitness,elite=0.1,tournament_size=2,crossover_prob=0.9,n_crossovers=1,mutation_rate=0.3,win_condition='highest'):
     fitness_numbered = [[i,x] for i,x in enumerate(fitness)]
     fitness_sorted = sorted(fitness_numbered,key = lambda k : k[1],reverse=True) if win_condition == 'highest' else sorted(fitness_numbered,key = lambda k : k[1])
     new_population = [vectorpopulation[x[0],:] for x in fitness_sorted[:int(elite*vectorpopulation.shape[0])]]
     new_parameterpopulation = [parameterpopulation[x[0]] for x in fitness_sorted[:int(elite*vectorpopulation.shape[0])]]
     fitness_candidates = fitness_sorted[int(elite*vectorpopulation.shape[0]):]
     while len(new_population) < vectorpopulation.shape[0]:
         # select
         selections = self.tournament_selection(fitness_candidates,tournament_size,win_condition)
         parents = vectorpopulation[selections,:]
         parameterparents = parameterpopulation[selections,:]
         # generate and mutate
         if random.random() < crossover_prob:
             offspring = []
             paramoffspring = []
             for generation in range(2):
                 child = self.offspring_crossover(parents,n_crossovers)
                 child_mutated = self.mutate(child,mutation_rate)
                 while child_mutated.count_nonzero() == 0:
                     child_mutated = self.mutate(child,mutation_rate)
                 offspring.append(child_mutated)
                 paramoffspring.append(self.random_parameterpopulation(parameter_options, 1)[0])
         else:
             offspring = parents
             paramoffspring = parameterparents
         # accept
         new_population.extend(offspring)
         new_parameterpopulation.extend(paramoffspring)
     return sparse.vstack(new_population), sparse.vstack(new_parameterpopulation)
Example #14
def cluster(train_data, test_data, tag_matrix, k):
    km = MiniBatchKMeans(k)
    training_labels = km.fit_predict(train_data)
    testing_labels = km.predict(test_data)
    training_matrices = []
    testing_matrices = []
    tag_matrices = []
    for i in xrange(k):
        train_rows = [train_data.getrow(j) for j in xrange(train_data.shape[0]) if training_labels[j] == i]
        test_rows = [test_data.getrow(j) for j in xrange(test_data.shape[0]) if testing_labels[j] == i]
        if len(train_rows) == 0:
            training_matrices.append(sparse.csr_matrix((1,1)))
            testing_matrices.append(sparse.csr_matrix((1,1)))
            tag_matrices.append(tag_matrix.getrow(0)-tag_matrix.getrow(0))
            continue
        training_matrices.append(sparse.vstack(train_rows))
        if len(test_rows) == 0:
            testing_matrices.append(sparse.csr_matrix((1,1)))
            tag_matrices.append(tag_matrix.getrow(0)-tag_matrix.getrow(0))
            continue
        testing_matrices.append(sparse.vstack(test_rows))
        ktags = sum([tag_matrix.getrow(j) for j in xrange(tag_matrix.shape[0]) if training_labels[j] == i])
        for j in xrange(len(ktags.data)):
            ktags.data[j] /= ktags.data[j]
        tag_matrices.append(ktags)
    tag_matrix = sparse.vstack(tag_matrices)
    save_matrix('tag_matrix.txt', tag_matrix)
    for i in xrange(k):
        save_matrix('training_matrix_%d.txt' % i, training_matrices[i].tocoo())
        save_matrix('testing_matrix_%d.txt' % i, testing_matrices[i].tocoo())
    
    predictions = []
    for i in xrange(len(testing_labels)):
        predictions.append(tag_matrices[testing_labels[i]])
    return sparse.vstack(predictions)
Example #15
def multiclass_to_ranking(X, y):
    n_classes = y.shape[1]
    n_samples = X.shape[0]

    # create extended X matrix
    X_features = X.copy()
    for i in range(n_classes - 1):
        X_features = sp.vstack([X_features, X])

    X_labels = None
    for i in range(n_classes):
        X_tmp = sp.csc_matrix((n_samples, n_classes))
        X_tmp[:, i] = 1
        if X_labels is not None:
            X_labels = sp.vstack([X_labels, X_tmp])
        else:
            X_labels = X_tmp

    X_ext = sp.hstack([X_labels, X_features])

    # create all combinations
    compars = []

    for i_row, row in enumerate(y.tocsr()):
        # over all true labels
        for i in row.indices:
            for c in range(n_classes):
                if c not in row.indices:
                    offset = i_row * n_classes
                    compars.append([offset + i, offset + c])

    compars = np.vstack(compars)
    compars = compars.astype(np.float64)
    return X_ext, compars
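The extended design matrix above is X repeated once per class with a class-indicator block prepended. A minimal sketch of that layout on tiny hypothetical sizes, here building the indicator block with sp.kron instead of per-column assignment:

# Minimal sketch of the "repeat X per class, prepend a class-indicator block" layout.
import numpy as np
import scipy.sparse as sp

n_samples, n_features, n_classes = 3, 4, 2
X = sp.random(n_samples, n_features, density=0.5, format='csr')

X_features = sp.vstack([X] * n_classes)   # repeat X once per class
# Indicator block: the rows belonging to class i carry a 1 in column i.
X_labels = sp.kron(sp.eye(n_classes), np.ones((n_samples, 1)), format='csr')

X_ext = sp.hstack([X_labels, X_features])
print(X_ext.shape)  # (n_samples * n_classes, n_classes + n_features)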
Example #16
    def _get_aug_mat(self, k, j):
        """
        Generate the matrix [[A, E], [0, A]] where
            A is the overall dynamics generator
            E is the control dynamics generator
        for a given timeslot and control
        returns this augmented matrix
        """
        dyn = self.parent
        dg = dyn._get_phased_dyn_gen(k)

        if dyn.oper_dtype == Qobj:
            A = dg.data*dyn.tau[k]
            E = dyn._get_phased_ctrl_dyn_gen(k, j).data*dyn.tau[k]
            Z = sp.csr_matrix(dg.data.shape)
            aug = Qobj(sp.vstack([sp.hstack([A, E]), sp.hstack([Z, A])]))
        elif dyn.oper_dtype == np.ndarray:
            A = dg*dyn.tau[k]
            E = dyn._get_phased_ctrl_dyn_gen(k, j)*dyn.tau[k]
            Z = np.zeros(dg.shape)
            aug = np.vstack([np.hstack([A, E]), np.hstack([Z, A])])
        else:
            A = dg*dyn.tau[k]
            E = dyn._get_phased_ctrl_dyn_gen(k, j)*dyn.tau[k]
            Z = dg*0.0
            aug = sp.vstack([sp.hstack([A, E]), sp.hstack([Z, A])])
        return aug
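In the sparse branch the augmented generator [[A, E], [0, A]] is assembled from nested hstack/vstack calls; scipy.sparse.bmat builds the same block matrix in one call. A minimal sketch with random stand-ins for A and E (illustrative only):

# Minimal sketch of assembling [[A, E], [0, A]] as in the sparse branch above.
import numpy as np
import scipy.sparse as sp

n = 4
A = sp.random(n, n, density=0.3, format='csr')
E = sp.random(n, n, density=0.3, format='csr')
Z = sp.csr_matrix((n, n))

aug_stacked = sp.vstack([sp.hstack([A, E]), sp.hstack([Z, A])])
aug_bmat = sp.bmat([[A, E], [None, A]])   # None denotes an all-zero block

print(np.allclose(aug_stacked.toarray(), aug_bmat.toarray()))  # True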
Example #17
def block2full(ht,sparse=False):
  """Convert a heterostructure with block diagonal Hamiltonian
  into the full form"""
  if not ht.block_diagonal: return ht # stop
  ho = ht.copy()
  ho.block_diagonal = False # set in false from now on
  nb = len(ht.central_intra) # number of blocks
  lc = [csc_matrix(ht.central_intra[i][i].shape) for i in range(nb)]
  rc = [csc_matrix(ht.central_intra[i][i].shape) for i in range(nb)]
  lc[0] = csc_matrix(ht.left_coupling)
  rc[nb-1] = csc_matrix(ht.right_coupling)
  # convert the central to sparse form
  central = [[None for i in range(nb)] for j in range(nb)]
  for i in range(nb):
    for j in range(nb):
      if ht.central_intra[i][j] is None: continue
      else:
        central[i][j] = csc_matrix(ht.central_intra[i][j])
  from scipy.sparse import vstack, bmat
  if sparse:
    ho.left_coupling = vstack(lc)
    ho.right_coupling = vstack(rc)
    ho.central_intra = bmat(ht.central_intra) # as sparse matrix
  else:
    ho.left_coupling = vstack(lc).todense()
    ho.right_coupling = vstack(rc).todense()
    ho.central_intra = bmat(central).todense() # as dense matrix
  return ho
Example #18
   def _split_data(self, factor, factor_null_val):
      """
         Splits self.data into two sparse matrices.

         Arguments:

         Returns:
          A tuple of two sparse matrices containing rows for which
          factor's 'factor' values:
          (1) equal 'factor_null_val'
          (2) do not equal 'factor_null_val'

      """
      fac_len, rows_len = self.fac_len, self.data.shape[0]
      fac_ind = self.col_names.index(factor)

      non_null_set = []
      null_set = []

      m_csr = self.data.tocsr()
      for row_ind in range(rows_len):
         arow = np.ravel(m_csr.getrow(row_ind).todense())
         if arow[fac_ind] == factor_null_val:
            null_set.append(m_csr.getrow(row_ind))
         else:
            non_null_set.append(m_csr.getrow(row_ind))

      return (sparse.vstack(null_set).tolil(),
              sparse.vstack(non_null_set).tolil())
Example #19
 def integrateObservation(self, obs):
     """This method stores the observation inside the agent"""
     start_time = time.time()
     self.obs = obs
     if (len(obs) != 8):
         self.isEpisodeOver = True
     else:
         self.mayMarioJump, self.isMarioOnGround, self.marioFloats, self.enemiesFloats, self.levelScene, dummy,action,self.obsArray = obs
         self.obsArray = csr_matrix(self.obsArray)
         self.should_take_action = action
         if(self.count > 5):
             if(self.initialTraining):
                 self.actions = numpy.vstack((self.actions,numpy.array([action])))
                 self.states = vstack((self.states,self.prev_obs.T))
                 self.human_input += 1
             elif self.isLearning:
                 if self.count > 6 and action != self.actionTaken:
                     self.mistakes += 1
                     
                 if((self.actionTaken != action)):
                     self.actions = numpy.vstack((self.actions,numpy.array([action])))
                     self.states = vstack((self.states,self.prev_obs.T))
                     self.human_input += 1
                     
         self.human_input += 1
         self.prev_obs = self.obsArray            
         self.count += 1
Example #20
def contiguous_train_test_split(X, y, train_size):

    N = len(y)
    N_train = int(N*train_size)

    start_train = np.random.randint(N)
    end_train = start_train + N_train


    if end_train <= N:
        print("start: " + str(start_train))
        print("end: " + str(end_train))
        print("total: " + str(N))
        X_train = X[start_train:end_train, :]
        y_train = y[start_train:end_train]

        X1 = X[:start_train, :]
        X2 = X[end_train:, :]
        X_test = sparse.vstack((X1, X2))
        y_test = np.r_[y[:start_train], y[end_train:]]

    else:
        end_train = end_train - N
        print("start: " + str(start_train))
        print("end: " + str(end_train))
        print("total: " + str(N))
        X1 = X[:end_train, :]
        X2 = X[start_train:, :]
        X_train = sparse.vstack((X1, X2))
        y_train = np.r_[y[:end_train], y[start_train:]]

        X_test = X[end_train:start_train, :]
        y_test = y[end_train:start_train]

    return X_train, X_test, y_train, y_test
Example #21
    def Jfull(self, m=None, f=None):
        if f is None:
            f = self.fields(m)

        nn = len(f)-1
        Asubs, Adiags, Bs = list(range(nn)), list(range(nn)), list(range(nn))
        for ii in range(nn):
            dt = self.timeSteps[ii]
            bc = self.getBoundaryConditions(ii, f[ii])
            Asubs[ii], Adiags[ii], Bs[ii] = self.diagsJacobian(
                m, f[ii], f[ii+1], dt, bc
            )
        Ad = sp.block_diag(Adiags)
        zRight = Utils.spzeros(
            (len(Asubs)-1)*Asubs[0].shape[0], Adiags[0].shape[1]
        )
        zTop = Utils.spzeros(
            Adiags[0].shape[0], len(Adiags)*Adiags[0].shape[1]
        )
        As = sp.vstack((zTop, sp.hstack((sp.block_diag(Asubs[1:]), zRight))))
        A = As + Ad
        B = np.array(sp.vstack(Bs).todense())

        Ainv = self.Solver(A, **self.solverOpts)
        AinvB = Ainv * B
        z = np.zeros((self.mesh.nC, B.shape[1]))
        du_dm = np.vstack((z, AinvB))
        J = self.survey.deriv(f, du_dm_v=du_dm)  # not multiplied by v
        return J
Example #22
def splitDataByClass(data,label,percentage = 0.3):
	category = np.unique(label)
	labeled = None
	unlabeled = None
	y_labeled = None
	y_unlabeled = None
	first = False	
	for c in category:
		split = np.nonzero(label == c)[0]
		sz = int(percentage*len(split))	
		choice = np.random.choice(split,sz,replace=False)
		remaining = np.setdiff1d(split,choice)
#		print(choice.shape,remaining.shape,split.shape)
		if first:
			#labeled = np.concatenate((labeled,data[choice,:]),axis = 0)
			labeled = vstack((labeled,data[choice,:]))
			unlabeled =vstack((unlabeled,data[remaining,:]))
			y_labeled = np.concatenate((y_labeled,label[choice]))
			y_unlabeled = np.concatenate((y_unlabeled,label[remaining]))
		else:
			labeled = data[choice,:]
			unlabeled = data[remaining,:]	
			y_labeled = label[choice]
			y_unlabeled = label[remaining]			
			first=True	
	return ((labeled,y_labeled),(unlabeled,y_unlabeled))	
Example #23
    def init_params(self, x_labeled, x_unlabeled, y):
        self.L = y.shape[0]
        self.U = x_unlabeled.shape[0]

        if self.iprint:
            print('training SVM ...')
        self.clf.fit(x_labeled, y)
        if self.iprint:
            print('training SVM complete')

        self.support_vector = self.clf.support_vector
        self.bias = self.clf.bias
        self.alpha = self.clf.alpha_times_y

        self.C = np.zeros(self.L + 2*self.U + 1)
        self.C[1:self.L+1] = self.clf.C
        self.C[self.L+1:] = self.C_unlabel
        if self.sparse:
            x = sp.vstack((x_labeled, x_unlabeled))
            x = sp.vstack((x, x_unlabeled))
        else:
            x = np.vstack((x_labeled, x_unlabeled))
            x = np.vstack((x, x_unlabeled))

        y_all = np.append(1, y)
        y_all = np.append(y_all, np.ones(self.U))
        y_all = np.append(y_all, -np.ones(self.U))
        return x, y_all
Example #24
def combine_matrix():
    
    X000 = [sio.loadmat(filein_name[:-4] + '0X000.mat')['X000'],
            sio.loadmat(filein_name[:-4] + '1X000.mat')['X000'],
            sio.loadmat(filein_name[:-4] + '2X000.mat')['X000']]

    X001 = [sio.loadmat(filein_name[:-4] + '0X001.mat')['X001'],
            sio.loadmat(filein_name[:-4] + '1X001.mat')['X001'],
            sio.loadmat(filein_name[:-4] + '2X001.mat')['X001']]

    X010 = [sio.loadmat(filein_name[:-4] + '0X010.mat')['X010'],
            sio.loadmat(filein_name[:-4] + '1X010.mat')['X010'],
            sio.loadmat(filein_name[:-4] + '2X010.mat')['X010']]

    X100 = [sio.loadmat(filein_name[:-4] + '0X100.mat')['X100'],
            sio.loadmat(filein_name[:-4] + '1X100.mat')['X100'],
            sio.loadmat(filein_name[:-4] + '2X100.mat')['X100']]
    

    X_000 = sp.vstack([X000[0],X000[1],X000[2]])
    X_001 = sp.vstack([X001[0],X001[1],X001[2]])
    X_010 = sp.vstack([X010[0],X010[1],X010[2]])
    X_100 = sp.vstack([X100[0],X100[1],X100[2]])
    print(X_000.shape)

    
    X_model_100 = sp.hstack([X_000,X_100])
    sio.savemat(filein_name[:-4] + 'X100-model.mat', {'X100':X_model_100})

    X_model_010 = sp.hstack([X_000,X_010])
    sio.savemat(filein_name[:-4] + 'X010-model.mat', {'X010':X_model_010})

    X_model_001 = sp.hstack([X_000,X_001])
    sio.savemat(filein_name[:-4] + 'X001-model.mat', {'X001':X_model_001})
Example #25
    def initPatients(self, patientSet="train"):
        
        visitIDs = file(self.settings.find('./patients').attrib['src'])
        self.visitShelf = shelve.open(self.settings.find('./patients').attrib['shelf'])
        self.wordShelf = shelve.open(self.settings.find('./vocab').attrib['shelf'])
        
        start = int(filter(lambda s: s.attrib['name'] == "train", self.settings.findall('./patientSets/set'))[0].attrib['start'])
        end = int(filter(lambda s: s.attrib['name'] == "train", self.settings.findall('./patientSets/set'))[0].attrib['end'])

        visit_ids = [z.strip() for z in visitIDs.readlines()[start:end]]

        self.visitIDs = visit_ids
        print "reading in patients", len(visit_ids)

        print 'from shelve'
        sparse_X = []
        s = time.time()
        for i,v in enumerate(self.visitIDs):
            if i%1000 == 0:
                print i, time.time() - s
                if i > end:
                    break
            pat = self.visitShelf[v]
            pat['anchors'] = set()
            self.patients[v] = pat
            sparse_X.append(pat['sparse_X'])
    
        #print self.patients.keys()
        self.sparse_X = sparse.vstack(sparse_X, 'lil')

        self.train_patient_ids = visit_ids


        self.patientList = [self.patients[v] for v in self.visitIDs]
        self.patientIndex = dict(zip([pat['index'] for pat in self.patientList], xrange(len(self.patientList))))

        visitIDs.seek(0)
        start = int(filter(lambda s: s.attrib['name'] == "validate", self.settings.findall('./patientSets/set'))[0].attrib['start'])
        end = int(filter(lambda s: s.attrib['name'] == "validate", self.settings.findall('./patientSets/set'))[0].attrib['end'])
        visit_ids = [z.strip() for z in visitIDs.readlines()[start:end]]
        self.validate_patient_set = set(visit_ids)
        self.validate_patient_ids = visit_ids
        self.validate_patient_list = []
        print "reading in validate patients", len(visit_ids)

        print 'from shelve'
        sparse_X_validate = []
        s = time.time()
        for i,v in enumerate(visit_ids):
            if i%1000 == 0:
                print i, time.time() - s
                if i > end:
                    break
            pat = self.visitShelf[v]
            pat['anchors'] = set()
            self.patients[v] = pat
            self.validate_patient_list.append(pat)
            sparse_X_validate.append(pat['sparse_X'])
    
        self.sparse_X_validate = sparse.vstack(sparse_X_validate, 'lil')
Example #26
def get_clean_data(rsa_file_path, rsa_format, ano_file_path, ano_format, meta_data, only_first_month = False):
    cmd_codes = meta_data['cmd']
    stay_type_codes = meta_data['stay_type']
    stay_complexity_codes = meta_data['stay_complexity']
    ano_data = list()
    exit_month_data = list()
    chunk = 1000
    sex_data_first_col = 0
    age_in_year_data_first_col = sex_data_first_col + 2
    age_in_day_data_first_col = age_in_year_data_first_col + formats.age_in_year_cols_count
    stay_length_data_first_col = age_in_day_data_first_col + formats.age_in_day_cols_count
    cmd_codes_first_col = stay_length_data_first_col + formats.stay_length_cols
    stay_type_codes_first_col = cmd_codes_first_col + len(cmd_codes)
    stay_complexity_codes_first_col = stay_type_codes_first_col + len(stay_type_codes)
    cols_count = stay_complexity_codes_first_col + len(stay_complexity_codes)
    np_data = np.zeros((chunk, cols_count), dtype=int)
    rsa_data = sparse.csr_matrix((0, cols_count))
    index = 0
    global_index = 0
    lines_count = 0
    with open(rsa_file_path) as rsa_file:
        with open(ano_file_path) as ano_file:
            while True:
                if index == chunk:
                    rsa_data = vstack([rsa_data, sparse.csr_matrix(np_data)])
                    np_data.fill(0)
                    index = 0
                rsa_line = rsa_file.readline()
                ano_line = ano_file.readline()
                if ano_tools.is_ano_ok(ano_line, ano_format) and rsa_tools.is_rsa_ok(rsa_line, rsa_format):
                    rsa = rsa_tools.get_rsa(rsa_line, rsa_format)
                    exit_month = rsa['exit_month']
                    if only_first_month and exit_month != 1:
                        continue
                    exit_month_data.append(exit_month)
                    ano = ano_tools.get_ano(ano_line, ano_format, global_index)
                    ano_data.append(ano)
                    np_data[index, sex_data_first_col + rsa['sex']] = 1
                    np_data[index, age_in_year_data_first_col + rsa['age_in_year_cat']] = 1
                    np_data[index, age_in_day_data_first_col + rsa['age_in_day_cat']] = 1
                    np_data[index, stay_length_data_first_col + rsa['stay_length_cat']] = 1
                    if rsa['cmd'] != '':
                        np_data[index, cmd_codes_first_col + cmd_codes.index(rsa['cmd'])] = 1
                    if rsa['stay_type'] != '':
                        np_data[index, stay_type_codes_first_col + stay_type_codes.index(rsa['stay_type'])] = 1
                    if rsa['stay_complexity'] != '':
                        np_data[index, stay_complexity_codes_first_col + stay_complexity_codes.index(rsa['stay_complexity'])] = 1
                    index += 1
                    global_index += 1
                if lines_count % 10000 == 0:
                    print '\rProcessed %s \t added %s' % (lines_count, global_index),
                lines_count += 1
                if not rsa_line and not ano_line:
                    break

            if index % chunk != 0:
                rsa_data = vstack([rsa_data, sparse.csr_matrix(np_data[0:index, :])])
    return {'anos': ano_data,
     'rsas': rsa_data,
     'exit_month_data': exit_month_data}
Example #27
    def extract_features(utterances):

        logger.info('Extracting features.')
        # This might be inefficient, because the space object is passed to the pool.
        X = pool.map(space_compose, ((u, composer) for u in utterances), chunksize=CHUNK_SIZE)
        logger.debug('Stacking %d rows.', len(X))
        X = vstack(X, format='csr')

        if concatinate_prev_utterace:
            logger.debug('Getting previous utterances.')
            # It is basically the same X, just shifted one row up.
            prev_X = vstack([csr_matrix((1, X.shape[1])), csr_matrix(X)[:-1]], format='csr')

            # Reset prev. utterance vectors to 0 for the first utterance in a conversation.
            prev_conversation_no = None
            for row, u in enumerate(utterances):
                conversation_no = u.conversation_no
                if conversation_no != prev_conversation_no:
                    prev_X.data[prev_X.indptr[row]:prev_X.indptr[row + 1]] = 0
                prev_conversation_no = conversation_no

            prev_X.eliminate_zeros()

            assert (X[0] == prev_X[1]).todense().all()

            logger.debug('Hstacking utterances with their previous utterances.')
            X = hstack([X, prev_X], format='csr')

        return X
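The "previous utterance" matrix above is just X shifted down by one row: an empty row is stacked on top and the last row is dropped, so row i of prev_X equals row i-1 of X. A minimal sketch of that shift on a small random matrix:

# Minimal sketch of the row-shift trick: prepend one empty row, drop the last row.
import scipy.sparse as sp
from scipy.sparse import csr_matrix, vstack

X = sp.random(5, 3, density=0.5, format='csr')
prev_X = vstack([csr_matrix((1, X.shape[1])), X[:-1]], format='csr')

assert (X[0] != prev_X[1]).nnz == 0   # row 0 of X reappears as row 1 of prev_X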
Example #28
 def compose_all(self, phrases):
     """
     Composes all `phrases` and returns all unigrams and `phrases` as a matrix. Does NOT store the composed vectors.
     Unigram vectors must be brought in by extending classes.
     :param phrases: iterable of `str` or `DocumentFeature`
     :return: a tuple of :
         1) `csr_matrix` containing all vectors, unigram and composed
         2) the columns (features) of the unigram space that was used for composition
         3) a row index- dict {Feature: Row}. Maps from a feature to the row in 1) where the vector for that
            feature is. Note: This is the opposite of what IO functions in discoutils expect
     """
     composable_phrases = [foo for foo in phrases if foo in self]
     logging.info('Composing... %s able to compose %d/%d phrases using %d unigrams',
                  self.name, len(composable_phrases), len(phrases), len(self.unigram_source.name2row))
     if not composable_phrases:
         raise ValueError('%s cannot compose any of the provided phrases' % self.name)
     new_matrix = sp.vstack(self.get_vector(foo) for foo in composable_phrases)
     old_len = len(self.unigram_source.name2row)
     all_rows = deepcopy(self.unigram_source.name2row)  # can't mutate the unigram datastructure
     for i, phrase in enumerate(composable_phrases):
         key = phrase if isinstance(phrase, str) else str(phrase)
         # phrase shouldn't be in the unigram source.
         assert key not in all_rows
         all_rows[key] = i + old_len  # this will not append to all_rows if phrase is contained in unigram_source
     all_vectors = sp.vstack([self.unigram_source.matrix, new_matrix], format='csr')
     assert all_vectors.shape == (len(all_rows), len(self.unigram_source.columns)), 'Shape mismatch'
     return all_vectors, self.unigram_source.columns, all_rows
Example #29
def kron_mat(lin_op):
    """Returns the coefficient matrix for KRON linear op.

    Parameters
    ----------
    lin_op : LinOp
        The conv linear op.

    Returns
    -------
    list of SciPy CSC matrix
        The matrix representing the Kronecker product.
    """
    constant = const_mat(lin_op.data)
    lh_rows, lh_cols = constant.shape
    rh_rows, rh_cols = lin_op.args[0].size
    # Stack sections for each column of the output.
    col_blocks = []
    for j in range(lh_cols):
        # Vertically stack A_{ij}Identity.
        blocks = []
        for i in range(lh_rows):
            blocks.append(constant[i, j]*sp.eye(rh_rows))
        column = sp.vstack(blocks)
        # Make block diagonal matrix by repeating column.
        col_blocks.append( sp.block_diag(rh_cols*[column]) )
    coeff = sp.vstack(col_blocks).tocsc()

    return [coeff]
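The inner loop above vertically stacks A_{ij} * I over i for one column j; that stack is exactly the Kronecker product of that column with the identity, which gives a quick sanity check of the construction. A minimal sketch with illustrative numbers:

# Minimal sketch: stacking scaled identities equals kron(column, I).
import numpy as np
import scipy.sparse as sp

col = np.array([2.0, -1.0, 3.0])   # one column of the constant matrix
m = 2                              # rh_rows in the code above

stacked = sp.vstack([c * sp.eye(m) for c in col])
via_kron = sp.kron(col.reshape(-1, 1), sp.eye(m))

print(np.allclose(stacked.toarray(), via_kron.toarray()))  # True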
Example #30
def makePropertyTensor(M, tensor):
    if tensor is None:  # default is ones
        tensor = np.ones(M.nC)

    if isScalar(tensor):
        tensor = tensor * np.ones(M.nC)

    propType = TensorType(M, tensor)
    if propType == 1: # Isotropic!
        Sigma = sp.kron(sp.identity(M.dim), sdiag(mkvc(tensor)))
    elif propType == 2: # Diagonal tensor
        Sigma = sdiag(mkvc(tensor))
    elif M.dim == 2 and tensor.size == M.nC*3:  # Fully anisotropic, 2D
        tensor = tensor.reshape((M.nC,3), order='F')
        row1 = sp.hstack((sdiag(tensor[:, 0]), sdiag(tensor[:, 2])))
        row2 = sp.hstack((sdiag(tensor[:, 2]), sdiag(tensor[:, 1])))
        Sigma = sp.vstack((row1, row2))
    elif M.dim == 3 and tensor.size == M.nC*6:  # Fully anisotropic, 3D
        tensor = tensor.reshape((M.nC,6), order='F')
        row1 = sp.hstack((sdiag(tensor[:, 0]), sdiag(tensor[:, 3]), sdiag(tensor[:, 4])))
        row2 = sp.hstack((sdiag(tensor[:, 3]), sdiag(tensor[:, 1]), sdiag(tensor[:, 5])))
        row3 = sp.hstack((sdiag(tensor[:, 4]), sdiag(tensor[:, 5]), sdiag(tensor[:, 2])))
        Sigma = sp.vstack((row1, row2, row3))
    else:
        raise Exception('Unexpected shape of tensor')

    return Sigma
Example #31
    r1 = sp.hstack((train_adj['adj_0_1'].transpose(), train_adj['adj_1_1'], train_adj['adj_1_2']), format="csr")
    r2 = sp.hstack((train_adj['adj_0_2'].transpose(), train_adj['adj_1_2'].transpose(), train_adj['adj_2_2']),
                   format="csr")
    super_mask = [[1, 1, 1], [0, 1, 1], [0, 0, 1]]
else:
    all_sub_adj, node_types, features, one_hot_labels = load_aminer()
    train_adj, train_mask, val_mask, test_mask = load_train_val_test2(all_sub_adj)
    n2 = train_adj['adj_0_2'].shape[1]
    n1 = train_adj['adj_0_1'].shape[1]
    empty_mat = sp.csr_matrix(np.zeros(shape=(n1, n2)))
    r0 = sp.hstack((train_adj['adj_0_0'], train_adj['adj_0_1'], train_adj['adj_0_2']), format="csr")
    r1 = sp.hstack((train_adj['adj_0_1'].transpose(), train_adj['adj_1_1'], empty_mat), format="csr")
    r2 = sp.hstack((train_adj['adj_0_2'].transpose(), empty_mat.transpose(), train_adj['adj_2_2']), format="csr")
    super_mask = [[1, 1, 1], [0, 1, 0], [0, 0, 1]]

train_adj = sp.vstack((r0, r1, r2))
n_nodes = train_adj.shape[0]
n_features = features.shape[1]
n_types = node_types.shape[1]
n_labels = one_hot_labels.shape[1]

if FLAGS.model == 'gcn':
    support = [preprocess_adj(train_adj)]
    n_supports = 1
elif FLAGS.model == 'gcn_cheby':
    support = chebyshev_polynomials(train_adj, FLAGS.max_degree)
    n_supports = 1 + FLAGS.max_degree
else:
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

print('Supports Created!')
Example #32
            n_hidden_2 = 50

            print "Building positive and negative report matrices..."

            pos_reports = io.mmread('model_0_posreports.mtx')
            pos_reports = pos_reports.tocsr()

            neg_reports = io.mmread('model_0_negreports.mtx')
            neg_reports = neg_reports.tocsr()

            for reportblock in range(1, 50):
                print "Procesing", reportblock
                thispos = io.mmread('model_' + str(reportblock) +
                                    '_posreports.mtx')
                thispos = thispos.tocsr()
                pos_reports = vstack((pos_reports, thispos))

                thisneg = io.mmread('model_' + str(reportblock) +
                                    '_negreports.mtx')
                thisneg = thisneg.tocsr()
                neg_reports = vstack((neg_reports, thisneg))

            print "Done."

            neg_ind = np.arange(neg_reports.shape[0])
            pos_ind = np.arange(pos_reports.shape[0])

            subset_neg_ind = np.random.choice(neg_ind,
                                              pos_reports.shape[0],
                                              replace=False)
            neg_reports_subset = neg_reports[subset_neg_ind, :]
Example #33
def fetch_20newsgroups_vectorized(subset="train",
                                  remove=(),
                                  data_home=None,
                                  download_if_missing=True,
                                  return_X_y=False):
    """Load the 20 newsgroups dataset and vectorize it into token counts \
(classification).

    Download it if necessary.

    This is a convenience function; the transformation is done using the
    default settings for
    :class:`sklearn.feature_extraction.text.CountVectorizer`. For more
    advanced usage (stopword filtering, n-gram extraction, etc.), combine
    fetch_20newsgroups with a custom
    :class:`sklearn.feature_extraction.text.CountVectorizer`,
    :class:`sklearn.feature_extraction.text.HashingVectorizer`,
    :class:`sklearn.feature_extraction.text.TfidfTransformer` or
    :class:`sklearn.feature_extraction.text.TfidfVectorizer`.

    =================   ==========
    Classes                     20
    Samples total            18846
    Dimensionality          130107
    Features                  real
    =================   ==========

    Read more in the :ref:`User Guide <20newsgroups_dataset>`.

    Parameters
    ----------
    subset : 'train' or 'test', 'all', optional
        Select the dataset to load: 'train' for the training set, 'test'
        for the test set, 'all' for both, with shuffled ordering.

    remove : tuple
        May contain any subset of ('headers', 'footers', 'quotes'). Each of
        these are kinds of text that will be detected and removed from the
        newsgroup posts, preventing classifiers from overfitting on
        metadata.

        'headers' removes newsgroup headers, 'footers' removes blocks at the
        ends of posts that look like signatures, and 'quotes' removes lines
        that appear to be quoting another post.

    data_home : optional, default: None
        Specify a download and cache folder for the datasets. If None,
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    return_X_y : boolean, default=False.
        If True, returns ``(data.data, data.target)`` instead of a Bunch
        object.

        .. versionadded:: 0.20

    Returns
    -------
    bunch : Bunch object with the following attribute:
        - bunch.data: sparse matrix, shape [n_samples, n_features]
        - bunch.target: array, shape [n_samples]
        - bunch.target_names: a list of categories of the returned data,
          length [n_classes].
        - bunch.DESCR: a description of the dataset.

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.20
    """
    data_home = get_data_home(data_home=data_home)
    filebase = '20newsgroup_vectorized'
    if remove:
        filebase += 'remove-' + ('-'.join(remove))
    target_file = _pkl_filepath(data_home, filebase + ".pkl")

    # we shuffle but use a fixed seed for the memoization
    data_train = fetch_20newsgroups(data_home=data_home,
                                    subset='train',
                                    categories=None,
                                    shuffle=True,
                                    random_state=12,
                                    remove=remove,
                                    download_if_missing=download_if_missing)

    data_test = fetch_20newsgroups(data_home=data_home,
                                   subset='test',
                                   categories=None,
                                   shuffle=True,
                                   random_state=12,
                                   remove=remove,
                                   download_if_missing=download_if_missing)

    if os.path.exists(target_file):
        X_train, X_test = _joblib.load(target_file)
    else:
        vectorizer = CountVectorizer(dtype=np.int16)
        X_train = vectorizer.fit_transform(data_train.data).tocsr()
        X_test = vectorizer.transform(data_test.data).tocsr()
        _joblib.dump((X_train, X_test), target_file, compress=9)

    # the data is stored as int16 for compactness
    # but normalize needs floats
    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)
    normalize(X_train, copy=False)
    normalize(X_test, copy=False)

    target_names = data_train.target_names

    if subset == "train":
        data = X_train
        target = data_train.target
    elif subset == "test":
        data = X_test
        target = data_test.target
    elif subset == "all":
        data = sp.vstack((X_train, X_test)).tocsr()
        target = np.concatenate((data_train.target, data_test.target))
    else:
        raise ValueError("%r is not a valid subset: should be one of "
                         "['train', 'test', 'all']" % subset)

    module_path = dirname(__file__)
    with open(join(module_path, 'descr', 'twenty_newsgroups.rst')) as rst_file:
        fdescr = rst_file.read()

    if return_X_y:
        return data, target

    return Bunch(data=data,
                 target=target,
                 target_names=target_names,
                 DESCR=fdescr)
Example #34
def solve_loop(qp_matrices, solver='osqp'):
    """
    Solve portfolio optimization loop for all gammas
    """
    # Shorter name for qp_matrices
    qp = qp_matrices

    # Get dimensions
    n = len(qp.lx)
    k = len(qp.l) - 1

    print('n = %d and solver %s' % (n, solver))

    # Get number of problems to solve
    n_prob = qp.q.shape[1]

    # Initialize time vector
    time = np.zeros(n_prob)

    # Initialize number of iterations vector
    niter = np.zeros(n_prob)

    if solver == 'osqp':
        # Construct qp matrices
        Aosqp = spa.vstack(
            (qp.A, spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))))).tocsc()
        losqp = np.append(qp.l, qp.lx)
        uosqp = np.append(qp.u, qp.ux)

        # Setup OSQP
        m = osqp.OSQP()
        m.setup(qp.P,
                qp.q[:, 0],
                Aosqp,
                losqp,
                uosqp,
                auto_rho=True,
                polish=False,
                verbose=False)

        for i in range(n_prob):
            q = qp.q[:, i]

            # Update linear cost
            m.update(q=q)

            # Solve
            results = m.solve()
            x = results.x
            y = results.y
            status = results.info.status_val
            niter[i] = results.info.iter
            time[i] = results.info.run_time

            # Check if status correct
            if status != m.constant('OSQP_SOLVED'):
                import ipdb
                ipdb.set_trace()
                raise ValueError('OSQP did not solve the problem!')

            # DEBUG
            # solve with gurobi
            # prob = mpbpy.QuadprogProblem(qp.P, q, Aosqp, losqp, uosqp)
            # res = prob.solve(solver=mpbpy.GUROBI, verbose=False)
            # print('Norm difference OSQP-GUROBI %.3e' %
            #       np.linalg.norm(x - res.x))
            # import ipdb; ipdb.set_trace()

    elif solver == 'osqp_coldstart':
        # Construct qp matrices
        Aosqp = spa.vstack(
            (qp.A, spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))))).tocsc()
        losqp = np.append(qp.l, qp.lx)
        uosqp = np.append(qp.u, qp.ux)

        # Setup OSQP
        m = osqp.OSQP()
        m.setup(qp.P,
                qp.q[:, 0],
                Aosqp,
                losqp,
                uosqp,
                warm_start=False,
                auto_rho=True,
                polish=False,
                verbose=False)

        for i in range(n_prob):
            q = qp.q[:, i]

            # Update linear cost
            m.update(q=q)

            # Solve
            results = m.solve()
            x = results.x
            y = results.y
            status = results.info.status_val
            niter[i] = results.info.iter
            time[i] = results.info.run_time

            # Check if status correct
            if status != m.constant('OSQP_SOLVED'):
                import ipdb
                ipdb.set_trace()
                raise ValueError('OSQP did not solve the problem!')

            # DEBUG
            # solve with gurobi
            # prob = mpbpy.QuadprogProblem(qp.P, q, Aosqp, losqp, uosqp)
            # res = prob.solve(solver=mpbpy.GUROBI, verbose=False)
            # print('Norm difference OSQP-GUROBI %.3e' %
            #       np.linalg.norm(x - res.x))
            # import ipdb; ipdb.set_trace()

        # DEBUG print iterations per value of gamma
        # gamma_vals = np.logspace(-2, 2, 101)[::-1]
        #
        # import matplotlib.pylab as plt
        # plt.figure()
        # ax = plt.gca()
        # plt.plot(gamma_vals, niter)
        # ax.set_xlabel(r'$\gamma$')
        # ax.set_ylabel(r'iter')
        # plt.show(block=False)

        # import ipdb; ipdb.set_trace()

    elif solver == 'osqp_no_caching':
        # Construct qp matrices
        Aosqp = spa.vstack(
            (qp.A, spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))))).tocsc()
        losqp = np.append(qp.l, qp.lx)
        uosqp = np.append(qp.u, qp.ux)

        for i in range(n_prob):

            # Setup OSQP
            m = osqp.OSQP()
            m.setup(qp.P,
                    qp.q[:, i],
                    Aosqp,
                    losqp,
                    uosqp,
                    warm_start=False,
                    auto_rho=True,
                    polish=False,
                    verbose=False)

            # Solve
            results = m.solve()
            x = results.x
            y = results.y
            status = results.info.status_val
            niter[i] = results.info.iter
            time[i] = results.info.run_time

            # Check if status correct
            if status != m.constant('OSQP_SOLVED'):
                import ipdb
                ipdb.set_trace()
                raise ValueError('OSQP did not solve the problem!')

            # DEBUG
            # solve with gurobi
            # prob = mpbpy.QuadprogProblem(qp.P, q, Aosqp, losqp, uosqp)
            # res = prob.solve(solver=mpbpy.GUROBI, verbose=False)
            # print('Norm difference OSQP-GUROBI %.3e' %
            #       np.linalg.norm(x - res.x))
            # import ipdb; ipdb.set_trace()

        # DEBUG print iterations per value of gamma
        # gamma_vals = np.logspace(-2, 2, 101)[::-1]
        #
        # import matplotlib.pylab as plt
        # plt.figure()
        # ax = plt.gca()
        # plt.plot(gamma_vals, niter)
        # ax.set_xlabel(r'$\gamma$')
        # ax.set_ylabel(r'iter')
        # plt.show(block=False)

        # import ipdb; ipdb.set_trace()

    elif solver == 'qpoases':

        n_dim = qp.P.shape[0]  # Number of variables
        m_dim = qp.A.shape[0]  # Number of constraints without bounds

        # Initialize qpoases and set options
        qpoases_m = qpoases.PyQProblem(n_dim, m_dim)
        options = qpoases.PyOptions()
        options.printLevel = qpoases.PyPrintLevel.NONE
        qpoases_m.setOptions(options)

        # Construct bounds for qpoases
        lx = np.append(qp.lx, -np.inf * np.ones(k))
        ux = np.append(qp.ux, np.inf * np.ones(k))

        # Setup matrix P and A
        P = np.ascontiguousarray(qp.P.todense())
        A = np.ascontiguousarray(qp.A.todense())

        for i in range(n_prob):

            # Get linera cost as contiguous array
            q = np.ascontiguousarray(qp.q[:, i])

            # Reset cpu time
            qpoases_cpu_time = np.array([20.])

            # Reset number of of working set recalculations
            nWSR = np.array([1000])

            if i == 0:
                # First iteration
                res_qpoases = qpoases_m.init(P, q, A, np.ascontiguousarray(lx),
                                             np.ascontiguousarray(ux),
                                             np.ascontiguousarray(qp.l),
                                             np.ascontiguousarray(qp.u), nWSR,
                                             qpoases_cpu_time)
            else:
                # Solve new hot started problem
                res_qpoases = qpoases_m.hotstart(q, np.ascontiguousarray(lx),
                                                 np.ascontiguousarray(ux),
                                                 np.ascontiguousarray(qp.l),
                                                 np.ascontiguousarray(qp.u),
                                                 nWSR, qpoases_cpu_time)

            # # DEBUG Solve with gurobi
            # qpoases solution
            # sol_qpoases = np.zeros(n + k)
            # qpoases_m.getPrimalSolution(sol_qpoases)
            # import mathprogbasepy as mpbpy
            # Agrb = spa.vstack((qp.A,
            #                     spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))
            #                                ))).tocsc()
            # lgrb = np.append(qp.l, qp.lx)
            # ugrb = np.append(qp.u, qp.ux)
            # prob = mpbpy.QuadprogProblem(spa.csc_matrix(qp.P), q,
            #                              Agrb, lgrb, ugrb)
            # res = prob.solve(solver=mpbpy.GUROBI, verbose=True)
            # print("Norm difference x qpoases - GUROBI = %.4f" %
            #       np.linalg.norm(sol_qpoases - res.x))
            # print("Norm difference objval qpoases - GUROBI = %.4f" %
            #       abs(qpoases_m.getObjVal() - res.obj_val))
            # import ipdb; ipdb.set_trace()

            if res_qpoases != 0:
                raise ValueError('qpoases did not solve the problem!')

            # Save time
            time[i] = qpoases_cpu_time[0]

            # Save number of iterations
            niter[i] = nWSR[0]

    elif solver == 'gurobi':

        # Construct qp matrices
        Agurobi = spa.vstack(
            (qp.A, spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))))).tocsc()
        lgurobi = np.append(qp.l, qp.lx)
        ugurobi = np.append(qp.u, qp.ux)

        for i in range(n_prob):

            # Get linear cost as contiguous array
            q = qp.q[:, i]

            # Solve with gurobi
            prob = mpbpy.QuadprogProblem(qp.P, q, Agurobi, lgurobi, ugurobi)
            res = prob.solve(solver=mpbpy.GUROBI, verbose=False)

            # Save time
            time[i] = res.cputime

            # Save number of iterations
            niter[i] = res.total_iter

    elif solver == 'mosek':

        # Construct qp matrices
        Amosek = spa.vstack(
            (qp.A, spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))))).tocsc()
        lmosek = np.append(qp.l, qp.lx)
        umosek = np.append(qp.u, qp.ux)

        for i in range(n_prob):

            # Get linear cost as contiguous array
            q = qp.q[:, i]

            # Solve with mosek
            prob = mpbpy.QuadprogProblem(qp.P, q, Amosek, lmosek, umosek)
            res = prob.solve(solver=mpbpy.MOSEK, verbose=False)

            # Save time
            time[i] = res.cputime

            # Save number of iterations
            niter[i] = res.total_iter

    elif solver == 'ecos':

        for i in range(n_prob):
            # Construct the problem
            #       minimize	x' D x + y' I y - (1/gamma) * mu' x
            #       subject to  1' x = 1
            #                   F' x = y
            #                   0 <= x <= 1
            n_var = qp.F.shape[0]
            m_var = qp.F.shape[1]
            x = cvxpy.Variable(n_var)
            y = cvxpy.Variable(m_var)

            objective = cvxpy.Minimize(
                cvxpy.quad_form(x, qp.D) + cvxpy.quad_form(y, spa.eye(m_var)) +
                -1 / qp.gammas[i] * qp.mu * x)
            constraints = [
                np.ones(n_var) * x == 1, qp.F.T * x == y, 0 <= x, x <= 1
            ]
            problem = cvxpy.Problem(objective, constraints)
            problem.solve(solver=cvxpy.ECOS, verbose=False)

            # Obtain time and number of iterations
            time[i] = problem.solver_stats.setup_time + \
                problem.solver_stats.solve_time

            niter[i] = problem.solver_stats.num_iters

            # # DEBUG: Solve with MOSEK
            # Amosek = spa.vstack((qp.A,
            #                      spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))
            #                                 ))).tocsc()
            # lmosek = np.append(qp.l, qp.lx)
            # umosek = np.append(qp.u, qp.ux)
            # prob = mpbpy.QuadprogProblem(qp.P, qp.q[:, i],
            #                              Amosek, lmosek, umosek)
            # res = prob.solve(solver=mpbpy.MOSEK, verbose=False)
            # x_mosek = res.x[:n_var]
            # import ipdb; ipdb.set_trace()

    else:
        raise ValueError('Solver not understood')

    # Return statistics
    return utils.Statistics(time), utils.Statistics(niter)
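
The GUROBI and MOSEK branches above fold the box bounds on x into the constraint matrix by appending identity rows. A minimal, self-contained sketch of that stacking pattern, with made-up sizes rather than the benchmark data:

import numpy as np
import scipy.sparse as spa

n, k, m = 3, 2, 4                                     # hypothetical dimensions
A = spa.random(m, n + k, density=0.5, format='csc')   # original constraints
A_full = spa.vstack(
    (A, spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))))).tocsc()
l_full = np.append(-np.ones(m), np.zeros(n))          # l followed by lx
u_full = np.append(np.ones(m), np.ones(n))            # u followed by ux
print(A_full.shape)                                   # (m + n, n + k)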
Пример #35
0
def _ht_2d(
	true_corr, # list of correlations for each group
	cells, # list of Nx2 sparse matrices
	approx_sf,
	design_matrix,
	Nc_list,
	num_boot,
	treatment_idx,
	q,
	_estimator_1d,
	_estimator_cov,
	resampling,
	**kwargs):
	
		
	good_idxs = np.zeros(design_matrix.shape[0], dtype=bool)
	
	# the bootstrap arrays
	boot_corr = np.zeros((design_matrix.shape[0], num_boot+1))*np.nan
	
	# Get strata-specific pooled information
	if resampling == 'permutation':
		
		uniq_strata, strata_indicator = np.unique(np.delete(design_matrix, treatment_idx, axis=1), axis=0, return_inverse=True)
		resampling_info = {}
		
		for k in range(uniq_strata.shape[0]):
			
			strata_idx = np.where(strata_indicator==k)[0]  # groups belonging to stratum k
			data_list = [cells[i] for i in strata_idx]
			sf_list = [approx_sf[i] for i in strata_idx]
		
			resampling_info[k] = bootstrap._unique_expr(sparse.vstack(data_list, format='csc'), np.concatenate(sf_list))

	for group_idx in range(design_matrix.shape[0]):

		# Skip if any of the 2d moments are NaNs
		if np.isnan(true_corr[group_idx]) or (np.abs(true_corr[group_idx]) == 1):
			continue

		# Fill in the true value
		boot_corr[group_idx, 0] = true_corr[group_idx]
		
		# Generate the bootstrap values
		cov, var_1, var_2 = bootstrap._bootstrap_2d(
			data=cells[group_idx],
			size_factor=approx_sf[group_idx],
			num_boot=int(num_boot),
			q=q[group_idx],
			_estimator_1d=_estimator_1d,
			_estimator_cov=_estimator_cov,
			precomputed=(None if resampling == 'bootstrap' else resampling_info[strata_indicator[group_idx]]))
				
		corr = estimator._corr_from_cov(cov, var_1, var_2, boot=True)
			
		# This replicate is good
		boot_corr[group_idx, 1:] = corr#[:num_boot]
		vals = _fill_corr(boot_corr[group_idx, :])
		
		# Skip if all NaNs
		if np.all(np.isnan(vals)):
			continue
		
		good_idxs[group_idx] = True
		boot_corr[group_idx, :] = vals

	# Skip this gene
	if good_idxs.sum() == 0:
		return np.nan, np.nan, np.nan
	
	vals = _regress_2d(
			design_matrix=design_matrix[good_idxs, :],
			boot_corr=boot_corr[good_idxs, :],
			Nc_list=Nc_list[good_idxs],
			treatment_idx=treatment_idx,
			resampling=resampling,
			**kwargs)
	
	return vals
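
A small sketch (with made-up values) of the strata bookkeeping used above: np.unique with return_inverse maps every design-matrix row, after the treatment column is removed, to the index of its stratum.

import numpy as np

design_matrix = np.array([[1, 0, 1],
                          [1, 1, 1],
                          [1, 0, 0],
                          [1, 1, 0]])
treatment_idx = 1
uniq_strata, strata_indicator = np.unique(
    np.delete(design_matrix, treatment_idx, axis=1), axis=0, return_inverse=True)
print(uniq_strata)        # unique covariate patterns, one row per stratum
print(strata_indicator)   # stratum index of each group, here [1 1 0 0]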
Пример #36
0
def train_gen(batch_size=1000, mwr=0.3, distil_temp=1.0, ret_hashes=False):

    if distil_temp != 1.0:
        with open(
                processed_dir +
                "distil/y_good_distill.mwr-{}.temp-{}.pickle".format(
                    mwr, distil_temp), "rb") as f:
            y_good_distill = pickle.load(f)

        with open(
                processed_dir +
                "distil/y_mal_distill.mwr-{}.temp-{}.pickle".format(
                    mwr, distil_temp), "rb") as f:
            y_mal_distill = pickle.load(f)

    while True:
        perm_good = np.random.permutation(train_good)
        perm_mal = np.random.permutation(train_mal)

        y = np.zeros((batch_size, ), dtype=np.int8)

        mal_batch = int(batch_size * mwr)

        i = 0
        j = 0
        while i < train_mal:
            if train_mal - i < mal_batch:
                mal_batch = train_mal - i
            good_batch = int((mal_batch / mwr) * (1 - mwr))

            full_batch = mal_batch + good_batch

            if full_batch != batch_size:
                y = np.zeros((full_batch, ), dtype=np.int8)

            good_idx = roll(perm_good, j, (j + good_batch) % train_good)
            mal_idx = perm_mal[i:i + mal_batch]

            x_m = vstack([x_manifest[good_idx], x_manifest_mal[mal_idx]])
            x_c = vstack([x_code[good_idx], x_code_mal[mal_idx]])

            if distil_temp != 1.0:
                y[:good_batch] = y_good_distill[good_idx]
                y[good_batch:] = y_mal_distill[mal_idx]
            else:
                y[:good_batch] = 0
                y[good_batch:] = 1

            if ret_hashes:
                hashes = []
                for idx in good_idx:
                    hashes.append(good_hashes[idx])
                for idx in mal_idx:
                    hashes.append(mal_hashes[idx])

                yield ([x_m, x_c], y, hashes)
            else:
                yield ([x_m, x_c], y)

            i = i + mal_batch
            j = (j + good_batch) % train_good
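
A minimal smoke test of the generator above, assuming the module-level globals it depends on (train_good, train_mal, x_manifest, x_code, x_manifest_mal, x_code_mal) have already been loaded:

gen = train_gen(batch_size=1000, mwr=0.3)
(x_m_batch, x_c_batch), y_batch = next(gen)
print(x_m_batch.shape, x_c_batch.shape, y_batch.shape, y_batch.mean())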
Пример #37
0
    def _cross_prod(self):
        s = self.ent_table
        r = self.att_table
        k = self.kfkds
        ns = k[0].shape[0]
        ds = s.shape[1]
        nr = [t.shape[0] for t in self.att_table]
        dr = [t.shape[1] for t in self.att_table]
        if not self.trans:
            if s.size > 0:
                res = self._t_cross(s)
            else:
                res = np.zeros((ns, ns), dtype=float, order='C')
            if all(map(sp.issparse, r)):
                cross_r = [self._t_cross(t).toarray() for t in r]
            else:
                cross_r = [self._t_cross(t) for t in r]
            comp.expand_add(ns, len(k), k, cross_r, nr, res)

            return res
        else:
            if all(map(sp.issparse, self.att_table)):
                other = np.ones((1, ns))
                v = [
                    np.zeros((1, t.shape[0]), dtype=float)
                    for t in self.att_table
                ]
                comp.group(ns, len(k), 1, k, nr, other, v)
                size = self.att_table[0].size
                data = np.empty(size)

                # part 2 and 3 are p.T and p
                comp.multiply_sparse(size, self.att_table[0].row,
                                     self.att_table[0].data, np.sqrt(v[0]),
                                     data)
                diag_part = self._cross(
                    sp.coo_matrix((data, (self.att_table[0].row,
                                          self.att_table[0].col))))
                if ds > 0:
                    m = np.zeros((nr[0], ds))
                    comp.group_left(ns, ds, s, k[0], m)
                    p = self._cross(self.att_table[0], m)
                    s_part = self._cross(self.ent_table)

                    res = sp.vstack((np.hstack(
                        (s_part, p.T)), sp.hstack((p, diag_part))))
                else:
                    res = diag_part

                # multi-table join
                for i in range(1, len(k)):
                    ps = []
                    if ds > 0:
                        m = np.zeros((nr[i], ds))
                        comp.group_left(ns, ds, s, k[i], m)
                        ps += [self._cross(self.att_table[i], m)]

                    # cp (KRi)
                    size = self.att_table[i].size
                    data = np.empty(size)
                    comp.multiply_sparse(size, self.att_table[i].row,
                                         self.att_table[i].data, np.sqrt(v[i]),
                                         data)
                    diag_part = self._cross(
                        sp.coo_matrix((data, (self.att_table[i].row,
                                              self.att_table[i].col))))

                    for j in range(i):
                        ps += [r[i].tocsr()[k[i]].T.dot(r[j].tocsr()[k[j]])]

                    res = sp.vstack((sp.hstack(
                        (res, sp.vstack([p.T for p in ps]))),
                                     sp.hstack(ps + [diag_part])))
            else:
                nt = self.ent_table.shape[1] + sum(
                    [att.shape[1] for att in self.att_table])
                other = np.ones((1, ns))
                v = [
                    np.zeros((1, t.shape[0]), dtype=float)
                    for t in self.att_table
                ]
                res = np.empty((nt, nt))

                data = np.empty(self.att_table[0].shape, order='C')
                comp.group(ns, len(k), 1, k, nr, other, v)
                comp.multiply(self.att_table[0].shape[0],
                              self.att_table[0].shape[1], self.att_table[0],
                              v[0], data)
                res[ds:ds + dr[0], ds:ds + dr[0]] = self._cross(data)

                if ds > 0:
                    m = np.zeros((nr[0], ds))
                    comp.group_left(ns, ds, s, k[0], m)
                    res[ds:ds + dr[0], :ds] = self._cross(self.att_table[0], m)
                    res[:ds, ds:ds + dr[0]] = res[ds:ds + dr[0], :ds].T
                    res[:ds, :ds] = self._cross(self.ent_table)

                # multi-table join
                for i in range(1, len(self.kfkds)):
                    # column offsets of the i-th attribute block; needed below
                    # for the cp(KRi) block even when ds == 0
                    ni1 = ds + sum(
                        [t.shape[1] for t in self.att_table[:i]])
                    ni2 = ni1 + self.att_table[i].shape[1]
                    if ds > 0:
                        m = np.zeros((nr[i], ds))
                        comp.group_left(ns, ds, s, k[i], m)
                        res[ni1:ni2, :ds] = self._cross(self.att_table[i], m)
                        res[:ds, ni1:ni2] = res[ni1:ni2, :ds].T

                    # cp(KRi)
                    data = np.empty(self.att_table[i].shape, order='C')
                    comp.multiply(self.att_table[i].shape[0],
                                  self.att_table[i].shape[1],
                                  self.att_table[i], v[i], data)
                    res[ni1:ni2, ni1:ni2] = self._cross(data)

                    for j in range(i):
                        dj1 = ds + sum(
                            [t.shape[1] for t in self.att_table[:j]])
                        dj2 = dj1 + self.att_table[j].shape[1]

                        if (ns * 1.0 / nr[j]) > (1 + nr[j] * 1.0 / dr[j]):
                            m = np.zeros((nr[i], nr[j]), order='C')
                            comp.group_k_by_k(nr[i], nr[j], ns, k[i], k[j], m)

                            res[ni1:ni2, dj1:dj2] = r[i].T.dot(m.T.dot(r[j]))
                            res[dj1:dj2, ni1:ni2] = res[ni1:ni2, dj1:dj2].T
                        else:
                            res[ni1:ni2,
                                dj1:dj2] = r[i][k[i]].T.dot(r[j][k[j]])
                            res[dj1:dj2, ni1:ni2] = res[ni1:ni2, dj1:dj2].T
            return res
Пример #38
0
def simulate_dataset_with_ambient_rna(
        n_cells: int = 150,
        n_empty: int = 300,
        clusters: int = 3,
        n_genes: int = 10000,
        d_cell: int = 5000,
        d_empty: int = 100,
        cells_in_clusters: Union[List[int], None] = None,
        ambient_different: bool = False,
        chi_input: Union[np.ndarray, None] = None) \
        -> Tuple[sp.csr.csr_matrix, np.ndarray, np.ndarray, np.ndarray]:
    """Simulate a dataset with ambient background RNA counts.

    Empty drops have ambient RNA only, while barcodes with cells have cell
    RNA plus some amount of ambient background RNA (in proportion to the
    sizes of cell and droplet).

    Args:
        n_cells: Number of cells.
        n_empty: Number of empty droplets with only ambient RNA.
        clusters: Number of distinct cell types to simulate.
        n_genes: Number of genes.
        d_cell: Cell size scale factor.
        d_empty: Empty droplet size scale factor.
        cells_in_clusters: Number of cells of each cell type.  If specified,
            the number of ints in this list must be equal to clusters.
        ambient_different: If False, the gene expression profile of ambient
            RNA is drawn from the sum of cellular gene expression.  If True,
            the ambient RNA expression is completely different from cellular
            gene expression.
        chi_input: Gene expression arrays in a matrix, with rows as clusters and
            columns as genes.  Expression should add to one for each row.
            Setting chi=None will generate new chi randomly according to a
            Dirichlet distribution.

    Returns:
        csr_barcode_gene_synthetic: The simulated barcode by gene matrix of
            UMI counts, as a scipy.sparse.csr.csr_matrix.
        z: The simulated cell type identities.  A numpy array of integers,
            one for each barcode. The number 0 is used to denote barcodes
            without a cell present.
        chi: The simulated gene expression, one corresponding to each z.
            Access the vector of gene expression for a given z using chi[z, :].
        d: The simulated size scale factors, one for each barcode.

    """

    assert d_cell > 0, "Location parameter, d_cell, of LogNormal " \
                       "distribution must be greater than zero."
    assert d_empty > 0, "Location parameter, d_empty, of LogNormal " \
                        "distribution must be greater than zero."
    assert clusters > 0, "clusters must be a positive integer."
    assert n_cells > 0, "n_cells must be a positive integer."
    assert n_empty > 0, "n_empty must be a positive integer."
    assert n_genes > 0, "n_genes must be a positive integer."
    if chi_input is not None:
        assert chi_input.shape[0] == clusters, "Chi was specified, but the " \
                                               "number  of rows must match " \
                                               "the number of clusters."
        assert chi_input.shape[1] == n_genes, "Chi was specified, but the " \
                                              "number of columns must match " \
                                              "the number of genes."

    # Figure out how many cells are in each cell cluster.
    if cells_in_clusters is None:
        # No user input: make equal numbers of each cell type
        cells_in_clusters = (np.ones(clusters, dtype=int) *
                             int(n_cells / clusters))
    else:
        assert len(cells_in_clusters) == clusters, "len(cells_in_clusters) " \
                                                   "must equal clusters."
        assert sum(cells_in_clusters) == n_cells, "sum(cells_in_clusters) " \
                                                  "must equal n_cells."

    # Initialize arrays and lists.
    chi = np.zeros((clusters + 1, n_genes))
    csr_list = []
    z = []
    d = []

    if chi_input is not None:

        # Go with the chi that was input.
        chi[1:, :] = chi_input

    else:

        # Get chi for cell expression.
        for i in range(1, clusters + 1):
            chi[i, :] = generate_chi(alpha=0.01, n_genes=n_genes)

    # Get chi for ambient expression.  This becomes chi[0, :].
    if ambient_different:

        # Ambient expression is unrelated to cells, and is itself random.
        chi[0, :] = generate_chi(alpha=0.001, n_genes=n_genes)  # Sparse

    else:

        # Ambient gene expression comes from the sum of cell expression.
        for i in range(1, clusters + 1):

            chi[0, :] += cells_in_clusters[i - 1] * chi[i, :]  # Weighted sum

    chi[0, :] = chi[0, :] / np.sum(chi[0, :])  # Normalize

    # Sample gene expression for ambient.
    csr, d_n = sample_expression_from(chi[0, :],
                                      n=n_empty,
                                      d_mu=np.log(d_empty).item())

    # Add data to lists.
    csr_list.append(csr)
    z = z + [0 for _ in range(csr.shape[0])]
    d = d + [i for i in d_n]

    # Sample gene expression for cells.
    for i in range(1, clusters + 1):

        # Get chi for cells once ambient expression is added.
        chi_tilde = chi[i, :] * d_cell + chi[0, :] * d_empty
        chi_tilde = chi_tilde / np.sum(chi_tilde)  # Normalize
        csr, d_n = sample_expression_from(chi_tilde,
                                          n=cells_in_clusters[i - 1],
                                          d_mu=np.log(d_cell).item())

        # Add data to lists.
        csr_list.append(csr)
        z = z + [i for _ in range(csr.shape[0])]
        d = d + [j for j in d_n]

    # Package the results.
    csr_barcode_gene_synthetic = sp.vstack(csr_list)
    z = np.array(z)
    d = np.array(d)

    # Permute the barcode order and return results.
    order = np.random.permutation(z.size)
    csr_barcode_gene_synthetic = csr_barcode_gene_synthetic[order, ...]
    z = z[order]
    d = d[order]

    return csr_barcode_gene_synthetic, z, chi, d
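
A hypothetical call following the documented signature above; it assumes generate_chi and sample_expression_from are importable from the same module.

import numpy as np

csr_counts, z, chi, d = simulate_dataset_with_ambient_rna(
    n_cells=150, n_empty=300, clusters=3, n_genes=10000,
    cells_in_clusters=[50, 50, 50], ambient_different=False)
print(csr_counts.shape)   # (n_cells + n_empty, n_genes) = (450, 10000)
print(np.unique(z))       # 0 marks empty droplets, 1..clusters mark cell types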
Пример #39
0
def simulate_dataset_without_ambient_rna(
    n_cells: int = 100,
    clusters: int = 1,
    n_genes: int = 10000,
    cells_in_clusters: Union[List[int], None] = None,
    d_cell: int = 5000
) -> Tuple[sp.csr.csr_matrix, np.ndarray, np.ndarray, np.ndarray]:
    """Simulate a dataset with ambient background RNA counts.

    Empty drops have ambient RNA only, while barcodes with cells have cell RNA
    plus some amount of ambient background RNA (in proportion to the sizes of
    cell and droplet).

    Args:
        n_cells: Number of cells.
        clusters: Number of distinct cell types to simulate.
        n_genes: Number of genes.
        d_cell: Cell size scale factor.
        cells_in_clusters: Number of cells of each cell type.  If specified,
            the number of ints in this list must be equal to clusters.

    Returns:
        csr_barcode_gene_synthetic: The simulated barcode by gene matrix of UMI
            counts, as a scipy.sparse.csr.csr_matrix.
        z: The simulated cell type identities.  A numpy array of integers, one
            for each barcode, taking values from 0 to clusters - 1.
        chi: The simulated gene expression, one corresponding to each z.
            Access the vector of gene expression for a given z using chi[z, :].
        d: The simulated size scale factors, one for each barcode.

    """

    assert d_cell > 0, "Location parameter, d_cell, of LogNormal " \
                       "distribution must be greater than zero."
    assert clusters > 0, "clusters must be a positive integer."
    assert n_cells > 0, "n_cells must be a positive integer."
    assert n_genes > 0, "n_genes must be a positive integer."

    # Figure out how many cells are in each cell cluster.
    if cells_in_clusters is None:
        # No user input: make equal numbers of each cell type
        cells_in_clusters = np.ones(clusters) * int(n_cells / clusters)
    else:
        assert len(cells_in_clusters) == clusters, "len(cells_in_clusters) " \
                                                   "must equal clusters."
        assert sum(cells_in_clusters) == n_cells, "sum(cells_in_clusters) " \
                                                  "must equal n_cells."

    # Initialize arrays and lists.
    chi = np.zeros((clusters + 1, n_genes))
    csr_list = []
    z = []
    d = []

    # Get chi for cell expression.
    for i in range(clusters):
        chi[i, :] = generate_chi(alpha=1.0, n_genes=n_genes)
        csr, d_n = sample_expression_from(chi[i, :],
                                          n=int(cells_in_clusters[i]),
                                          d_mu=np.log(d_cell).item())
        csr_list.append(csr)
        z = z + [i for _ in range(csr.shape[0])]
        d = d + [j for j in d_n]

    # Package the results.
    csr_barcode_gene_synthetic = sp.vstack(csr_list)
    z = np.array(z)
    d = np.array(d)

    # Permute the barcode order and return results.
    order = np.random.permutation(z.size)
    csr_barcode_gene_synthetic = csr_barcode_gene_synthetic[order, ...]
    z = z[order]
    d = d[order]

    return csr_barcode_gene_synthetic, z, chi, d
Пример #40
0
    def _one_fit(self):
        if self.verbose:
            print("\nCreating synthetic doublets...")
        self._createDoublets()

        # Normalize combined augmented set
        if self.verbose:
            print("Normalizing...")
        # Library sizes are needed below for the AnnData annotation in either branch
        synth_lib_size = np.sum(self._raw_synthetics, axis=1).A1
        aug_lib_size = np.concatenate([self._lib_size, synth_lib_size])
        if self.normalizer is not None:
            aug_counts = self.normalizer(
                sp_sparse.vstack((self._raw_counts, self._raw_synthetics)))
        else:
            # Follows doubletdetection.plot.normalize_counts, but uses memoized normed raw_counts
            normed_synths = self._raw_synthetics.copy()
            inplace_csr_row_normalize_l1(normed_synths)
            aug_counts = sp_sparse.vstack(
                (self._normed_raw_counts, normed_synths))
            aug_counts = np.log(aug_counts.A * np.median(aug_lib_size) + 0.1)

        self._norm_counts = aug_counts[:self._num_cells]
        self._synthetics = aug_counts[self._num_cells:]

        aug_counts = anndata.AnnData(aug_counts)
        aug_counts.obs["n_counts"] = aug_lib_size
        if self.standard_scaling is True:
            sc.pp.scale(aug_counts, max_value=15)

        if self.verbose:
            print("Running PCA...")
        sc.tl.pca(aug_counts,
                  n_comps=self.n_components,
                  random_state=self.random_state)
        if self.verbose:
            print("Clustering augmented data set...\n")
        sc.pp.neighbors(aug_counts,
                        random_state=self.random_state,
                        method="umap",
                        n_neighbors=10)
        if self.use_phenograph:
            fullcommunities, _, _ = phenograph.cluster(
                aug_counts.obsm["X_pca"], **self.phenograph_parameters)
        else:
            sc.tl.louvain(aug_counts,
                          random_state=self.random_state,
                          resolution=4,
                          directed=False)
            fullcommunities = np.array(aug_counts.obs["louvain"], dtype=int)
        min_ID = min(fullcommunities)
        self.communities_ = fullcommunities[:self._num_cells]
        self.synth_communities_ = fullcommunities[self._num_cells:]
        community_sizes = [
            np.count_nonzero(fullcommunities == i)
            for i in np.unique(fullcommunities)
        ]
        if self.verbose:
            print("Found clusters [{0}, ... {2}], with sizes: {1}\n".format(
                min(fullcommunities), community_sizes, max(fullcommunities)))

        # Count number of fake doublets in each community and assign score
        # Number of synth/orig cells in each cluster.
        synth_cells_per_comm = collections.Counter(self.synth_communities_)
        orig_cells_per_comm = collections.Counter(self.communities_)
        community_IDs = orig_cells_per_comm.keys()
        community_scores = {
            i: float(synth_cells_per_comm[i]) /
            (synth_cells_per_comm[i] + orig_cells_per_comm[i])
            for i in community_IDs
        }
        scores = np.array([community_scores[i] for i in self.communities_])

        community_log_p_values = {
            i: hypergeom.logsf(
                synth_cells_per_comm[i],
                aug_counts.shape[0],
                self._synthetics.shape[0],
                synth_cells_per_comm[i] + orig_cells_per_comm[i],
            )
            for i in community_IDs
        }
        log_p_values = np.array(
            [community_log_p_values[i] for i in self.communities_])

        if min_ID < 0:
            scores[self.communities_ == -1] = np.nan
            log_p_values[self.communities_ == -1] = np.nan

        return scores, log_p_values
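
A small sketch, with made-up numbers, of the enrichment test used for the cluster p-values above: hypergeom.logsf(k, M, n, N) gives log P(X > k) when drawing N barcodes from M total, of which n are synthetic.

from scipy.stats import hypergeom

n_total, n_synth = 1000, 400      # hypothetical augmented data set
k_synth, n_orig = 30, 20          # hypothetical cluster composition
log_p = hypergeom.logsf(k_synth, n_total, n_synth, k_synth + n_orig)
print(log_p)                      # strongly negative => synthetic-enriched cluster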
Пример #41
0
with open(os.path.join(".", "data", vectorizer_name), "rb") as handle:
	Vectorizer = pickle.load(handle)
	
feature_list = []	#Hold feature arrays (scipy sparse matrices)
meta_list = []		#Hold meta-data (Python list)
	
#For each line, get text features and meta-data
for line in Texts:
	
	text_id = line[0]
	speaker_id = line[1]
	line = line[2]
	features = Vectorizer.transform(line)
	
	try:
		meta_data = speakers[speaker_id][class_name]
		feature_list.append(features)
		meta_list.append(meta_data)
		
	except KeyError:
		print("Missing meta-data for " + str(speaker_id))
	
#Now merge into dataframe
features = vstack(feature_list)
meta = np.array(meta_list)

filename = "Senate." + vectorizer_name + ".Features"
save_npz(os.path.join(in_dir, filename), features, compressed = True)

filename = "Senate." + vectorizer_name + ".Classes"
np.save(os.path.join(in_dir, filename), meta, allow_pickle = True)
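
A minimal sketch of loading the two files saved above in a later training step; save_npz and np.save append the .npz / .npy extensions, hence the file names below.

import os
import numpy as np
from scipy.sparse import load_npz

features = load_npz(os.path.join(in_dir, "Senate." + vectorizer_name + ".Features.npz"))
meta = np.load(os.path.join(in_dir, "Senate." + vectorizer_name + ".Classes.npy"),
               allow_pickle=True)
print(features.shape, meta.shape)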
Пример #42
0
def _ht_1d(
	true_mean, # list of means
	true_res_var, # list of residual variances
	cells, # list of sparse vectors/matrices
	approx_sf, # list of dense arrays
	design_matrix,
	Nc_list,
	num_boot,
	treatment_idx,
	mv_fit, # list of tuples
	q, # list of numbers
	_estimator_1d,
	resampling,
	**kwargs):
	
	good_idxs = np.zeros(design_matrix.shape[0], dtype=bool)
	
	# the resampled arrays
	boot_mean = np.zeros((design_matrix.shape[0], num_boot+1))*np.nan
	boot_var = np.zeros((design_matrix.shape[0], num_boot+1))*np.nan
	
	# Get strata-specific pooled information
	if resampling == 'permutation':
		
		uniq_strata, strata_indicator = np.unique(np.delete(design_matrix, treatment_idx, axis=1), axis=0, return_inverse=True)
		resampling_info = {}
		
		for k in range(uniq_strata.shape[0]):
			
			strata_idx = np.where(strata_indicator==k)[0]  # groups belonging to stratum k
			data_list = [cells[i] for i in strata_idx]
			sf_list = [approx_sf[i] for i in strata_idx]
		
			resampling_info[k] = bootstrap._unique_expr(sparse.vstack(data_list, format='csc'), np.concatenate(sf_list))

	for group_idx in range(len(true_mean)):

		# Skip if any of the 1d moments are NaNs
		if np.isnan(true_mean[group_idx]) or \
		   np.isnan(true_res_var[group_idx]) or \
		   true_mean[group_idx] == 0 or \
		   true_res_var[group_idx] < 0:
			continue

		# Fill in the true value
		boot_mean[group_idx, 0], boot_var[group_idx, 0] = np.log(true_mean[group_idx]), np.log(true_res_var[group_idx])
		
		# Generate the bootstrap values
		mean, var = bootstrap._bootstrap_1d(
			data=cells[group_idx],
			size_factor=approx_sf[group_idx],
			num_boot=num_boot,
			q=q[group_idx],
			_estimator_1d=_estimator_1d,
			precomputed= (None if resampling == 'bootstrap' else resampling_info[strata_indicator[group_idx]]))
		
		# Compute the residual variance
		res_var = estimator._residual_variance(mean, var, mv_fit[group_idx])
		
		# Minimize invalid values
		filled_mean = _fill(mean)#_push_nan(mean)#[:num_boot]
		filled_var = _fill(res_var)#_push_nan(res_var)#[:num_boot]
		
		# Make sure its a valid replicate
		if filled_mean is None or filled_var is None:
			continue
		
		boot_mean[group_idx, 1:] = np.log(filled_mean)
		boot_var[group_idx, 1:] = np.log(filled_var)
		
		# This replicate is good
		good_idxs[group_idx] = True
		
	# Skip this gene
	if good_idxs.sum() == 0:
		return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
	
	vals = _regress_1d(
			design_matrix=design_matrix[good_idxs, :],
			boot_mean=boot_mean[good_idxs, :], 
			boot_var=boot_var[good_idxs, :],
			Nc_list=Nc_list[good_idxs],
			treatment_idx=treatment_idx,
			resampling=resampling,
			**kwargs)
	return vals
Пример #43
0
def run(fold):
    # load the training data with folds
    df = pd.read_csv("../inputs/cat-in-the-dat-train-folds.csv")

    # extracting the categorical features
    features = [ x for x in df.columns if x not in ("id", "target", "kfold")]
    
    # Handling NaN values by replacing with NONE
    for col in features:
        df.loc[:, col] = df[col].fillna("NONE").astype(str)  # fill NaNs before casting to str
        
    # training dataset
    df_train = df[df["kfold"] != fold].reset_index(drop=True)
    
    # validation dataset
    df_valid = df[df["kfold"] == fold].reset_index(drop=True)
    
    # full_data = pd.concat(
    #         [df_train[features], df_valid[features]], 
    #         axis=0
    # ) 
    
    full_data = df[features]
    
    # initialize the OneHotEncoder() from sklearn
    ohe = preprocessing.OneHotEncoder()
    
    # fit to the data
    ohe.fit(full_data[features])
    
    # transform training dataset
    x_train = ohe.transform(df_train[features])
     
    # transform validation dataset
    x_valid = ohe.transform(df_valid[features])
    
    # initialize Truncated SVD
    # we are reducing the data to 120 components
    svd = decomposition.TruncatedSVD(n_components=120)
    
    # fit svd on full sparse training data
    full_sparse = sparse.vstack((x_train, x_valid))
    svd.fit(full_sparse)
    
    # transform the training sparse data    
    x_train = svd.transform(x_train)
    
    # transform the validation sparse data
    x_valid = svd.transform(x_valid)
    
    # initialize the RandomForestClassifier
    model = ensemble.RandomForestClassifier(n_jobs=-1)
    
    # fit the data to the model
    model.fit(x_train, df_train.target.values)
    
    # predict the probability of class one for the validation set;
    # [:, 1] selects all rows of the second column (the first column holds class-zero probabilities)
    yhat_ones = model.predict_proba(x_valid)[:,1]
        
    # evaluate the auc score
    auc = metrics.roc_auc_score(df_valid.target.values, yhat_ones)
    
    print(f"Fold: {fold}, AUC Score: {auc}")
Пример #44
0
    def estimate(self, ppci):
        # state vector built from delta, |V| and zero injections
        # Find pq bus with zero p,q and shunt admittance
        zero_injection_bus = np.argwhere(
            ppci["bus"][:, bus_cols + ZERO_INJ_FLAG] == True).ravel()
        ppci["bus"][
            zero_injection_bus,
            [bus_cols + P, bus_cols + P_STD, bus_cols + Q, bus_cols +
             Q_STD]] = np.NaN
        # Within the pq buses with zero injection, identify those that also have no p or q measurement
        p_zero_injections = zero_injection_bus
        q_zero_injections = zero_injection_bus
        new_states = np.zeros(len(p_zero_injections) + len(q_zero_injections))

        slack_buses, non_slack_buses, n_active, r_inv, v_m, delta_masked, delta, z = self.wls_preprocessing(
            ppci)

        E = np.concatenate((delta_masked.compressed(), v_m, new_states))
        # matrix calculation object
        sem = WLSAlgebraZeroInjectionConstraints(ppci, slack_buses,
                                                 non_slack_buses)

        current_error = 100.
        cur_it = 0
        G_m, r, H, h_x = None, None, None, None

        while current_error > self.tolerance and cur_it < self.max_iterations:
            self.logger.debug("Starting iteration {:d}".format(1 + cur_it))
            try:
                # create h(x) for the current iteration
                h_x, c_x = sem.create_hx_cx(v_m, delta, p_zero_injections,
                                            q_zero_injections)

                # residual r
                r = csr_matrix(z - h_x).T
                c_rxh = csr_matrix(c_x).T

                # jacobian matrix H
                H_temp, C_temp = sem.create_jacobian(v_m, delta,
                                                     p_zero_injections,
                                                     q_zero_injections)
                H = csr_matrix(H_temp)
                C = csr_matrix(C_temp)

                # gain matrix G_m
                # G_m = H^t * R^-1 * H
                G_m = H.T * (r_inv * H)

                # building a new gain matrix for new constraints.
                A_1 = vstack([G_m, C])
                c_ax = hstack([C, np.zeros((C.shape[0], C.shape[0]))])
                c_xT = c_ax.T
                M_tx = csr_matrix(hstack(
                    (A_1, c_xT)))  # again adding to the new gain matrix
                rhs = H.T * (r_inv * r)  # original right hand side
                C_rhs = vstack(
                    (rhs, -c_rxh
                     ))  # creating the right hand side with the new constraints

                # state vector difference d_E
                d_E = spsolve(M_tx, C_rhs)
                E += d_E

                # update V/delta
                delta[non_slack_buses] = E[:len(non_slack_buses)]
                v_m = np.squeeze(E[len(non_slack_buses):len(non_slack_buses) +
                                   n_active])

                # prepare next iteration
                cur_it += 1
                current_error = np.max(
                    np.abs(d_E[:len(non_slack_buses) + n_active]))
                self.logger.debug(
                    "Current error: {:.7f}".format(current_error))

            except np.linalg.linalg.LinAlgError:
                self.logger.error(
                    "A problem appeared while using the linear algebra methods."
                    "Check and change the measurement set.")
                return False

        # check if the estimation is successful
        self.check_result(current_error, cur_it)
        return delta, v_m
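
A minimal, made-up sketch of the augmented system assembled inside the loop above: the gain matrix G is bordered with the constraint Jacobian C to form [[G, C^T], [C, 0]], and the right-hand side stacks H^T R^-1 r over -c(x).

import numpy as np
from scipy.sparse import csr_matrix, hstack, vstack
from scipy.sparse.linalg import spsolve

G = csr_matrix(np.array([[4.0, 1.0], [1.0, 3.0]]))   # hypothetical gain matrix
C = csr_matrix(np.array([[1.0, 1.0]]))               # one zero-injection constraint
rhs = np.array([1.0, 2.0])                           # stands in for H.T * (r_inv * r)
c_x = np.array([0.1])                                # constraint mismatch c(x)

M_tx = vstack((hstack((G, C.T)), hstack((C, csr_matrix((1, 1)))))).tocsr()
C_rhs = np.concatenate((rhs, -c_x))
d_E = spsolve(M_tx, C_rhs)[:G.shape[0]]              # drop the Lagrange multiplier
print(d_E)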
Пример #45
0
# large as the order of the PDE being solved (2 in this
# case). Larger values may improve accuracy

# generate nodes
nodes, smpid = menodes(N, vert, smp)
edge_idx, = (smpid >= 0).nonzero()
interior_idx, = (smpid == -1).nonzero()
# create "left hand side" matrix
A_int = weight_matrix(nodes[interior_idx],
                      nodes,
                      diffs=[[2, 0], [0, 2]],
                      n=n,
                      basis=basis,
                      order=order)
A_edg = weight_matrix(nodes[edge_idx], nodes, diffs=[0, 0])
A = vstack((A_int, A_edg))
# create "right hand side" vector
d_int = -1 * np.ones_like(interior_idx)
d_edg = np.zeros_like(edge_idx)
d = np.hstack((d_int, d_edg))
# find the solution at the nodes
u_soln = spsolve(A, d)
# interpolate the solution on a grid
xg, yg = np.meshgrid(np.linspace(-0.05, 2.05, 400),
                     np.linspace(-0.05, 2.05, 400))
points = np.array([xg.flatten(), yg.flatten()]).T
u_itp = LinearNDInterpolator(nodes, u_soln)(points)
# mask points outside of the domain
u_itp[~contains(points, vert, smp)] = np.nan
ug = u_itp.reshape((400, 400))  # fold back into a grid
# make a contour plot of the solution
Пример #46
0
print("Categories:", np.unique(train_labels))
print("Number of unique words:", len(np.unique(np.hstack(train_data))))

# Invert word_index to map integer indices back to words
'''
# Simple Vectoring data
print('Vectoring data')
X_train = pre.vectorize_sequences(train_data)
X_test = pre.vectorize_sequences(test_data)
'''
# TF-IDF vectorizing the data
print('\nVectorizing train data')
X_train, train_labels = tfidf.tf_idf_2doc(train_data, train_labels, feat=10000)
print('\nVectorizing test data')
X_test, test_labels = tfidf.tf_idf_2doc(test_data, test_labels, feat=10000)
data = sp.vstack((X_train, X_test))

# Vectorizing labels
print('\nVectorizing labels')
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')
target = np.append(y_train, y_test)
'''
X_val = X_test[: 10000]
partial_x_train = X_test[10000:]

y_val = y_test[: 10000]
partial_y_train = y_test[10000:]
'''
train_x = data[10000:]
train_y = target[10000:]
Пример #47
0
IDs_arestas_0_locais = np.subtract(IDs_arestas_0, ni + nf)
IDs_faces_0_locais = np.subtract(IDs_faces_0, ni)
IDs_internos_0_locais = IDs_internos_0

IDs_arestas_1_locais = np.setdiff1d(range(na), IDs_arestas_0_locais)

ids_arestas_slin_m0 = np.nonzero(As['Aev'].sum(axis=1))[0]

Aev = As['Aev']
Ivv = As['Ivv']
Aif = As['Aif']
Afe = As['Afe']
invAee = lu_inv4(As['Aee'].tocsc(), ids_arestas_slin_m0)
M2 = -invAee * Aev
PAD = vstack([M2, Ivv])

invAff = invbAff
M3 = -invAff * (Afe * M2)

PAD = vstack([M3, PAD])
invAii = invbAii
PAD = vstack([-invAii * (Aif * M3), PAD])
print("get_OP_AMS", time.time() - ta1)

del M3

ids_1 = mb.tag_get_data(L1_ID_tag, vertices, flat=True)
fine_to_primal1_classic_tag = mb.tag_get_handle('FINE_TO_PRIMAL1_CLASSIC')
ids_class = mb.tag_get_data(fine_to_primal1_classic_tag, vertices, flat=True)
t0 = time.time()
Пример #48
0
    G_zz,
    parallels_x.dot(out['xz']) + parallels_y.dot(out['yz']) +
    parallels_z.dot(out['zz']), idx['boundary:roller'])

# stack the components together. take care to delete matrices when
# we do not need them anymore
del (out, normals_x, normals_y, normals_z, parallels_1, parallels_2,
     parallels_x, parallels_y, parallels_z)

G_x = sp.hstack((G_xx, G_xy, G_xz))
del G_xx, G_xy, G_xz
G_y = sp.hstack((G_yx, G_yy, G_yz))
del G_yx, G_yy, G_yz
G_z = sp.hstack((G_zx, G_zy, G_zz))
del G_zx, G_zy, G_zz
G = sp.vstack((G_x, G_y, G_z))
del G_x, G_y, G_z
G = G.tocsc()
G.eliminate_zeros()

# create the right-hand-side vector
d_x = np.zeros((N, ))
d_y = np.zeros((N, ))
d_z = np.zeros((N, ))

d_x[idx['interior']] = 0.0
d_x[idx['ghosts:free']] = 0.0
d_x[idx['ghosts:roller']] = 0.0
d_x[idx['boundary:free']] = 0.0
d_x[idx['boundary:roller']] = 0.0
Пример #49
0
    def _non_rigid_icp_iter(self, source, target, closest_points_on_target, M_kron_G, alpha, gamma):
        """
        Non-rigid icp for each iteration.

        Parameters:
            source (menpo.shape.mesh.base.TriMesh): original source mesh to be transformed
            target (menpo.shape.mesh.base.TriMesh): target mesh as the base
            closest_points_on_target (menpo3d.vtkutils.VTKClosestPointLocator): octree for finding nearest neighbor
            M_kron_G (scipy.sparse.coo.coo_matrix): matrix M kron matrix G
            alpha (float): stiffness weight
            gamma (float): data weight

        Returns:
            current_instance (menpo.shape.mesh.base.TriMesh): transformed source mesh
            training_info (dict): dict with lists of loss and regularized_loss recorded during training
        """
        # init transformation
        n_dims = source.n_dims
        h_dims = n_dims + 1
        n = source.points.shape[0]
        v_i = source.points

        # we need to prepare some indices for efficient construction of the D sparse matrix.
        row = np.hstack((np.repeat(np.arange(n)[:, None], n_dims, axis=1).ravel(), np.arange(n)))
        x = np.arange(n * h_dims).reshape((n, h_dims))
        col = np.hstack((x[:, :n_dims].ravel(), x[:, n_dims]))
        ones = np.ones(n)
        alpha_M_kron_G = alpha * M_kron_G

        # start iteration
        training_info = {'loss': [], 'regularized_loss': []}
        iter_ = 0
        while iter_ < self.max_iter:
            iter_ += 1
            NonRigidIcp._iter_counter += 1

            # find nearest neighbour and the normals
            U, tri_indices = closest_points_on_target(v_i)

            data = np.hstack((v_i.ravel(), ones))
            D = sp.coo_matrix((data, (row, col)))

            to_stack_A = [alpha_M_kron_G, D]
            to_stack_B = [np.zeros((alpha_M_kron_G.shape[0], n_dims)), U]

            A = sp.vstack(to_stack_A).tocsr()
            B = sp.vstack(to_stack_B).tocsr()

            X = math_helper.Solver.linear_solver(A, B, self.solver)

            # deform template
            v_i = np.array(D.dot(X))

            loss = np.linalg.norm(A @ X - B, ord='fro')
            regularized_loss = loss / len(source.points)
            training_info['loss'].append(loss)
            training_info['regularized_loss'].append(regularized_loss)

            # running mean of the regularized loss over all iterations so far
            NonRigidIcp._average_regularized_loss = \
                ((NonRigidIcp._iter_counter - 1) * NonRigidIcp._average_regularized_loss +
                 regularized_loss) / NonRigidIcp._iter_counter

            if self.verbose:
                info = ' - {} loss: {:.3f} regularized_loss: {:.3f}  '.format(iter_, loss, regularized_loss)
                print(info)
            else:
                progress_bar = "["
                if NonRigidIcp._num_of_meshes is not None:
                    progress = int(10.0 * NonRigidIcp._mesh_counter / NonRigidIcp._num_of_meshes)
                    for _ in range(progress-1):
                        progress_bar += "="
                    progress_bar += ">"
                    for _ in range(10 - progress - 1):
                        progress_bar += "."
                    progress_bar += "] " + str(NonRigidIcp._mesh_counter) + "/" + str(NonRigidIcp._num_of_meshes)
                else:
                    progress_bar += str(NonRigidIcp._num_of_meshes) + "]"
                if self._expected_remaining_time is not None:
                    progress_bar += " | remaining time: " + self._expected_remaining_time

                print(("loss @ this iter: {:.3f} | "
                      "loss/iter: {:.3f} | "
                       + progress_bar)
                      .format(regularized_loss,
                              NonRigidIcp._average_regularized_loss
                              ), end="\r", flush=True)

            if regularized_loss < self.eps:
                break

        current_instance = source.copy()
        current_instance.points = v_i.copy()

        return current_instance, training_info
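
A small sketch of the M_kron_G argument expected by the method above, assuming the usual Amberg-style construction: M is the node-arc incidence matrix of the template edges and G = diag(1, 1, 1, gamma) weights the homogeneous coordinates.

import numpy as np
import scipy.sparse as sp

edges = np.array([[0, 1], [1, 2]])           # hypothetical template edges
n_nodes, gamma = 3, 1.0
M = sp.lil_matrix((len(edges), n_nodes))
for e, (i, j) in enumerate(edges):
    M[e, i], M[e, j] = -1.0, 1.0             # incidence of edge e
G = sp.diags([1.0, 1.0, 1.0, gamma])
M_kron_G = sp.kron(M, G).tocoo()             # shape (len(edges) * 4, n_nodes * 4)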
Пример #50
0
    def perform_EM(self, X_l, y_l, X_u):
        nb_clf = MultinomialNB(alpha=0.01)
        nb_clf.fit(X_l, y_l)

        # calculate log likelihood
        class_log_prior = (nb_clf.class_log_prior_).tolist()
        word_given_class = nb_clf.feature_log_prob_
        class_size = len(nb_clf.class_count_)
        un_sum_outer = 0
        for doc in X_u:
            sum_inner = 0
            for index in range(class_size):
                sum_inner += (class_log_prior[index] *
                              np.sum(word_given_class[index, :]))
            un_sum_outer += sum_inner

        lb_sum = 0
        for index, doc in enumerate(X_l):
            sum_inner = 0
            given_label = y_l[index]
            sum_inner = (class_log_prior[given_label] *
                         np.sum(word_given_class[given_label, :]))
            lb_sum += sum_inner

        log_likelihood = (-1 * (lb_sum + un_sum_outer))
        prev_log = float("-inf")
        current_log = log_likelihood
        count = 0
        # remove this line
        while (abs(current_log - prev_log) > 1e-6):
            # Estimation step
            Y_u = nb_clf.predict(X_u)

            # Maximize step
            X_new = vstack([X_l, X_u])
            Y_new = np.concatenate((y_l, Y_u), axis=0)
            nb_clf.fit(X_new, Y_new)

            # calculate log likelihood
            class_log_prior = (nb_clf.class_log_prior_).tolist()
            word_given_class = nb_clf.feature_log_prob_
            class_size = len(nb_clf.class_count_)
            count += 1
            un_sum_outer = 0
            for doc in X_u:
                sum_inner = 0
                for index in range(class_size):
                    sum_inner += (class_log_prior[index] *
                                  np.sum(word_given_class[index, :]))
                un_sum_outer += sum_inner

            lb_sum = 0
            for index, doc in enumerate(X_l):
                sum_inner = 0
                given_label = y_l[index]
                sum_inner = (class_log_prior[given_label] *
                             np.sum(word_given_class[given_label, :]))
                lb_sum += sum_inner

            log_likelihood = (-1 * (lb_sum + un_sum_outer))
            prev_log = current_log
            current_log = log_likelihood
            print("log_likelihood ", log_likelihood)
        return nb_clf
Пример #51
0
def load_gcn_data(dataset_str):
    npz_file = 'data/{}_{}.npz'.format(dataset_str, FLAGS.normalization)
    if os.path.exists(npz_file):
        start_time = time()
        print('Found preprocessed dataset {}, loading...'.format(npz_file))
        data = np.load(npz_file)
        num_data     = data['num_data']
        labels       = data['labels']
        train_data   = data['train_data']
        val_data     = data['val_data']
        test_data    = data['test_data']
        train_adj = sp.csr_matrix((data['train_adj_data'], data['train_adj_indices'], data['train_adj_indptr']), shape=data['train_adj_shape'])
        full_adj = sp.csr_matrix((data['full_adj_data'], data['full_adj_indices'], data['full_adj_indptr']), shape=data['full_adj_shape'])
        feats = sp.csr_matrix((data['feats_data'], data['feats_indices'], data['feats_indptr']), shape=data['feats_shape'])
        train_feats = sp.csr_matrix((data['train_feats_data'], data['train_feats_indices'], data['train_feats_indptr']), shape=data['train_feats_shape'])
        test_feats = sp.csr_matrix((data['test_feats_data'], data['test_feats_indices'], data['test_feats_indptr']), shape=data['test_feats_shape'])
        print('Finished in {} seconds.'.format(time() - start_time))
    else:
        """Load data."""
        names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
        objects = []
        for i in range(len(names)):
            with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
                if sys.version_info > (3, 0):
                    objects.append(pkl.load(f, encoding='latin1'))
                else:
                    objects.append(pkl.load(f))

        x, y, tx, ty, allx, ally, graph = tuple(objects)

        if dataset_str != 'nell':
            test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
            test_idx_range = np.sort(test_idx_reorder)

            if dataset_str == 'citeseer':
                # Fix citeseer dataset (there are some isolated nodes in the graph)
                # Find isolated nodes, add them as zero-vecs into the right position
                test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
                tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
                tx_extended[test_idx_range-min(test_idx_range), :] = tx
                tx = tx_extended
                ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
                ty_extended[test_idx_range-min(test_idx_range), :] = ty
                ty = ty_extended

            features = sp.vstack((allx, tx)).tolil()
            features[test_idx_reorder, :] = features[test_idx_range, :]
            adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

            labels = np.vstack((ally, ty))
            labels[test_idx_reorder, :] = labels[test_idx_range, :]

            idx_test = test_idx_range.tolist()
            # idx_train = range(len(y)) 
            idx_train = range(18217) 
            idx_val = range(len(y), len(y)+500)

            train_mask = sample_mask(idx_train, labels.shape[0])
            val_mask = sample_mask(idx_val, labels.shape[0])
            test_mask = sample_mask(idx_test, labels.shape[0])

            y_train = np.zeros(labels.shape)
            y_val = np.zeros(labels.shape)
            y_test = np.zeros(labels.shape)
            y_train[train_mask, :] = labels[train_mask, :]
            y_val[val_mask, :] = labels[val_mask, :]
            y_test[test_mask, :] = labels[test_mask, :]
        else:
            test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
            features = allx.tocsr()
            adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
            labels = ally
            idx_test = test_idx_reorder
            idx_train = range(len(y))
            idx_val = range(len(y), len(y)+969)
            train_mask = sample_mask(idx_train, labels.shape[0])
            val_mask = sample_mask(idx_val, labels.shape[0])
            test_mask = sample_mask(idx_test, labels.shape[0])
            y_train = np.zeros(labels.shape)
            y_val = np.zeros(labels.shape)
            y_test = np.zeros(labels.shape)
            y_train[train_mask, :] = labels[train_mask, :]
            y_val[val_mask, :] = labels[val_mask, :]
            y_test[test_mask, :] = labels[test_mask, :]

        # num_data, (v, coords), feats, labels, train_d, val_d, test_d
        num_data = features.shape[0]
        def _normalize_adj(adj):
            rowsum = np.array(adj.sum(1)).flatten()
            d_inv  = 1.0 / (rowsum+1e-20)
            d_mat_inv = sp.diags(d_inv, 0)
            adj = d_mat_inv.dot(adj).tocoo()
            coords = np.array((adj.row, adj.col)).astype(np.int32)
            return adj.data.astype(np.float32), coords

        def gcn_normalize_adj(adj):
            adj = adj + sp.eye(adj.shape[0])
            rowsum = np.array(adj.sum(1)) + 1e-20
            d_inv_sqrt = np.power(rowsum, -0.5).flatten()
            d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
            d_mat_inv_sqrt = sp.diags(d_inv_sqrt, 0)
            adj = adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt)
            adj = adj.tocoo()
            coords = np.array((adj.row, adj.col)).astype(np.int32)
            return adj.data.astype(np.float32), coords

        # Normalize features
        rowsum = np.array(features.sum(1)) + 1e-9
        r_inv = np.power(rowsum, -1).flatten()
        r_inv[np.isinf(r_inv)] = 0.
        r_mat_inv = sp.diags(r_inv, 0)
        features = r_mat_inv.dot(features)

        if FLAGS.normalization == 'gcn':
            full_v, full_coords = gcn_normalize_adj(adj)
        else:
            full_v, full_coords = _normalize_adj(adj)
        full_v = full_v.astype(np.float32)
        full_coords = full_coords.astype(np.int32)
        train_v, train_coords = full_v, full_coords
        labels = (y_train + y_val + y_test).astype(np.float32)
        train_data = np.nonzero(train_mask)[0].astype(np.int32)
        val_data   = np.nonzero(val_mask)[0].astype(np.int32)
        test_data  = np.nonzero(test_mask)[0].astype(np.int32)

        feats = (features.data, features.indices, features.indptr, features.shape)

        def _get_adj(data, coords):
            adj = sp.csr_matrix((data, (coords[0,:], coords[1,:])), 
                                shape=(num_data, num_data))
            return adj

        train_adj = _get_adj(train_v, train_coords)
        full_adj  = _get_adj(full_v,  full_coords)
        feats = sp.csr_matrix((feats[0], feats[1], feats[2]), 
                              shape=feats[-1], dtype=np.float32)

        train_feats = train_adj.dot(feats)
        test_feats  = full_adj.dot(feats)

        with open(npz_file, 'wb') as fwrite:
            np.savez(fwrite, num_data=num_data, 
                             train_adj_data=train_adj.data, train_adj_indices=train_adj.indices, train_adj_indptr=train_adj.indptr, train_adj_shape=train_adj.shape,
                             full_adj_data=full_adj.data, full_adj_indices=full_adj.indices, full_adj_indptr=full_adj.indptr, full_adj_shape=full_adj.shape,
                             feats_data=feats.data, feats_indices=feats.indices, feats_indptr=feats.indptr, feats_shape=feats.shape,
                             train_feats_data=train_feats.data, train_feats_indices=train_feats.indices, train_feats_indptr=train_feats.indptr, train_feats_shape=train_feats.shape,
                             test_feats_data=test_feats.data, test_feats_indices=test_feats.indices, test_feats_indptr=test_feats.indptr, test_feats_shape=test_feats.shape,
                             labels=labels,
                             train_data=train_data, val_data=val_data, 
                             test_data=test_data)

    return num_data, train_adj, full_adj, feats, train_feats, test_feats, labels, train_data, val_data, test_data
Пример #52
0
def main(args):
    start_time = time.time()
    print("Running XGBoost Classifier")

    print("Reading blacklist words file")
    load_blacklist_words("../data/blacklist.txt")

    print("Reading raw gender-comment data")
    with open("../data/male-comments.json", "r") as f:
        male_comment = json.load(f)
    with open("../data/female-comments.json", "r") as f:
        female_comment = json.load(f)

    # Lower case all comments
    male_comment = [[x[0], x[1].lower()] for x in male_comment]
    female_comment = [[x[0], x[1].lower()] for x in female_comment]

    # Filter blacklisted words in comments
    male_comment = [[x[0], x[1]] for x in male_comment
                    if all(c not in BLACKLIST_WORDS for c in x[1].split(" "))]
    female_comment = [[x[0], x[1]] for x in female_comment if all(
        c not in BLACKLIST_WORDS for c in x[1].split(" "))]

    random.shuffle(male_comment)
    random.shuffle(female_comment)
    print("Loaded {} male and {} female comments".format(
        len(male_comment), len(female_comment)))

    female_ratio = 1.0 - args.male_female_ratio
    if args.limit != -1:
        print(
            "Limiting male and female comments to {} male and {} female ({} total)"
            .format(int(args.limit * args.male_female_ratio),
                    int(args.limit * female_ratio), args.limit))
        try:
            del male_comment[int(args.limit * args.male_female_ratio):]
            del female_comment[int(args.limit * female_ratio):]
        except:
            print("Not enough male/female comments data")
            sys.exit(1)

    gender_comment = []
    for idx, data in enumerate(male_comment):
        data[1] = data[1].lower()
        gender_comment.append(data)
    for idx, data in enumerate(female_comment):
        data[1] = data[1].lower()
        gender_comment.append(data)
    random.shuffle(gender_comment)

    list_of_words = set()
    for data in gender_comment:
        list_of_words.update(data[1].split(" "))
    list_of_words = list(list_of_words)
    word_count = len(list_of_words)

    if args.cache:
        cache.cache_list_of_words(list_of_words)

    print("Total of {} words found\n".format(word_count))

    data = coo_matrix((1, 1))
    label = []
    total = len(gender_comment)
    start_progress("Processing {} raw gender-comment data".format(total))
    for i, j in enumerate(gender_comment):
        if j[0] == "female":  # Label for female = 0, and male = 1
            label.append(0)
        else:
            label.append(1)

        wc = {}
        for word in j[1].split():
            if word in wc:
                wc[word] += 1
            else:
                wc[word] = 1

        d = []
        for idx in range(word_count):
            count = 0
            if list_of_words[idx] in wc:
                count = wc[list_of_words[idx]]
            d.append(count)

        if i == 0:
            data = coo_matrix(d)
        else:
            data = vstack((data, coo_matrix(d)))

        progress((i + 1) / total * 100)
        if i == total:
            break
    end_progress()

    if args.cache:
        cache.cache_data_and_label(data, label, word_count)

    run_tests(data, label, total, args.split, args.gamma, args.learning_rate,
              args.n_estimators)

    print("Elapsed time: {0:.2f}s".format(time.time() - start_time))
Пример #53
0
    def _fit_resample(self, X, y):
        self._validate_estimator()
        random_state = check_random_state(self.random_state)
        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = _safe_indexing(X, target_class_indices)

            self.svm_estimator_.fit(X, y)
            support_index = self.svm_estimator_.support_[y[
                self.svm_estimator_.support_] == class_sample]
            support_vector = _safe_indexing(X, support_index)

            self.nn_m_.fit(X)
            noise_bool = self._in_danger_noise(self.nn_m_,
                                               support_vector,
                                               class_sample,
                                               y,
                                               kind="noise")
            support_vector = _safe_indexing(
                support_vector, np.flatnonzero(np.logical_not(noise_bool)))
            danger_bool = self._in_danger_noise(self.nn_m_,
                                                support_vector,
                                                class_sample,
                                                y,
                                                kind="danger")
            safety_bool = np.logical_not(danger_bool)

            self.nn_k_.fit(X_class)
            fractions = random_state.beta(10, 10)
            n_generated_samples = int(fractions * (n_samples + 1))
            if np.count_nonzero(danger_bool) > 0:
                nns = self.nn_k_.kneighbors(
                    _safe_indexing(support_vector,
                                   np.flatnonzero(danger_bool)),
                    return_distance=False,
                )[:, 1:]

                X_new_1, y_new_1 = self._make_samples(
                    _safe_indexing(support_vector,
                                   np.flatnonzero(danger_bool)),
                    y.dtype,
                    class_sample,
                    X_class,
                    nns,
                    n_generated_samples,
                    step_size=1.0,
                )

            if np.count_nonzero(safety_bool) > 0:
                nns = self.nn_k_.kneighbors(
                    _safe_indexing(support_vector,
                                   np.flatnonzero(safety_bool)),
                    return_distance=False,
                )[:, 1:]

                X_new_2, y_new_2 = self._make_samples(
                    _safe_indexing(support_vector,
                                   np.flatnonzero(safety_bool)),
                    y.dtype,
                    class_sample,
                    X_class,
                    nns,
                    n_samples - n_generated_samples,
                    step_size=-self.out_step,
                )

            if (np.count_nonzero(danger_bool) > 0
                    and np.count_nonzero(safety_bool) > 0):
                if sparse.issparse(X_resampled):
                    X_resampled = sparse.vstack(
                        [X_resampled, X_new_1, X_new_2])
                else:
                    X_resampled = np.vstack((X_resampled, X_new_1, X_new_2))
                y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2),
                                             axis=0)
            elif np.count_nonzero(danger_bool) == 0:
                if sparse.issparse(X_resampled):
                    X_resampled = sparse.vstack([X_resampled, X_new_2])
                else:
                    X_resampled = np.vstack((X_resampled, X_new_2))
                y_resampled = np.concatenate((y_resampled, y_new_2), axis=0)
            elif np.count_nonzero(safety_bool) == 0:
                if sparse.issparse(X_resampled):
                    X_resampled = sparse.vstack([X_resampled, X_new_1])
                else:
                    X_resampled = np.vstack((X_resampled, X_new_1))
                y_resampled = np.concatenate((y_resampled, y_new_1), axis=0)

        return X_resampled, y_resampled
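
This `_fit_resample` looks like imbalanced-learn's SVMSMOTE. A minimal usage sketch under that assumption (the toy data is made up for illustration):

import numpy as np
from sklearn.datasets import make_classification
from imblearn.over_sampling import SVMSMOTE

X, y = make_classification(n_samples=200, n_features=5,
                           weights=[0.9, 0.1], random_state=0)
X_res, y_res = SVMSMOTE(random_state=0).fit_resample(X, y)
print(np.bincount(y), "->", np.bincount(y_res))  # class counts before/after resampling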
Example #54
0
    data_all, _ = split_dataframe_partial_user_holdout(
        df_all,
        "userId",
        "movieId",
        test_user_ratio=0.2,
        val_user_ratio=0.2,
        heldout_ratio_test=0.5,
        heldout_ratio_val=0.5,
    )

    data_train = data_all["train"]
    data_val = data_all["val"]
    data_test = data_all["test"]

    X_train_all: sps.csr_matrix = sps.vstack(
        [data_train.X_train, data_val.X_train, data_test.X_train],
        format="csr")
    X_train_val_all: sps.csr_matrix = sps.vstack(
        [data_train.X_all, data_val.X_all, data_test.X_train], format="csr")
    valid_evaluator = Evaluator(
        ground_truth=data_val.X_test,
        offset=data_train.n_users,
        cutoff=BASE_CUTOFF,
    )
    test_evaluator = Evaluator(
        ground_truth=data_test.X_test,
        offset=data_train.n_users + data_val.n_users,
        cutoff=BASE_CUTOFF,
    )

    test_results = []
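
The evaluators above locate validation and test users inside the vstacked matrices purely by row offset. A toy sketch of that row layout (the shapes are made up for illustration):

import numpy as np
import scipy.sparse as sps

X_tr = sps.csr_matrix(np.eye(3))     # 3 "training" users
X_va = sps.csr_matrix(np.eye(2, 3))  # 2 "validation" users
X_te = sps.csr_matrix(np.eye(1, 3))  # 1 "test" user
X_all = sps.vstack([X_tr, X_va, X_te], format="csr")

# Validation user v sits at row X_tr.shape[0] + v after stacking, which is
# what offset=data_train.n_users accounts for in the Evaluator above.
v = 1
assert (X_all[X_tr.shape[0] + v] != X_va[v]).nnz == 0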
Example #55
0
def osqp_solve_qp(P,
                  q,
                  G=None,
                  h=None,
                  A=None,
                  b=None,
                  initvals=None,
                  verbose=False,
                  eps_abs=1e-5,
                  eps_rel=1e-5,
                  polish=True):
    """
    Solve a Quadratic Program defined as:

    .. math::

        \\begin{split}\\begin{array}{ll}
        \\mbox{minimize} &
            \\frac{1}{2} x^T P x + q^T x \\\\
        \\mbox{subject to}
            & G x \\leq h                \\\\
            & A x = b
        \\end{array}\\end{split}

    using `OSQP <https://github.com/oxfordcontrol/osqp>`_.

    Parameters
    ----------
    P : scipy.sparse.csc_matrix
        Symmetric quadratic-cost matrix.
    q : numpy.array
        Cost vector (the linear term of the quadratic objective).
    G : scipy.sparse.csc_matrix
        Linear inequality constraint matrix.
    h : numpy.array
        Linear inequality constraint vector.
    A : scipy.sparse.csc_matrix, optional
        Linear equality constraint matrix.
    b : numpy.array, optional
        Linear equality constraint vector.
    initvals : numpy.array, optional
        Warm-start guess vector.
    verbose : bool, optional
        Set to `True` to print out extra information.
    eps_abs : scalar, optional
        Absolute convergence tolerance of the solver. Lower values yield more
        precise solutions at the cost of computation time.
    eps_rel : scalar, optional
        Relative convergence tolerance of the solver. Lower values yield more
        precise solutions at the cost of computation time.
    polish : bool, optional
        Perform `polishing <https://osqp.org/docs/solver/#polishing>`_, an
        additional step where the solver tries to improve the accuracy of the
        solution. Default is ``True``.

    Returns
    -------
    x : array, shape=(n,)
        Solution to the QP, if found, otherwise ``None``.

    Note
    ----
    OSQP requires `P` to be symmetric and does not check this for you. Watch
    out for this point if you e.g. `get nan values
    <https://github.com/oxfordcontrol/osqp/issues/10>`_ in your solutions.

    Note
    ----
    As of OSQP v0.6.1, the default values for both absolute and relative
    tolerances are set to ``1e-3``, which results in low solver times but
    imprecise solutions compared to the other QP solvers. We lower them to
    ``1e-5`` so that OSQP behaves closer to the norm in terms of numerical
    accuracy.
    """
    if type(P) is ndarray:
        warn(conversion_warning("P"))
        P = csc_matrix(P)
    solver = OSQP()
    kwargs = {
        'eps_abs': eps_abs,
        'eps_rel': eps_rel,
        'polish': polish,
        'verbose': verbose
    }
    if A is None and G is None:
        solver.setup(P=P, q=q, **kwargs)
    elif A is not None:
        if type(A) is ndarray:
            warn(conversion_warning("A"))
            A = csc_matrix(A)
        if G is None:
            solver.setup(P=P, q=q, A=A, l=b, u=b, **kwargs)
        else:  # G is not None
            l = -inf * ones(len(h))
            qp_A = vstack([G, A]).tocsc()
            qp_l = hstack([l, b])
            qp_u = hstack([h, b])
            solver.setup(P=P, q=q, A=qp_A, l=qp_l, u=qp_u, **kwargs)
    else:  # A is None
        if type(G) is ndarray:
            warn(conversion_warning("G"))
            G = csc_matrix(G)
        l = -inf * ones(len(h))
        solver.setup(P=P, q=q, A=G, l=l, u=h, **kwargs)
    if initvals is not None:
        solver.warm_start(x=initvals)
    res = solver.solve()
    if hasattr(solver, 'constant'):
        success_status = solver.constant('OSQP_SOLVED')
    else:  # more recent versions of OSQP
        success_status = osqp.constant('OSQP_SOLVED')
    if res.info.status_val != success_status:
        print("OSQP exited with status '%s'" % res.info.status)
    return res.x
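
A small call sketch for the wrapper above, solving an inequality-only QP (the matrices are made up; the numpy and scipy.sparse imports mirror what the module already uses):

import numpy as np
from scipy.sparse import csc_matrix

# minimize 1/2 x^T P x + q^T x  subject to  G x <= h  (here: x >= 0)
P = csc_matrix(np.array([[4.0, 1.0], [1.0, 2.0]]))  # symmetric, as OSQP requires
q = np.array([1.0, 1.0])
G = csc_matrix(np.array([[-1.0, 0.0], [0.0, -1.0]]))
h = np.zeros(2)

x = osqp_solve_qp(P, q, G, h)
print(x)  # expected to lie on the boundary of the feasible set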
Example #56
0
    def _fit_resample(self, X, y):
        self._validate_estimator()

        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = _safe_indexing(X, target_class_indices)

            self.nn_m_.fit(X)
            danger_index = self._in_danger_noise(self.nn_m_,
                                                 X_class,
                                                 class_sample,
                                                 y,
                                                 kind="danger")
            if not any(danger_index):
                continue

            self.nn_k_.fit(X_class)
            nns = self.nn_k_.kneighbors(_safe_indexing(X_class, danger_index),
                                        return_distance=False)[:, 1:]

            # divergence between borderline-1 and borderline-2
            if self.kind == "borderline-1":
                # Create synthetic samples for borderline points.
                X_new, y_new = self._make_samples(
                    _safe_indexing(X_class, danger_index),
                    y.dtype,
                    class_sample,
                    X_class,
                    nns,
                    n_samples,
                )
                if sparse.issparse(X_new):
                    X_resampled = sparse.vstack([X_resampled, X_new])
                else:
                    X_resampled = np.vstack((X_resampled, X_new))
                y_resampled = np.hstack((y_resampled, y_new))

            elif self.kind == "borderline-2":
                random_state = check_random_state(self.random_state)
                fractions = random_state.beta(10, 10)

                # only minority
                X_new_1, y_new_1 = self._make_samples(
                    _safe_indexing(X_class, danger_index),
                    y.dtype,
                    class_sample,
                    X_class,
                    nns,
                    int(fractions * (n_samples + 1)),
                    step_size=1.0,
                )

                # we use a one-vs-rest policy to handle the multiclass case, in
                # which new samples are created considering not only the majority
                # class but all other classes.
                X_new_2, y_new_2 = self._make_samples(
                    _safe_indexing(X_class, danger_index),
                    y.dtype,
                    class_sample,
                    _safe_indexing(X, np.flatnonzero(y != class_sample)),
                    nns,
                    int((1 - fractions) * n_samples),
                    step_size=0.5,
                )

                if sparse.issparse(X_resampled):
                    X_resampled = sparse.vstack(
                        [X_resampled, X_new_1, X_new_2])
                else:
                    X_resampled = np.vstack((X_resampled, X_new_1, X_new_2))
                y_resampled = np.hstack((y_resampled, y_new_1, y_new_2))

        return X_resampled, y_resampled
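
This method matches imbalanced-learn's BorderlineSMOTE, where `kind` selects the borderline-1 or borderline-2 branch above. A minimal sketch under that assumption (toy data made up for illustration):

import numpy as np
from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

X, y = make_classification(n_samples=200, n_features=5,
                           weights=[0.9, 0.1], random_state=0)
for kind in ("borderline-1", "borderline-2"):
    X_res, y_res = BorderlineSMOTE(kind=kind, random_state=0).fit_resample(X, y)
    print(kind, np.bincount(y_res))  # class counts after resampling with each variant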
Example #57
0
            n_ds = float(sum(ds_rep == ds))
            if n_ds == 0:  # 0 log 0 = 0
                continue
            H += (n_ds / n_cluster) * np.log(n_ds / n_cluster)
        H *= -1
        H /= np.log(len(datasets))

        Hs.append(H)

    return np.mean(Hs)


if __name__ == '__main__':
    datasets, genes_list, n_cells = load_names(data_names, norm=False)
    datasets, genes = merge_datasets(datasets, genes_list)
    X = vstack(datasets)

    gt_idx = [i for i, s in enumerate(np.sum(X != 0, axis=1)) if s >= 500]
    X = X[gt_idx]

    if not os.path.isfile('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE)):
        log('Dimension reduction with {}...'.format(METHOD))
        X_dimred = reduce_dimensionality(normalize(X),
                                         method=METHOD,
                                         dimred=DIMRED)
        log('Dimensionality = {}'.format(X_dimred.shape[1]))
        np.savetxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE), X_dimred)
    else:
        X_dimred = np.loadtxt('data/dimred/{}_{}.txt'.format(
            METHOD, NAMESPACE))
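
The truncated loop at the top of this example computes, per cluster, the normalized Shannon entropy of dataset membership and then averages it over clusters. A self-contained sketch of that computation (the names cluster_labels, ds_labels, and n_datasets are assumptions, not the original code's variables):

import numpy as np

def mixing_entropy(cluster_labels, ds_labels, n_datasets):
    # Normalized entropy of dataset membership within each cluster, averaged
    # over clusters: 1 means datasets are perfectly mixed, 0 means each
    # cluster is dominated by a single dataset.
    Hs = []
    for c in np.unique(cluster_labels):
        ds_rep = ds_labels[cluster_labels == c]
        n_cluster = float(len(ds_rep))
        H = 0.0
        for ds in range(n_datasets):
            n_ds = float(np.sum(ds_rep == ds))
            if n_ds == 0:  # 0 log 0 = 0
                continue
            H += (n_ds / n_cluster) * np.log(n_ds / n_cluster)
        H *= -1
        H /= np.log(n_datasets)
        Hs.append(H)
    return np.mean(Hs)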
Example #58
0
                                           log1p=True)
datasets.append(train_X)
genes_list.append(tms_genes_list)
data_names_all.append('TMS')

## embed the cell ontology
unseen_l, l2i, i2l, onto_net, Y_emb, cls2cls = ParseCLOnto(train_Y_str)
train_Y = MapLabel2CL(train_Y_str, l2i)

## use Scanorama to correct batch effects
datasets, genes = merge_datasets(datasets, genes_list)
datasets_dimred, genes = process_data(datasets, genes, dimred=100)
expr_datasets = my_assemble(datasets_dimred,
                            ds_names=data_names_all,
                            expr_datasets=datasets,
                            sigma=150)[1]
expr_corrected = sparse.vstack(expr_datasets)
expr_corrected = np.log2(expr_corrected.toarray() + 1)

## annotate 26-datasets, train on TMS
ntrain, ngene = np.shape(train_X)
nsample = np.shape(expr_corrected)[0]
train_X_corrected = expr_corrected[nsample - ntrain:, :]
test_X_corrected = expr_corrected[:nsample - ntrain, :]
OnClass_obj = OnClassPred()
OnClass_obj.train(train_X_corrected, train_Y, Y_emb, log_transform=False)
test_Y_pred = OnClass_obj.predict(test_X_corrected, log_transform=False)

## save the prediction matrix, nsample (number of samples in 26-datasets) by nlabels
np.save(output_dir + '26_datasets_predicted_score_matrix.npy', test_Y_pred)
Example #59
0
    def _cross_prod_w(self, w):
        # Calculate X * A * X.T, where A is a diagonal matrix and w holds the
        # diagonal entries of A.

        w = w.astype(float)

        s = self.ent_table
        r = self.att_table
        k = self.kfkds

        ns = k[0].shape[0]
        ds = s.shape[1]
        nr = [t.shape[0] for t in r]
        dr = [t.shape[1] for t in r]

        if not self.trans:
            if s.size > 0:

                res = self._t_cross_w(s, w[0:ds])
            else:
                res = np.zeros((ns, ns), dtype=float, order='C')

            count = ds
            cross_r = []
            for t in r:
                if all(map(sp.issparse, r)):
                    cross_r.append(
                        self._t_cross_w(t,
                                        w[count:count + t.shape[1]]).toarray())
                else:
                    cross_r.append(
                        self._t_cross_w(t, w[count:count + t.shape[1]]))

                count += t.shape[1]

            comp.expand_add(ns, len(k), k, cross_r, nr, res)
        else:

            if all(map(sp.issparse, r)):
                # pass the weights as the 'other' array so comp.group
                # accumulates them per group
                other = w.reshape((1, -1)).astype(float)
                s2 = w.reshape(-1, 1) * np.array(s)
                v = [np.zeros((1, t.shape[0]), dtype=float) for t in r]
                comp.group(ns, len(k), 1, k, nr, other, v)
                size = r[0].size
                data = np.empty(size)

                # parts 2 and 3 of the block matrix are p.T and p
                comp.multiply_sparse(size, r[0].row, r[0].data, np.sqrt(v[0]),
                                     data)
                diag_part = self._cross(
                    sp.coo_matrix((data, (r[0].row, r[0].col))))
                if ds > 0:
                    m = np.zeros((nr[0], ds))
                    comp.group_left(ns, ds, s2, k[0], m)
                    p = self._cross(r[0], m)
                    s_part = self._cross(s, s2)

                    res = sp.vstack((np.hstack(
                        (s_part, p.T)), sp.hstack((p, diag_part))))
                else:
                    res = diag_part

                # multi-table join
                for i in range(1, len(k)):
                    ps = []
                    if ds > 0:
                        m = np.zeros((nr[i], ds))
                        comp.group_left(ns, ds, s2, k[i], m)
                        ps += [self._cross(r[i], m)]

                    # cp (KRi)
                    size = r[i].size
                    data = np.empty(size)
                    comp.multiply_sparse(size, r[i].row, r[i].data,
                                         np.sqrt(v[i]), data)
                    diag_part = self._cross(
                        sp.coo_matrix((data, (r[i].row, r[i].col))))

                    for j in range(i):
                        ps += [
                            r[i].tocsr()[k[i]].T.dot(
                                r[j].tocsr()[k[j]].multiply(w.reshape(-1, 1)))
                        ]

                    res = sp.vstack((sp.hstack(
                        (res, sp.vstack([p.T for p in ps]))),
                                     sp.hstack(ps + [diag_part])))

            else:
                nt = s.shape[1] + sum([att.shape[1] for att in r])
                other = w.reshape((1, -1)).astype(float)
                s2 = w.reshape(-1, 1) * np.array(s)
                v = [np.zeros((1, t.shape[0]), dtype=float) for t in r]
                res = np.empty((nt, nt))

                data = np.empty(r[0].shape, order='C')
                comp.group(ns, len(k), 1, k, nr, other, v)
                comp.multiply(r[0].shape[0], r[0].shape[1], r[0], v[0], data)
                res[ds:ds + dr[0], ds:ds + dr[0]] = self._cross(data)

                if ds > 0:
                    m = np.zeros((nr[0], ds))
                    comp.group_left(ns, ds, s2, k[0], m)
                    res[ds:ds + dr[0], :ds] = self._cross(r[0], m)
                    res[:ds, ds:ds + dr[0]] = res[ds:ds + dr[0], :ds].T
                    res[:ds, :ds] = self._cross(s, s2)

                # multi-table join
                for i in range(1, len(k)):

                    # Hoist the block offsets so they are defined even when ds == 0.
                    ni1 = ds + sum([t.shape[1] for t in r[:i]])
                    ni2 = ni1 + r[i].shape[1]
                    if ds > 0:
                        m = np.zeros((nr[i], ds))
                        comp.group_left(ns, ds, s2, k[i], m)
                        res[ni1:ni2, :ds] = self._cross(r[i], m)
                        res[:ds, ni1:ni2] = res[ni1:ni2, :ds].T

                    # cp(KRi)
                    data = np.empty(r[i].shape, order='C')
                    comp.multiply(r[i].shape[0], r[i].shape[1], r[i], v[i],
                                  data)
                    res[ni1:ni2, ni1:ni2] = self._cross(data)

                    for j in range(i):
                        dj1 = ds + sum([t.shape[1] for t in r[:j]])
                        dj2 = dj1 + r[j].shape[1]
                        if (ns * 1.0 / nr[j]) > (1 + nr[j] * 1.0 / dr[j]):
                            m = np.zeros((nr[i], nr[j]), order='C')
                            # Updated in comp.cpp: when counting the members of
                            # each group, add w instead of 1.
                            comp.group_k_by_k_w(nr[i], nr[j], ns, w, k[i],
                                                k[j], m)

                            res[ni1:ni2, dj1:dj2] = r[i].T.dot(m.T.dot(r[j]))
                            res[dj1:dj2, ni1:ni2] = res[ni1:ni2, dj1:dj2].T
                        else:
                            res[ni1:ni2,
                                dj1:dj2] = (w.reshape(-1, 1) *
                                            np.array(r[i][k[i]])).T.dot(
                                                r[j][k[j]])
                            res[dj1:dj2, ni1:ni2] = res[ni1:ni2, dj1:dj2].T

        return res
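
For reference only (not the factorized algorithm itself): the product named in the comment at the top of `_cross_prod_w`, X * diag(w) * X.T, reduces in dense form to a weighted Gram matrix, which is what the routine above computes without ever materializing the joined matrix X:

import numpy as np

def weighted_gram(M, w):
    # Dense reference for M @ diag(w) @ M.T; w holds one weight per column of M.
    M = np.asarray(M, dtype=float)
    w = np.asarray(w, dtype=float)
    return (M * w[np.newaxis, :]) @ M.T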
Example #60
0
def qrfullsps(K):
    """
    Full QR factorization of a sparse, potentially singular m x n matrix with m >= n.

    Parameters
    ----------
    K : sparse matrix
        Matrix to be pseudo-inverted.

    Returns
    -------
    K_inv : sparse matrix
        Pseudoinverse of K.
    Q2 : sparse matrix
        Orthonormal basis of the left nullspace of K.
    """
    R = copy(K)
    Q = csr_matrix(eye(K.shape[0]))
    for j in range(K.shape[1]):
        G = None
        for k in range(K.shape[0] - 1, j, -1):
            vec_norm = np.linalg.norm(np.array([R[j, j], R[k, j]]))
            if not np.isclose(vec_norm, 0):
                c_j = R[j, j] / vec_norm
                s_j = R[k, j] / vec_norm
                G_help = csr_matrix(eye(K.shape[0]))
                G_help[j, j] = c_j
                G_help[k, j] = -s_j
                G_help[j, k] = s_j
                G_help[k, k] = c_j
                R = G_help @ R
                if G is None:
                    G = copy(G_help)
                else:
                    G = G_help @ G
        if G is not None:
            Q = Q @ G.T
    Q = -Q
    R = -R

    r_row_sum = np.sum(np.abs(R), axis=1)
    tol = 1.0e-12
    rank = K.shape[0] - len(np.where(r_row_sum < tol)[0])

    Q1 = Q[:, :rank]
    Q2 = Q[:, rank:]

    R1 = R[:rank, :rank]
    R2 = R[:rank, rank:]  # remaining columns of R; not used further below

    # Inverting R1 by back substitution
    if R1.shape[0] == R1.shape[1]:
        R_inv = csr_matrix(R1.shape)
        backward_substitution_failed = False
        for row in np.arange(R1.shape[0] - 1, -1, -1):
            if row != R1.shape[0] - 1:
                for col in np.arange(row + 1, R1.shape[1]):
                    R_inv[row, :] -= R1[row, col] * R_inv[col, :]
            R_inv[row, row] = 1
            if np.isclose(R1[row, row], 0):
                backward_substitution_failed = True
                break
            else:
                R_inv[row, :] = R_inv[row, :] / R1[row, row]
    else:
        backward_substitution_failed = True
    if backward_substitution_failed:
        R_inv = csr_matrix(np.linalg.pinv(R.todense()))

    K_inv = R_inv @ Q1.T
    if K_inv.shape[0] < K.shape[1]:
        K_inv = vstack(
            (K_inv, lil_matrix((K.shape[1] - K_inv.shape[0], K.shape[1]))))
    if K_inv.shape[1] < K.shape[0]:
        K_inv = hstack(
            (K_inv, lil_matrix((K_inv.shape[0], K.shape[0] - K_inv.shape[1]))))

    return K_inv, Q2
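
A usage sketch for the factorization above, on a small rank-deficient sparse matrix (assuming qrfullsps and its numpy/scipy imports are available in the same module):

import numpy as np
from scipy.sparse import csr_matrix

# 3x2 rank-1 matrix: the second column is a multiple of the first.
K = csr_matrix(np.array([[1.0, 2.0],
                         [2.0, 4.0],
                         [3.0, 6.0]]))

K_inv, Q2 = qrfullsps(K)
# K_inv acts as a pseudoinverse of K and Q2 spans its left nullspace,
# so Q2.T @ K should be numerically zero.
print(np.allclose((Q2.T @ K).toarray(), 0.0, atol=1e-8))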