def generate_data(case, sparse=False):
    """Generate regression / classification data."""
    bunch = None
    if case == 'regression':
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    data = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }
    return data
def mkneighbors_graph(observations, n_neighbours, metric, mode='connectivity', metric_params=None):
    """Compute the (weighted) graph of mutual k-nearest neighbours for observations.

    Notes
    -----
    The distance between an observation and itself is never computed and
    instead set to ``numpy.inf``, i.e. only in the case of
    k >= n_observations, or when the ``metric`` returns ``numpy.inf``, can
    the returned graph contain loops.

    Parameters
    ----------
    observations : sequence
        Sequence of observations.
    n_neighbours : int
        Maximum number of neighbours for each sample.
    metric : function
        The distance metric taking two observations and returning a numeric
        value > 0.
    mode : {'connectivity', 'distance', 'both'}, optional
        Type of returned matrix: 'connectivity' will return the connectivity
        matrix with ones and zeros, with 'distance' the edges are the
        distances between points, while 'both' returns a
        (connectivity, distance) tuple.
    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    Returns
    -------
    mkneighbors_graph : ndarray
        Sparse matrix in CSR format, shape = [n_observations, n_observations].
        mkneighbors_graph[i, j] is assigned the weight of the edge that
        connects i to j. Might contain ``numpy.inf`` values.
    """
    # compute the pairwise distances and expand the condensed result to a
    # square matrix; the distance of each observation to itself is set to
    # inf so that an observation is never among its own nearest neighbours
    pdists = squareform(pdist(observations, metric, **(metric_params or {})))
    numpy.fill_diagonal(pdists, numpy.inf)

    # get the k nearest neighbours of each observation
    k_nearest_nbhs = numpy.argsort(pdists)[:, :n_neighbours]

    # create a mask denoting the k nearest neighbours
    k_nearest_mutual_nbhs_mask = numpy.zeros(pdists.shape, bool)
    for _mask_row, _nbhs_row in zip(k_nearest_mutual_nbhs_mask, k_nearest_nbhs):
        _mask_row[_nbhs_row] = True

    # AND with the transpose to remove non-mutual nearest neighbours
    k_nearest_mutual_nbhs_mask &= k_nearest_mutual_nbhs_mask.T

    # set distances not in the mutual k-nearest-neighbour set to zero
    pdists[~k_nearest_mutual_nbhs_mask] = 0

    # check for edges with zero weight
    if numpy.any(pdists[k_nearest_mutual_nbhs_mask] == 0):
        warnings.warn('The graph contains at least one edge with a weight of "0".')

    if 'connectivity' == mode:
        return csr_matrix(k_nearest_mutual_nbhs_mask)
    elif 'distance' == mode:
        return csr_matrix(pdists)
    else:
        return csr_matrix(k_nearest_mutual_nbhs_mask), csr_matrix(pdists)
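# A hedged usage sketch for mkneighbors_graph (the point cloud and k are
# illustrative only; assumes numpy, warnings, csr_matrix, and
# scipy.spatial.distance's pdist/squareform are imported as the function
# above requires):
import numpy
from scipy.spatial.distance import euclidean

points = numpy.random.rand(10, 2)
connectivity, distances = mkneighbors_graph(points, 3, euclidean, mode='both')
# connectivity[i, j] is nonzero only when i and j are mutually among each
# other's 3 nearest neighbours; distances holds the corresponding weights.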
def test_mutual_information(self):
    X = array([[0, 1], [1, 0], [1, 1]])
    y = array([[0, 1], [1, 0], [1, 0]])
    assert_array_approx_equal(mutual_information(X, y),
                              [-0.37489, -0.605939], decimal=3)
    assert_array_approx_equal(mutual_information(csr_matrix(X), csr_matrix(y)),
                              [-0.37489, -0.605939], decimal=3)
def test_pointwise_mutual_information(self):
    X = array([[0, 1], [1, 0], [1, 1]])
    y = array([[0, 1], [1, 0], [1, 0]])
    assert_array_approx_equal(pointwise_mutual_information(X, y),
                              [0.1178, 0.1178], decimal=3)
    assert_array_approx_equal(pointwise_mutual_information(csr_matrix(X), csr_matrix(y)),
                              [0.1178, 0.1178], decimal=3)
def test_BRKnna_no_labels_take_closest(self):
    data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0, 1]])
    train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'],
                 ['lid2', 'lid3'], ['lid0', 'lid5']]
    mlb = MultiLabelBinarizer(sparse_output=True)
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(n_neighbors=2, threshold=0.6, mode='a')
    knn.fit(data, y)
    pred = knn.predict(csr.csr_matrix([[0, 1]])).todense()
    np.testing.assert_array_equal([[1, 0, 0, 0, 0]], pred)
def test_BRKnna_predict_dense(self):
    data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
    train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'],
                 ['lid4', 'lid3'], ['lid4', 'lid5']]
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
    knn.fit(data, y)
    pred = knn.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
    np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], pred)
def test_BRKnnb_predict_two_samples(self):
    data = csr.csr_matrix([[0, 1], [1, 1.1], [1, 1], [0.5, 1]])
    train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'],
                 ['lid4', 'lid5'], ['lid4', 'lid5']]
    mlb = MultiLabelBinarizer(sparse_output=True)
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(mode='b', n_neighbors=3)
    knn.fit(data, y)
    pred = knn.predict(csr.csr_matrix([[0, 1], [2, 2]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0], [0, 0, 1, 1]], pred)
def test_inner_kneighbors_more_neighbors(self):
    X = csr.csr_matrix([[0, 0, 0], [1, 1, 1], [2, 2, 2], [3, 3, 3]])
    y = csr.csr_matrix([[0.4, 0.4, 0.4], [2.4, 2.4, 2.4],
                        [3.1, 3.1, 3.1], [1.1, 1.1, 1.1]])
    nearest_neighbors = NearestNeighbors()
    nearest_neighbors.fit(X)
    neighbors = BatchKNeighbors(nearest_neighbors)

    # the result must be independent of the batch size
    kneighbors = neighbors._batch_kneighbors(y, n_neighbors=2, batchsize=1)
    np.testing.assert_array_equal(kneighbors,
                                  np.matrix([[0, 1], [2, 3], [3, 2], [1, 2]]))
    kneighbors = neighbors._batch_kneighbors(y, n_neighbors=2, batchsize=3)
    np.testing.assert_array_equal(kneighbors,
                                  np.matrix([[0, 1], [2, 3], [3, 2], [1, 2]]))
def _load_sparse_mat(filename, name):
    """Load a CSR matrix from HDF5 (https://stackoverflow.com/a/44282655).

    Parameters
    ----------
    filename : str
        HDF5 filename
    name : str
        node prefix in the HDF5 hierarchy

    Returns
    -------
    M : scipy.sparse.csr.csr_matrix
        loaded sparse matrix
    """
    import tables
    from scipy.sparse import csr_matrix

    with tables.open_file(filename) as f:
        # read the four constituent arrays of the CSR matrix
        attributes = []
        for attribute in ('data', 'indices', 'indptr', 'shape'):
            attributes.append(getattr(f.root, f'{name}_{attribute}').read())

    # construct the sparse matrix from (data, indices, indptr) and shape
    M = csr_matrix(tuple(attributes[:3]), shape=attributes[3])
    return M
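# For symmetry, a matching writer is sketched below, following the same
# Stack Overflow recipe the loader cites. The name _save_sparse_mat and the
# node-naming convention mirror _load_sparse_mat; this is an assumed sketch,
# not code from this project:
def _save_sparse_mat(M, filename, name):
    import numpy as np
    import tables
    from scipy.sparse import isspmatrix_csr

    assert isspmatrix_csr(M), 'M must be a CSR matrix'
    with tables.open_file(filename, 'a') as f:
        # store the four constituent arrays under "<name>_<attribute>"
        for attribute in ('data', 'indices', 'indptr', 'shape'):
            f.create_array(f.root, f'{name}_{attribute}',
                           np.asarray(getattr(M, attribute)))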
def test_correct_handling_equal_similarities_sparse_gk(self):
    sim_snn = 1. - shared_nearest_neighbors(self.distance)
    gamma_sparse = sparse_goodman_kruskal_index(csr_matrix(sim_snn), self.labels)
    gamma_efficient = goodman_kruskal_index(sim_snn, self.labels, 'similarity')
    return self.assertEqual(gamma_efficient, gamma_sparse)
def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]:
    # discard self-matches: A matches A
    pairs = self._matches_list[
        self._matches_list['master_side'] != self._matches_list['dupe_side']]
    # rebuild graph adjacency matrix from already found matches:
    n = len(self._master)
    graph = csr_matrix(
        (np.full(len(pairs), 1),
         (pairs.master_side.to_numpy(), pairs.dupe_side.to_numpy())),
        shape=(n, n))
    # apply scipy.csgraph's clustering algorithm
    # (result is a 1D numpy array of length n):
    _, groups = connected_components(csgraph=graph, directed=True)
    group_of_master_index = pd.Series(groups, name='raw_group_id')

    # merge groups with string indices to obtain a two-column DataFrame;
    # note: the following line automatically creates a new column named
    # 'index' with the corresponding indices:
    group_of_master_index = group_of_master_index.reset_index()

    # Determine weights for obtaining group representatives:
    # 1. option-setting group_rep='first':
    group_of_master_index.rename(columns={'index': 'weight'}, inplace=True)
    method = 'first'
    # 2. option-setting group_rep='centroid':
    if self._config.group_rep == GROUP_REP_CENTROID:
        # reuse the adjacency matrix built above
        # (change the 1's to the corresponding cosine similarities):
        graph.data = pairs['similarity'].to_numpy()
        # sum along the rows to obtain a 1D matrix of similarity aggregates,
        # then convert to a 1D numpy array (using asarray then squeeze) and
        # then to a Series:
        group_of_master_index['weight'] = pd.Series(
            np.asarray(graph.sum(axis=1)).squeeze())
        method = 'idxmax'

    # Determine the group representatives AND merge with indices:
    # pandas groupby transform function and enlargement enable both,
    # respectively, in one step:
    group_of_master_index['group_rep'] = \
        group_of_master_index.groupby('raw_group_id', sort=False)['weight'].transform(method)

    # Prepare the output:
    prefix = GROUP_REP_PREFIX
    label = f'{prefix}{self._master.name}' if self._master.name else prefix[:-1]
    # use the group-rep indexes obtained in the last step above to select
    # the corresponding strings:
    output = self._master.iloc[group_of_master_index.group_rep].rename(
        label).reset_index(drop=ignore_index)
    if isinstance(output, pd.DataFrame):
        output.rename(
            columns={col: f'{prefix}{col}'
                     for col in output.columns if str(col) != label},
            inplace=True)
    if self._master_id is not None:
        id_label = f'{prefix}{self._master_id.name if self._master_id.name else DEFAULT_ID_NAME}'
        # use the group-rep indexes obtained above to select the
        # corresponding string IDs:
        output_id = self._master_id.iloc[
            group_of_master_index.group_rep].rename(id_label).reset_index(drop=True)
        output = pd.concat([output_id, output], axis=1)
    output.index = self._master.index
    return output.squeeze()
def dic2matrix(dic, ix=None, p=None, n=0, g=0):
    """Convert a dictionary of features into the matrix required by an FM model.

    :param dic: a dictionary containing several features
    :param ix: index-generating dictionary
    :param p: dimension of the feature space (number of columns)
    :param n: number of records
    :param g: number of feature kinds
    :return: a binary matrix (and the index dictionary)
    """
    if ix is None:
        ix = dict()

    # number of ones in the matrix
    nz = n * g
    col_ix = np.empty(nz, dtype=int)

    # mark the features
    i = 0
    for k, lis in dic.items():
        for t in range(len(lis)):
            ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k), 0) + 1
            col_ix[i + g * t] = (lis[t] - 1) * g + i
        i += 1

    # feature-space size, i.e. the number of columns of the matrix
    if p is None:
        p = np.max(col_ix) + 1

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)
    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix
def _write_mat(self, filename, complete=True):
    d = {
        'Af': csr_matrix((self.pdim, self.pdim)) if self._af is None else self._af,
        'Ad': csr_matrix((self.ndim, self.pdim)) if self._ad is None else self._ad,
        'Bf': csr_matrix((self.mdim, self.pdim)) if self._bf is None else self._bf,
    }
    if complete and self._complete:
        d['A'] = self._A
        d['B'] = self._B
    savemat(filename, d)
def vectorize_dic(dic, ix=None, p=None):
    """Create a scipy CSR matrix from a list of lists (each inner list is a
    set of values corresponding to a feature).

    :param dic: dictionary of feature lists. Keys are names of features
    :param ix: index generator
    :param p: dimension of feature space
    :return:
    """
    if ix is None:
        ix = defaultdict(count(0).__next__)

    n = len(list(dic.values())[0])  # num samples
    g = len(dic.keys())             # num groups
    nz = n * g                      # number of non-zeros

    col_ix = np.empty(nz, dtype=int)
    i = 0
    for k, lis in dic.items():
        # append the index el with k in order to prevent mapping different
        # columns with the same id to the same index
        col_ix[i::g] = [ix[str(el) + str(k)] for el in lis]
        i += 1

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)

    if p is None:
        p = len(ix)

    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix
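# A hedged usage sketch for vectorize_dic above (the user/item ids are made
# up; assumes numpy as np, scipy.sparse.csr as csr, itertools.count, and
# collections.defaultdict are imported as the function requires):
train = {'users': [1, 2, 1], 'items': [10, 10, 20]}
test = {'users': [2, 3], 'items': [20, 10]}

X_train, ix = vectorize_dic(train)
# reuse the same index map and column count so test columns line up with
# train columns; ids first seen in test land beyond p and are dropped by
# the col_ix < p filter
X_test, _ = vectorize_dic(test, ix, X_train.shape[1])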
def vectorize_dic(dic, ix=None, p=None, n=0, g=0):
    """
    dic -- dictionary of feature lists. Keys are the names of features
    ix -- index generator (default None)
    p -- dimension of feature space
         (number of columns in the sparse matrix) (default None)
    """
    if ix is None:
        ix = dict()

    nz = n * g
    col_ix = np.empty(nz, dtype=int)

    i = 0
    for k, lis in dic.items():
        for t in range(len(lis)):
            ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k), 0) + 1
            col_ix[i + t * g] = ix[str(lis[t]) + str(k)]
        i += 1

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)

    if p is None:
        p = len(ix)

    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix
def compatible(train_attrs, test_attrs):
    row_ind = []
    col_ind = []
    train_sets = [str(row).lower() for row in train_attrs]
    test_sets = [str(row).lower() for row in test_attrs]
    for i, s_i in enumerate(test_sets):
        j_ind = v_compatible(s_i, train_sets)
        j_ind[i] = 0  # a sample is never compatible with itself
        ind = np.where(j_ind == 1)
        j_len = len(ind[0])
        if j_len > 0:
            row_ind.extend([i] * j_len)
            col_ind.extend(ind[0])
    pk = csr_matrix(
        (np.ones(len(row_ind), dtype='int8'), (row_ind, col_ind)),
        shape=(len(test_attrs), len(train_attrs)))
    return pk
def load_csr_graph(filename):
    """Load a graph from a file.

    Every line has the format "V_origin,V_destination,Edge_weight".
    Returns a scipy.sparse.csr_matrix with the data.
    """
    raw = np.genfromtxt(filename, delimiter=',', dtype=np.int32)
    sp_raw = csr_matrix((raw[:, 2], (raw[:, 0], raw[:, 1])))
    return sp_raw
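# A minimal round-trip sketch for load_csr_graph (the file name and edge
# list are illustrative; assumes numpy as np and scipy's csr_matrix are
# imported as the function above requires):
with open('edges.csv', 'w') as f:
    f.write('0,1,5\n1,2,3\n2,0,7\n')

graph = load_csr_graph('edges.csv')
print(graph.toarray())  # 3x3 adjacency matrix with the three weighted edges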
def vectorize_dic(dic, ix=None, p=None, n=0, g=0):
    # Process the data into one input matrix whose size is (number of users)
    # x (number of movies), using csr.csr_matrix from scipy.sparse.
    """
    dic -- dictionary of feature lists. Keys are the names of features
    ix -- index generator (default None)
    p -- dimension of feature space
         (number of columns in the sparse matrix) (default None)
    """
    if ix is None:
        ix = dict()

    nz = n * g
    col_ix = np.empty(nz, dtype=int)

    i = 0
    for k, lis in dic.items():
        for t in range(len(lis)):
            ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k), 0) + 1
            col_ix[i + t * g] = ix[str(lis[t]) + str(k)]
        i += 1

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)

    if p is None:
        p = len(ix)

    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix
def vectorize(lil, ix=None, p=None): """ Creates a scipy csr matrix from a list of lists (each inner list is a set of values corresponding to a feature) parameters: ----------- lil -- list of lists (dimension of inner lists should be the same) ix -- index generator (default None) p -- dimension of featrure space (number of columns in the sparse matrix) (default None) """ if (ix == None): ix = defaultdict(count(0).next) n = len(lil[0]) # num samples g = len(lil) # num groups nz = n * g # number of non-zeros col_ix = np.empty(nz, dtype=int) for i, d in enumerate(lil): # append index k with __i in order to prevet mapping different columns with same id to same index col_ix[i::g] = [ix[str(k) + '__' + str(i)] for k in d] row_ix = np.repeat(np.arange(0, n), g) data = np.ones(nz) if (p == None): p = len(ix) # only features that are less than p (siz of feature vector) are considered ixx = np.where(col_ix < p) return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix
def vectorize_dic(dic, label2index=None, hold_num=None):
    if label2index is None:
        d = count(0)
        label2index = defaultdict(lambda: next(d))

    sample_num = len(list(dic.values())[0])  # num samples
    feat_num = len(list(dic.keys()))         # num of features
    total_value_num = sample_num * feat_num  # number of non-zeros

    col_ix = np.empty(total_value_num, dtype=int)
    i = 0
    for k, lis in dic.items():
        col_ix[i::feat_num] = [label2index[str(el) + str(k)] for el in lis]
        i += 1

    row_ix = np.repeat(np.arange(sample_num), feat_num)
    data = np.ones(total_value_num)

    if hold_num is None:
        hold_num = len(label2index)

    left_data_index = np.where(col_ix < hold_num)
    return csr.csr_matrix(
        (data[left_data_index],
         (row_ix[left_data_index], col_ix[left_data_index])),
        shape=(sample_num, hold_num)), label2index
def statistical_inefficiencies(dtrajs, lag, C=None, truncate_acf=True, mact=2.0):
    r""" Computes statistical inefficiencies of sliding-window transition counts at given lag

    Consider a discrete trajectory :math:`\{ x_t \}` with :math:`x_t \in \{1, ..., n\}`.
    For each starting state :math:`i`, we collect the target sequence

    .. math::
        Y^{(i)} = \{x_{t+\tau} \mid x_{t}=i\}

    which contains the time-ordered target states at times :math:`t+\tau`
    whenever we started in state :math:`i` at time :math:`t`. Then we define
    the indicator sequence:

    .. math::
        a^{(i,j)}_t(\tau) = \mathbf{1}(Y^{(i)}_t = j)

    The statistical inefficiency for transition counts :math:`c_{ij}(\tau)` is
    computed as the statistical inefficiency of the sequence :math:`a^{(i,j)}_t(\tau)`.

    Parameters
    ----------
    dtrajs : list of int-iterables
        discrete trajectories
    lag : int
        lag time
    C : scipy sparse matrix (n, n) or None
        sliding window count matrix, if already available
    truncate_acf : bool, optional, default=True
        When the normalized autocorrelation function passes through 0, it is
        truncated in order to avoid integrating random noise.

    Returns
    -------
    I : scipy sparse matrix (n, n)
        Statistical inefficiency matrix with a sparsity pattern identical to
        the sliding-window count matrix at the same lag time. Will contain a
        statistical inefficiency :math:`I_{ij} \in (0,1]` whenever there is a
        count :math:`c_{ij} > 0`. When there is no transition count
        (:math:`c_{ij} = 0`), the statistical inefficiency is 0.

    See also
    --------
    msmtools.util.statistics.statistical_inefficiency
        used to compute the statistical inefficiency for conditional trajectories

    """
    # count matrix
    if C is None:
        C = count_matrix_coo2_mult(dtrajs, lag, sliding=True, sparse=True)
    # split sequences
    splitseq = _split_sequences_multitraj(dtrajs, lag)
    # compute inefficiencies
    I, J = C.nonzero()
    it = (statistical_inefficiency(_indicator_multitraj(splitseq, i, j),
                                   truncate_acf=truncate_acf, mact=mact)
          for i, j in zip(I, J))
    data = np.fromiter(it, dtype=float, count=C.nnz)
    res = csr_matrix((data, (I, J)), shape=C.shape)
    return res
def fetch(self, id_dict: dict, del_other=False, norm=False):
    train_cat_set = set()
    for id in id_dict['train']:
        for x in self.data[id]:
            train_cat_set.add(x)

    output = {}
    for split, id_list in id_dict.items():
        row, col, val = [], [], []
        for i, id in enumerate(id_list):
            num = len(self.data[id])
            for x in self.data[id]:
                xx = x
                # map categories unseen in the training split to 0
                if xx not in train_cat_set:
                    xx = 0
                if xx != 0 or not del_other:
                    row.append(i)
                    col.append(xx)
                    if norm:
                        val.append(1.0 * self.id2val.get(xx, 1) / num)
                    else:
                        val.append(self.id2val.get(xx, 1))
        output[split] = csr_matrix(
            (np.asarray(val), (np.asarray(row), np.asarray(col))),
            shape=(len(id_list), self.length))
    return output
def create_csr_matrix(dic, index=None, dim=None):
    """Convert the dataset's raw list input into a CSR matrix.

    :param dic: dictionary of feature lists
    :param index: index-generating mapping
    :param dim: dimension of the feature space
    :return: the CSR matrix and the index mapping
    """
    if index is None:
        d = count(0)  # an infinite iterator starting at 0 with step 1
        # defaultdict returns a default value instead of raising
        # when the key is missing
        index = defaultdict(lambda: next(d))

    sample_num = len(list(dic.values())[0])  # number of samples, e.g. 90570
    feature_num = len(list(dic.keys()))      # number of features, e.g. 2
    total_num = sample_num * feature_num

    col_ix = np.empty(total_num, dtype=int)
    i = 0
    for k, lis in dic.items():
        col_ix[i::feature_num] = [index[str(k) + str(el)] for el in lis]
        i += 1

    # each element is repeated feature_num times
    row_ix = np.repeat(np.arange(sample_num), feature_num)
    data = np.ones(total_num)

    if dim is None:
        dim = len(index)

    left_data_index = np.where(col_ix < dim)
    return csr.csr_matrix(
        (data[left_data_index],
         (row_ix[left_data_index], col_ix[left_data_index])),
        shape=(sample_num, dim)), index
def vectorize_dic(dic, label2index=None, hold_num=None):
    if label2index is None:
        d = count(0)
        label2index = defaultdict(lambda: next(d))  # value-to-index mapping

    sample_num = len(list(dic.values())[0])  # number of samples
    feat_num = len(list(dic.keys()))         # number of features
    total_value_num = sample_num * feat_num

    # np.empty returns an array of the given shape and dtype whose
    # entries are uninitialized (arbitrary) values
    col_ix = np.empty(total_value_num, dtype=int)
    i = 0
    for k, lis in dic.items():
        col_ix[i::feat_num] = [label2index[str(k) + str(el)] for el in lis]
        i += 1

    row_ix = np.repeat(np.arange(sample_num), feat_num)
    data = np.ones(total_value_num)

    if hold_num is None:
        hold_num = len(label2index)

    # drop test-set values that never appeared in the train set
    left_data_index = np.where(col_ix < hold_num)
    return csr.csr_matrix(
        (data[left_data_index],
         (row_ix[left_data_index], col_ix[left_data_index])),
        shape=(sample_num, hold_num)), label2index
def vectorize(lil, ix=None, p=None): """ dic -- dictionary of feature lists. Keys are the name of features ix -- index generator (default None) p -- dimension of featrure space (number of columns in the sparse matrix) (default None) n -- number of samples g -- number of groups """ if ix == None: ix = defaultdict(count(0)) n = len(lil[0]) # num samples g = len(lil) # num groups nz = n * g col_ix = np.empty(nz, dtype=int) for i, d in enumerate(lil): # append index k with __i in order to prevet mapping different columns with same id to same index col_ix[i::g] = [ix[str(k) + '__' + str(i)] for k in d] row_ix = np.repeat(np.arange(0, n), g) data = np.ones(nz) if p == None: p = len(ix) ixx = np.where(col_ix < p) return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix
def test_from_csr1():
    from siconos.numerics import SBM_from_csparse, SBM_get_value
    from scipy.sparse.csr import csr_matrix

    M = csr_matrix([[1, 2, 3],
                    [4, 5, 6],
                    [7, 8, 9]])
    print(M.indices)
    print(M.indptr)
    print(M.data)

    blocksize = 3
    r, SBM = SBM_from_csparse(blocksize, M)

    assert SBM_get_value(SBM, 0, 0) == 1
    assert SBM_get_value(SBM, 0, 1) == 2
    assert SBM_get_value(SBM, 0, 2) == 3
    assert SBM_get_value(SBM, 1, 0) == 4
    assert SBM_get_value(SBM, 1, 1) == 5
    assert SBM_get_value(SBM, 1, 2) == 6
    assert SBM_get_value(SBM, 2, 0) == 7
    assert SBM_get_value(SBM, 2, 1) == 8
    assert SBM_get_value(SBM, 2, 2) == 9
def data_processing_hiv(name):
    # request data in JSON format (it arrives as a Python dictionary)
    data_fr_predict = request.json
    # create a pandas DataFrame from the Python dictionary
    data = pd.DataFrame(data_fr_predict, index=[0])

    # load the pickled preprocessing objects for the HIV dataset
    file = open('DataProcessingHIV.pkl', 'rb')
    feature = joblib.load(file)
    label = joblib.load(file)
    one_hot_encode = joblib.load(file)
    file.close()

    # separate the string of 8 characters into 8 different features
    x_values = separate_feature_column(data, feature)
    # apply the same encoding used during data preprocessing prior to model building
    x_values = one_hot_encode.transform(x_values)

    # for the GaussianNB model, convert the sparse matrix to a dense matrix
    if name == 'GaussianNB':
        x_values = csr_matrix(x_values).todense()

    y_values = data[label].values
    return x_values, y_values
def test_build_perm_docs(self):
    perm_vectors = csr_matrix([
        [1, 0, 1],
        [0, 1, 1]
    ])
    api_vectors = csr_matrix([
        [1, 0],
        [0, 1]
    ])
    perm_docs = np.array([
        [1, 0],
        [0, 1],
        [1, 1]
    ])
    result = PerRecCBR.build_perm_docs(perm_vectors, api_vectors)
    assert_array_equal(perm_docs, result)
def vectorize_dic(dic, ix=None, p=None, n=0, g=0): """ dic -- dictionary of feature lists. Keys are the name of features ix -- index generator (default None) p -- dimension of feature space (number of columns in the sparse matrix) (default None) """ if ix == None: ix = dict() # 用户数 * 电影数 nz = n * g col_ix = np.empty(nz, dtype=int) i = 0 for k, lis in dic.items(): for t in range(len(lis)): # 如果取到该值,则该值加1 ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k), 0) + 1 # 取得相应位置的数字 col_ix[i + t * g] = ix[str(lis[t]) + str(k)] i += 1 row_ix = np.repeat(np.arange(0, n), g) # print('col_ix', col_ix) data = np.ones(nz) if p == None: p = len(ix) ixx = np.where(col_ix < p) return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix
def test_mul_sparse_matrix(self):
    # test unsymmetric times unsymmetric
    m = self.basic_m
    dense_m = m.toarray()
    res = m * m
    dense_res = np.matmul(dense_m, dense_m)
    self.assertFalse(res.is_symmetric)
    self.assertTrue(np.allclose(res.toarray(), dense_res))

    # test symmetric result
    m = self.basic_m
    dense_m = m.toarray()
    res = m.transpose() * m
    dense_res = np.matmul(dense_m.transpose(), dense_m)
    self.assertTrue(res.is_symmetric)
    self.assertTrue(np.allclose(res.toarray(), dense_res))

    # test unsymmetric with rectangular
    m = self.basic_m
    dense_m2 = np.array([[1.0, 2.0],
                         [3.0, 4.0],
                         [5.0, 6.0],
                         [7.0, 8.0]])
    m2 = CSRMatrix(dense_m2)
    res = m * m2
    dense_res = np.matmul(m.toarray(), dense_m2)
    self.assertFalse(res.is_symmetric)
    self.assertTrue(np.allclose(res.toarray(), dense_res))

    # test unsymmetric with rectangular scipy csr
    m = self.basic_m
    dense_m2 = np.array([[1.0, 2.0],
                         [3.0, 4.0],
                         [5.0, 6.0],
                         [7.0, 8.0]])
    m2 = csr_matrix(dense_m2)
    with self.assertRaises(Exception) as context:
        res = m * m2
def predict(self, X):
    if self.pretrained_model_path:
        # load the pretrained model
        loader = tf.train.import_meta_graph(self.pretrained_model_path + '.meta')
        loader.restore(self.session, self.pretrained_model_path)
    elif self.validation_data_position:
        # load the model saved during training
        loader = tf.train.import_meta_graph(self._save_model_path + '.meta')
        loader.restore(self.session, self._save_model_path)

    prediction = np.zeros((X.shape[0], self.y.shape[1]))
    batch_generator = BatchGenerator(X, None, self.batch_size, False, True)
    prediction_steps = self._calc_num_steps(X)
    for i in range(prediction_steps):
        X_batch = batch_generator._batch_generator()
        preds = self._predict_batch(X_batch)
        binary_decided_preds = self._make_binary_decision(preds)
        prediction[i * self.batch_size:(i + 1) * self.batch_size, :] = \
            binary_decided_preds.todense()

    result = csr_matrix(prediction)
    # close the session, since it is no longer needed
    self.session.close()
    return result
def vectorize_dic(dic, label2index=None, hold_num=None):
    if label2index is None:
        d = count(0)
        label2index = defaultdict(lambda: next(d))  # value-to-index mapping

    sample_num = len(list(dic.values())[0])  # number of samples
    feat_num = len(list(dic.keys()))         # number of features
    total_value_num = sample_num * feat_num

    col_ix = np.empty(total_value_num, dtype=int)  # column indices
    i = 0
    for k, lis in dic.items():
        # map the 'user' and 'item' values to column indices
        col_ix[i::feat_num] = [label2index[str(k) + str(el)] for el in lis]
        i += 1

    row_ix = np.repeat(np.arange(sample_num), feat_num)
    data = np.ones(total_value_num)

    if hold_num is None:
        hold_num = len(label2index)

    # drop test-set values that never appeared in the train set
    left_data_index = np.where(col_ix < hold_num)
    return csr.csr_matrix(
        (data[left_data_index],
         (row_ix[left_data_index], col_ix[left_data_index])),
        shape=(sample_num, hold_num)), label2index
def dataProcess(dic, ix=None, p=None, n=0, g=0):
    """
    dic -- dictionary of feature lists. Keys are the names of features,
           e.g. 'user': train['user'].values
    ix -- index generator (default None)
    p -- dimension of feature space
         (the number of columns in the sparse matrix) (default None)
    """
    if ix is None:
        ix = dict()

    nz = n * g
    col_ix = np.empty(nz, dtype=int)

    i = 0
    for k, lis in dic.items():
        for t in range(len(lis)):
            ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k), 0) + 1
            col_ix[i + t * g] = ix[str(lis[t]) + str(k)]
        i += 1

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)

    if p is None:
        p = len(ix)

    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix
def _parse_sparse_matrix(self, matrix):
    from scipy.sparse import csr_matrix

    # rebuild the CSR matrix from its (data, indices, indptr) components
    return csr_matrix(
        (list(matrix.data), list(matrix.indices), list(matrix.indptr)),
        shape=(matrix.number_of_rows, matrix.number_of_columns),
    )
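# A tiny illustration of _parse_sparse_matrix's expected input: any object
# exposing data/indices/indptr plus the two size fields. The SimpleNamespace
# stand-in below is hypothetical; the real message type is not shown in
# this snippet:
from types import SimpleNamespace
from scipy.sparse import csr_matrix

msg = SimpleNamespace(data=[1.0, 2.0], indices=[0, 1], indptr=[0, 1, 2],
                      number_of_rows=2, number_of_columns=2)
M = csr_matrix((list(msg.data), list(msg.indices), list(msg.indptr)),
               shape=(msg.number_of_rows, msg.number_of_columns))
print(M.toarray())  # [[1., 0.], [0., 2.]]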
def _deduplicate(self) -> Union[pd.DataFrame, pd.Series]:
    n = len(self._master)
    graph = csr_matrix(
        (np.full(len(self._matches_list), 1),
         (self._matches_list.master_side.to_numpy(),
          self._matches_list.dupe_side.to_numpy())),
        shape=(n, n))

    raw_group_id_of_master_id = pd.DataFrame({
        'raw_group_id': pd.Series(connected_components(csgraph=graph, directed=False)[1]),
        'master_id': self._master.index.to_series()
    })
    first_master_id_in_group = raw_group_id_of_master_id.groupby('raw_group_id')['master_id']\
        .first()\
        .rename('new_group_id')\
        .reset_index()
    new_group_id_of_master_id = first_master_id_in_group\
        .merge(raw_group_id_of_master_id, how='left', on='raw_group_id')\
        .sort_values('master_id')\
        .reset_index(drop=True)

    output = self._master[new_group_id_of_master_id.new_group_id].reset_index(drop=True)
    if self._master_id is None:
        return output
    else:
        output_id = self._master_id[new_group_id_of_master_id.new_group_id].reset_index(drop=True)
        return pd.concat([output_id, output], axis=1)
def test_fcsr_matrix(m=None, n=None, k=None, data_n=1, density=.1, sym=False, return_np=True):
    while True:
        _k = k or np.random.randint(1, 10)
        if sym:
            _m = _n = m or np.random.randint(1, 100)
        else:
            _m = m or np.random.randint(1, 100)
            _n = n or np.random.randint(1, 100)
        rows_array, cols_array, data_array, shape = SNP_to_coo(
            _m, _n, density, data_n=_k, sym=sym)
        array_scipy = csr_matrix(
            (data_array.astype(FLOAT_STORAGE_np), (rows_array, cols_array)), shape)
        fcoo = coo_to_fcoo(rows_array, cols_array, data_array, shape)
        fcsr = fcoo_to_fcsr(fcoo, shape)
        array_sparse = fcsr_matrix(fcsr, shape)
        if return_np:
            array_np = array_sparse.to_array()
            yield array_np, array_sparse, array_scipy
        else:
            yield array_sparse, array_scipy
def test_bcsr_matrix(m=None, n=None, density=.1, sym=False, return_np=True):
    while True:
        if sym:
            _m = _n = m or np.random.randint(1, 100)
        else:
            _m = m or np.random.randint(1, 100)
            _n = n or np.random.randint(1, 100)
        rows_array, cols_array, data_array, shape = SNP_to_coo(
            _m, _n, density, data_n=1, sym=sym)
        row_p, col_i = coo_to_bcsr(_m, len(rows_array), rows_array, cols_array)
        array_sparse = bcsr_matrix(row_p, col_i, (_m, _n))
        array_scipy = csr_matrix(
            (data_array.astype(FLOAT_STORAGE_np), (rows_array, cols_array)), shape)
        if return_np:
            array_np = array_scipy.toarray()
            yield array_np, array_sparse, array_scipy
        else:
            yield array_sparse, array_scipy
def vectorize_dict(dic, dim=None):
    feature_num = len(list(dic.keys()))
    record_num = len(list(dic.items())[0][1])
    col_ix = np.zeros([feature_num * record_num])
    ix = {}
    i = 0
    for k in dic.keys():
        lis = dic[k]
        for t in range(len(lis)):
            ix[str(k) + str(lis[t])] = ix.get(str(k) + str(lis[t]), 0) + 1
            col_ix[t * feature_num + i] = ix[str(k) + str(lis[t])]
        i += 1

    # ix = {}
    # i = 0
    # count = 0
    # for k in dic.keys():
    #     lis = dic[k]
    #     for t in range(len(lis)):
    #         flag = str(k) + str(lis[t])
    #         if flag not in ix.keys():
    #             ix[flag] = count
    #             count += 1
    #         col_ix[t * feature_num + i] = ix[flag]

    if dim is None:
        dim = len(ix)

    row_ix = np.repeat(np.arange(0, record_num), feature_num)
    ixx = np.where(col_ix < dim)
    data = np.ones([feature_num * record_num])
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])),
                          shape=[record_num, dim]), ix
def predict_fn(input_object: dict, model: Model):
    """
    Take the parsed input and make predictions with the model loaded by model_fn.

    Parameters:
        input_object -- deserialized request_body we can perform prediction on
        model -- TensorFlow model

    Return:
        predictions -- dict with the ordering predicted by the RecSys model
    """
    start_time = time.time()
    topN = 10

    # user UUID
    user_uuid = input_object['user_uuid']

    # get watched movies
    watched_movies_idx = [model.item_idx[i] for i in input_object['watched_movies']]

    # inverse idx-to-id mapping
    inv_item_idx = dict((v, k) for k, v in model.item_idx.items())

    # transform the data into a sparse one-hot row
    data_input = csr_matrix(
        (np.ones(len(watched_movies_idx)),
         (np.zeros(len(watched_movies_idx)), watched_movies_idx)),
        shape=(1, model.input_dim)).toarray()

    data_pred = model.predict(data_input)[0]
    print("--- Inference time: %s seconds ---" % (time.time() - start_time))

    # sorted recommendation list, excluding already watched movies
    idx_pred = list(set(range(model.input_dim)) - set(watched_movies_idx))
    sorted_pred = dict(
        sorted(
            zip(list(idx_pred), list(data_pred[idx_pred].astype(float))),
            key=lambda x: x[1],
            reverse=True))

    # result format
    result = {
        "status": "Ok",
        "evaluation": {
            "user_uuid": user_uuid,
            "watched_movies": input_object['watched_movies'],
            "recommended_movie_ids": [inv_item_idx[i] for i in list(sorted_pred.keys())[:topN]],
            "scores": list(sorted_pred.values())[:topN],
            "datetime": datetime.utcnow().isoformat(sep='T', timespec='milliseconds'),
            "modelVersion": model.version,
        }
    }
    return result
def test_BRKnnb_auto_optimize_k(self):
    data = csr.csr_matrix([[0, 1], [1, 1], [0, 1.1], [1.1, 1]])
    train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'],
                 ['lid2', 'lid3'], ['lid0', 'lid1']]
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(mode='b', n_neighbor_candidates=[1, 3],
                                 auto_optimize_k=True)

    # noinspection PyUnusedLocal
    def fun(s, X, y_):
        return data[[1, 2, 3]], data[[0]], y[[1, 2, 3]], y[[0]]

    BRKNeighborsClassifier._get_split = fun
    knn.fit(data, y)
    self.assertEqual(3, knn.n_neighbors)
    pred = knn.predict(csr.csr_matrix([[0.1, 1], [2, 2]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0], [1, 1, 0, 0]], pred)
def make_mtx_payload(df):
    if hasattr(df, "sparse"):
        sparse_mat = csr_matrix(df.sparse.to_coo())
    else:
        # scipy's vstack needs a sequence, not a generator
        sparse_mat = vstack([x[0] for x in df.values])
    sink = BytesIO()
    mmwrite(sink, sparse_mat)
    return sink.getvalue()
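# A hedged usage sketch for make_mtx_payload (assumes pandas >= 0.25 for the
# sparse accessor; the matrix values are illustrative):
import pandas as pd
from io import BytesIO
from scipy.io import mmwrite
from scipy.sparse import csr_matrix, vstack

df = pd.DataFrame.sparse.from_spmatrix(csr_matrix([[0, 1], [2, 0]]))
payload = make_mtx_payload(df)   # bytes of a MatrixMarket-encoded matrix
print(payload.decode().splitlines()[0])  # the "%%MatrixMarket ..." header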
def predict(self, X):
    from scipy.sparse import csr_matrix, lil_matrix

    # build the predictions in LIL format, which supports efficient
    # incremental assignment, then convert to CSR at the end
    predictions = lil_matrix((X.shape[0], self.y.shape[1]))
    doc_to_neighborhood_dict = self._predict_scores(X)
    for i in range(X.shape[0]):
        for label, _ in doc_to_neighborhood_dict[str(i + 1)]:
            predictions[i, label] = 1
    return csr_matrix(predictions)
def load_sparse_csr(filename):
    """
    :param filename: str
    :return: scipy.sparse.csr.csr_matrix
    """
    filename += '.npz'
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])
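# The matching writer for load_sparse_csr is sketched below; it follows the
# common numpy save/load idiom for CSR matrices (np.savez appends '.npz',
# matching the loader's filename handling). The name save_sparse_csr is an
# assumption, not code from this project:
def save_sparse_csr(filename, matrix):
    import numpy as np
    np.savez(filename, data=matrix.data, indices=matrix.indices,
             indptr=matrix.indptr, shape=matrix.shape)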
def load_h5_to_csr(filename):
    with tables.open_file(filename, 'r') as f:
        dest = f.root.destination.read()
        origin = f.root.origin.read()
        weight = f.root.weight.read()
    sp_raw = csr_matrix((weight, (origin, dest)))
    return sp_raw
def test_mp_gammai_sparse_parallel(self):
    """Test that the parallel version is equivalent to the serial version."""
    sim = csr_matrix(1. - self.dist)
    sim_s = mpgam_s(sim, 'similarity')
    sim_p = mutual_proximity_gammai(sim, 'similarity', mv=0, verbose=1)
    parallel_all_close_serial = np.allclose(sim_p.toarray(), sim_s.toarray())
    return self.assertTrue(parallel_all_close_serial)
def _compute_relations(self):
    logger.log(logging.INFO, "Computing relations")
    self.relations = {}

    contains = self._compute_contains()
    self.relations['contains'] = csr_matrix(contains)
    self.relations['contained'] = csr_matrix(self.relations['contains'].transpose())

    father = self._compute_father()
    for i, r in enumerate(['_substance', '_attribute', '_mode']):
        self.relations['father' + r] = dok_matrix(father[i])

    siblings = self._compute_siblings()
    self.relations['opposed'] = dok_matrix(siblings[0])
    self.relations['associated'] = dok_matrix(siblings[1])
    self.relations['crossed'] = dok_matrix(siblings[2])
    self.relations['twin'] = dok_matrix(siblings[3])

    # self._do_inhibitions()

    for i, r in enumerate(['_substance', '_attribute', '_mode']):
        self.relations['child' + r] = self.relations['father' + r].transpose()

    # self.relations['siblings'] = sum(siblings)
    # self.relations['inclusion'] = np.clip(self.relations['contains'] + self.relations['contained'], 0, 1)
    # self.relations['father'] = self.relations['father_substance'] + \
    #                            self.relations['father_attribute'] + \
    #                            self.relations['father_mode']
    # self.relations['child'] = self.relations['child_substance'] + \
    #                           self.relations['child_attribute'] + \
    #                           self.relations['child_mode']
    # self.relations['etymology'] = self.relations['father'] + self.relations['child']

    table = self._compute_table_rank(self.relations['contained'])
    for i in range(6):
        self.relations['table_%d' % i] = table[i]

    self.relations['identity'] = csr_matrix(np.eye(len(self.dictionary)))

    missing = {s for s in RELATIONS if s not in self.relations}
    if missing:
        raise ValueError("Missing relations : {%s}" % ", ".join(missing))

    self.relations = {reltype: csr_matrix(self.relations[reltype])
                      for reltype in RELATIONS}
def irconvolve(xc, x, y, h,
               kernel=lambda r, h: numpy.exp(-0.5 * (r / h) ** 2)):
    """Convolve irregularly sampled points onto a uniform grid.

    The default kernel is the Gaussian exp(-0.5 * (r / h) ** 2).
    xc has to be uniform!
    """
    xc, y, x, h = numpy.atleast_1d(xc, y, x, h)
    dxc = (xc[-1] - xc[0]) / (len(xc) - 1)
    support = 6

    # first remove the points that are too far off the grid
    good = ((x + support * h > xc[0])
            & (x - support * h < xc[-1]))
    x = x[good]
    y = y[good]
    h = h[good]

    if len(h) > 0:
        # the real buffer is bigger than the output to ease the
        # normalization; we are still imperfect at the edges
        padding = int((2 * support + 1) * h.max() / dxc) + 1
        padding = max(padding, 2)
        buffer = numpy.zeros(shape=len(xc) + 2 * padding)
        paddedxc = numpy.empty(buffer.shape, dtype=xc.dtype)
        paddedxc[padding:-padding] = xc
        # here comes the requirement that xc has to be uniform
        paddedxc[:padding] = xc[0] - numpy.arange(padding, 0, -1) * dxc
        paddedxc[-padding:] = xc[-1] + numpy.arange(1, padding + 1) * dxc
        out = buffer[padding:-padding]
        assert len(out) == len(xc)
        assert (paddedxc[1:] > paddedxc[:-1]).all()

        # slow; for uniform xc/paddedxc we could do this faster than searchsorted
        start = paddedxc.searchsorted(x - support * h, side='left')
        end = paddedxc.searchsorted(x + support * h, side='left')

        # tricky part: build the CSR matrix for the convolution operator,
        # only for the non-zero elements (block diagonal)
        N = end - start + 1
        indptr = numpy.concatenate(([0], N.cumsum()))
        indices = numpy.repeat(start - indptr[:-1], N) + numpy.arange(N.sum())
        r = numpy.repeat(x, N) - paddedxc[indices]
        data = kernel(r, numpy.repeat(h, N))
        data[numpy.repeat(N == 1, N)] = 1
        data[numpy.repeat(h == 0, N)] = 1
        matrix = csr.csr_matrix((data, indices, indptr),
                                shape=(len(x), len(paddedxc)))
        norm = numpy.repeat(matrix.sum(axis=1).flat, N)
        data /= norm
        buffer[:] = matrix.transpose() * y
    else:
        out = numpy.zeros(shape=xc.shape, dtype=y.dtype)
    return out
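# A small usage sketch for irconvolve (the grid, positions, values, and
# widths are illustrative; assumes numpy and scipy.sparse.csr are imported
# as the function above requires):
xc = numpy.linspace(0.0, 10.0, 101)     # uniform output grid
x = numpy.array([2.0, 5.0, 7.5])        # irregular sample positions
y = numpy.array([1.0, 2.0, 0.5])        # sample values
h = numpy.array([0.3, 0.5, 0.2])        # per-sample kernel widths

smoothed = irconvolve(xc, x, y, h)
# each sample is spread over the grid with its own Gaussian width,
# normalized so the weights deposited per sample sum to one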
def generate_data(case, sparse=False): """Generate regression/classification data.""" bunch = None if case == "regression": bunch = datasets.load_boston() elif case == "classification": bunch = datasets.fetch_20newsgroups_vectorized(subset="all") X, y = shuffle(bunch.data, bunch.target) offset = int(X.shape[0] * 0.8) X_train, y_train = X[:offset], y[:offset] X_test, y_test = X[offset:], y[offset:] if sparse: X_train = csr_matrix(X_train) X_test = csr_matrix(X_test) else: X_train = np.array(X_train) X_test = np.array(X_test) y_test = np.array(y_test) y_train = np.array(y_train) data = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test} return data
def train(self, X, mean=None):
    print('WARNING: You should probably be using SparseMatPCA, '
          'unless your design matrix fits in memory.', file=sys.stderr)
    n, d = X.shape

    # Can't subtract a sparse vector from a sparse matrix, apparently,
    # so here I repeat the vector to construct a matrix.
    mean = X.mean(axis=0)
    mean_matrix = csr_matrix(mean.repeat(n).reshape((d, n))).T
    X = X - mean_matrix

    super(SparsePCA, self).train(X, mean=numpy.asarray(mean).squeeze())
def __compute_sim_scores(self, refvec_matrix, allvecs_matrix, L2_norms, is_embeddings):
    contexts_sims = allvecs_matrix.dot(refvec_matrix)
    if is_embeddings:
        contexts_sims = (contexts_sims + 1) / 2  # map cosine to [0, 1]

    contexts_sims = np.reshape(contexts_sims, (len(contexts_sims), 1))
    contexts_sims = csr_matrix(contexts_sims.tolist())
    if L2_norms is not None:
        contexts_sims = contexts_sims.multiply(L2_norms)

    refvec_dp = refvec_matrix.transpose().dot(refvec_matrix)
    refvec_L2_norm = refvec_dp.data.max() ** 0.5 if len(refvec_dp.data) > 0 else 1.0
    contexts_sims.data /= refvec_L2_norm
    # weights -1 <= cosine <= 1, but in practice greater than zero
    # because all weights >= 0
    return contexts_sims
def standardize_kinship_format(in_file, out_file):
    """Read the upper-triangular part of the kinship matrix from in_file.
    Convert to the format expected by KinshipDao and write to out_file.
    Note: loads the entire file into memory."""
    data = np.loadtxt(in_file, usecols=[2])
    # len(data) = n*(n+1)/2, so invert the triangular number to recover n
    n = int(((8 * len(data) + 1) ** 0.5 - 1) / 2)
    idx = np.array(list(it.chain.from_iterable(range(k, n) for k in range(n))))
    idx_ptr = np.concatenate(([0], np.cumsum(range(n, 0, -1))))
    A = csr_matrix((np.maximum(data, 1e-16), idx, idx_ptr), shape=(n, n))
    with open(out_file, 'w') as f:
        f.write(' '.join(it.islice(
            (x[1] for x in csv.reader(open(in_file), delimiter='\t')), n)) + '\n')
    with open(out_file, 'ab') as f:
        np.savetxt(f, (A + triu(A, 1).transpose()).data, fmt='%.16f')
    return A
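# A quick sanity check of the upper-triangular CSR layout used above, for
# n = 3 (the values are made up; assumes numpy as np and scipy's csr_matrix):
import itertools as it
import numpy as np
from scipy.sparse import csr_matrix

n = 3
data = np.array([1.0, 0.2, 0.1, 1.0, 0.3, 1.0])  # rows (0, 0..2), (1, 1..2), (2, 2)
idx = np.array(list(it.chain.from_iterable(range(k, n) for k in range(n))))
idx_ptr = np.concatenate(([0], np.cumsum(range(n, 0, -1))))
A = csr_matrix((data, idx, idx_ptr), shape=(n, n))
print(A.toarray())  # upper triangle filled, lower triangle zero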
def test_SBM_from_csparse1():
    from siconos.numerics import SBM_from_csparse, SBM_get_value, \
        SBM_new_from_file, SBM_print, SBM_to_sparse
    from scipy.sparse import csr_matrix, lil_matrix

    A = lil_matrix((100, 100))
    A.setdiag(range(100))
    A[0, :10] = range(10)
    A[1, 10:20] = A[0, :10]
    M = csr_matrix(A)

    v, SBM = SBM_from_csparse(2, M)

    for i in range(M.shape[0]):
        for j in range(M.shape[1]):
            assert abs(SBM_get_value(SBM, i, j) - M[i, j]) < eps
def test_sparseToSBM1():
    from siconos.numerics import sparseToSBM, getValueSBM, \
        newFromFileSBM, printSBM, SBMtoSparse
    from scipy.sparse import csr_matrix, lil_matrix

    A = lil_matrix((100, 100))
    A.setdiag(range(100))
    A[0, :10] = range(10)
    A[1, 10:20] = A[0, :10]
    M = csr_matrix(A)

    v, SBM = sparseToSBM(2, M)

    for i in range(M.shape[0]):
        for j in range(M.shape[1]):
            assert abs(getValueSBM(SBM, i, j) - M[i, j]) < eps
def fit_transform(self, raw_documents, dsmMatrix, vocabulary): """use the sum of DSM vectors of a document's words as that document vector. here dsmMatrix is the matrix used in DSM that contains the distributional representations of all vocabulary words. """ x = None i = 0 num_exception = 0 docnumber = 0 docMatrices = [] oov = 0 oovDoc = 0 for doc in raw_documents: if docnumber % 1000 == 0: print "processing document number " + str(docnumber + 1) docnumber += 1 if oov > docnumber: print "warning: " + str(oov) + " oov." docMatrix = None sequenceVectors = [] words = doc.split() numAddedWords = 0 for word in words: wordId = vocabulary.getindex(word) if not wordId: oov += 1 continue # doc matrix is the sum of all its word vectors wordRepresentation = dsmMatrix.getSparseRow(wordId) sequenceVectors.append(wordRepresentation) numAddedWords += 1 if numAddedWords == 20: break if numAddedWords == 0: oovDoc += 1 # if text is shorter than 20 words add some extra zero vectors while numAddedWords < 20: zeroSparse = csr_matrix((1, dsmMatrix.matrix.shape[1])) sequenceVectors.append(zeroSparse) numAddedWords += 1 # hstack word vectors in sequence docMatrix = hstack(sequenceVectors) docMatrices.append(docMatrix) print "%d documents are completely out of vocabulary" % oovDoc print "vstacking matrices..." x = vstack(docMatrices) print "vstacking finished." return x
def train(self, X, mean=None): """ .. todo:: WRITEME """ warnings.warn('You should probably be using SparseMatPCA, ' 'unless your design matrix fits in memory.') n, d = X.shape # Can't subtract a sparse vector from a sparse matrix, apparently, # so here I repeat the vector to construct a matrix. mean = X.mean(axis=0) mean_matrix = csr_matrix(mean.repeat(n).reshape((d, n))).T X = X - mean_matrix super(SparsePCA, self).train(X, mean=numpy.asarray(mean).squeeze())
def __init__(self, size, dim, m=np.array([]), Consts=1.0, f_inter=None):
    super(LinearSpringConstrained, self).__init__(
        size, dim, m, Consts, f_inter=f_inter)
    self.__dim = dim
    self.__size = size
    self.__K = Consts
    self.__A = np.zeros((size, dim))
    self.__F = np.zeros((size, dim))
    self.__Fm = dok.dok_matrix((size, size))
    self.__Fm2 = csr.csr_matrix((size, size))
    self.__M = np.zeros((size, 1))
    if len(m) != 0:
        self.set_masses(m)
def call_svd(ndarray_matrix, low_dims, logger, normalize=False):
    assert isinstance(logger, logging.Logger)
    assert isinstance(ndarray_matrix, numpy.ndarray)
    assert isinstance(low_dims, int)

    if normalize:
        processed_matrix = normalize_data(ndarray_matrix)
    else:
        processed_matrix = ndarray_matrix

    X = csr_matrix(processed_matrix)
    logger.info("original dims: {}".format(X.shape[1]))
    svd = TruncatedSVD(n_components=low_dims, random_state=0)
    X_input = svd.fit_transform(X)
    logger.info("after SVD dims: {}".format(X_input.shape[1]))
    return X_input
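# A hedged usage sketch for call_svd (the matrix size and target dimension
# are made up; assumes numpy, logging, scikit-learn's TruncatedSVD, and
# scipy's csr_matrix are imported as the function above requires):
import logging
import numpy

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('svd-demo')

matrix = numpy.random.rand(50, 200)
reduced = call_svd(matrix, 10, logger, normalize=False)
print(reduced.shape)  # (50, 10)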
def predict(self, X): """ Predicts the classes for the samples. Takes the top k classes with smallest distance. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] Prediction vector, where n_samples in the number of samples and n_features is the number of features. """ predictions = csr_matrix((X.shape[0], self.y.shape[1]), dtype=np.int) topNIndices, _ = self._get_closest_centroids(X) for entry, label_list in enumerate(topNIndices): for label in label_list: predictions[entry, label] = 1 return predictions
def fit_transform(self, raw_documents, dsmMatrix, vocabulary):
    """Use the sum of the DSM vectors of a document's words as that
    document's vector. Here dsmMatrix is the matrix used in the DSM that
    contains the distributional representations of all vocabulary words.
    """
    docnumber = 0
    docMatrices = []
    oov = 0
    oovDoc = 0
    for doc in raw_documents:
        if docnumber % 1000 == 0:
            print("processing document number " + str(docnumber + 1))
        docnumber += 1
        if oov > docnumber * 5:
            print("warning: " + str(oov) + " oov.")
        docMatrix = None
        words = doc.split()
        for word in words:
            wordId = vocabulary.getindex(word)
            if not wordId:
                oov += 1
                continue
            # the document matrix is the sum of all its word vectors
            if docMatrix is not None:
                docMatrix = docMatrix + dsmMatrix.getSparseRow(wordId)
            else:
                docMatrix = dsmMatrix.getSparseRow(wordId)
        if docMatrix is None:
            # completely out-of-vocabulary document: use an all-zero row
            docMatrix = csr_matrix((1, dsmMatrix.matrix.shape[1]))
            docMatrix[0, 0] = 0
            oovDoc += 1
        docMatrices.append(docMatrix)
    print("%d documents are completely out of vocabulary" % oovDoc)
    print("vstacking matrices...")
    x = vstack(docMatrices)
    print("vstacking finished.")
    return x