def generate_sparse(qa_queue, length, l_matrix_fname, r_matrix_fname):
    """Consume (Qs, As) sparse blocks from qa_queue, stack them row-wise
    and pickle the two resulting matrices."""
    logging.info("Start consumer")
    Qs = None
    As = None
    count = 0
    while True:
        try:
            Qs_temp, As_temp = qa_queue.get(timeout=120)
            count += Qs_temp.shape[0]
            if Qs is None:
                Qs = Qs_temp
                As = As_temp
            else:
                Qs = sparse_vstack((Qs, Qs_temp))
                As = sparse_vstack((As, As_temp))
            if count == length:
                # all expected rows received; reuse Empty to exit the loop
                raise Empty
            if count % INF_FREQ == 0:
                logging.info("loading: %d/%d, %.2f%%"
                             % (count, length, count / length * 100))
        except Empty:
            logging.info("loading: %d/%d, %.2f%%"
                         % (count, length, count / length * 100))
            break
    logging.info("Stop consumer")
    with open(l_matrix_fname, 'wb') as f:
        pkl.dump(Qs, f, protocol=4)
    with open(r_matrix_fname, 'wb') as f:
        pkl.dump(As, f, protocol=4)

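# A minimal sketch of driving generate_sparse() from a producer. The INF_FREQ
# constant and the imports below (queue.Empty, pickle as pkl, sparse_vstack)
# are module-level assumptions: the function references them but the original
# snippet does not show them. `length` must equal the total row count queued,
# otherwise the consumer blocks for the full 120 s timeout.
import logging
import pickle as pkl
from queue import Queue, Empty
from scipy.sparse import random as sparse_random, vstack as sparse_vstack

INF_FREQ = 1000  # progress-logging interval (a module constant in the original)

q = Queue()
for _ in range(2):
    q.put((sparse_random(5, 8, density=0.25, format='csr'),
           sparse_random(5, 8, density=0.25, format='csr')))
generate_sparse(q, length=10, l_matrix_fname='Qs.pkl', r_matrix_fname='As.pkl')
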
def _load_pdbbind_desc(self, desc_path, pdbbind_version=2016,
                       train_set='refined', test_set='core',
                       train_blacklist=None, fold_size=None):
    """Load precomputed PDBbind descriptors from a CSV file and split them
    into training and test sets.

    Parameters
    ----------
    desc_path : str
        Path to the CSV file with descriptors, indexed by 'pdbid'.
    pdbbind_version : int (default=2016)
        PDBbind version used to select the train/test subset columns.
    train_set : str or iterable of str (default='refined')
        Name(s) of the PDBbind subset(s) used for training.
    test_set : str (default='core')
        Name of the PDBbind subset used for testing.
    train_blacklist : iterable or None (default=None)
        PDB IDs to exclude from the training set.
    fold_size : int or None (default=None)
        If given, fold sparse descriptors down to this size.
    """
    df = pd.read_csv(desc_path, index_col='pdbid')
    cols = list(map(str, range(len(self.descriptor_generator))))
    # generate dense representation of sparse descriptor in CSV
    if 'sparse' in df.columns:
        # convert strings to np.arrays
        df['sparse'] = df['sparse'].map(
            lambda x: np.fromstring(x[1:-1], dtype=np.uint64, sep=','))
        cols = 'sparse'  # sparse array will have one column
        # fold only if necessary
        if fold_size:
            df['sparse'] = df['sparse'].map(lambda x: fold(x, fold_size))
        # convert to sparse csr_matrix
        df['sparse'] = df['sparse'].map(
            partial(sparse_to_csr_matrix,
                    size=len(self.descriptor_generator)))

    if isinstance(train_set, six.string_types):
        train_idx = df['%i_%s' % (pdbbind_version, train_set)]
    else:
        train_idx = df[['%i_%s' % (pdbbind_version, s)
                        for s in train_set]].any(axis=1)
    if train_blacklist:
        train_idx &= ~df.index.isin(train_blacklist)
    # make sure the test set does not leak into the training set
    train_idx &= ~df['%i_%s' % (pdbbind_version, test_set)]

    # load sparse matrices as training is usually faster on them
    if 'sparse' in df.columns:
        self.train_descs = sparse_vstack(df.loc[train_idx, cols].values,
                                         format='csr')
    else:
        self.train_descs = df.loc[train_idx, cols].values
    self.train_target = df.loc[train_idx, 'act'].values

    test_idx = df['%i_%s' % (pdbbind_version, test_set)]
    if 'sparse' in df.columns:
        self.test_descs = sparse_vstack(df.loc[test_idx, cols].values,
                                        format='csr')
    else:
        self.test_descs = df.loc[test_idx, cols].values
    self.test_target = df.loc[test_idx, 'act'].values

def ZfromN(normals, mask, Mx, My):
    """
    Compute (integrate) the depth map of a normal map.
    The reconstruction is up to a scaling factor.
    """
    b = -normals
    b[:, 2] = 0
    b = b.T.ravel()

    N = normals.shape[0]
    ij = list(range(N))
    X = coo_matrix((normals[:, 0], (ij, ij)), shape=Mx.shape)
    Y = coo_matrix((normals[:, 1], (ij, ij)), shape=Mx.shape)
    Z = coo_matrix((normals[:, 2], (ij, ij)), shape=Mx.shape)

    # Is the 3rd constraint really useful?
    A = sparse_vstack((Z.dot(Mx), Z.dot(My), Y.dot(Mx) - X.dot(My)))

    surf = sparse_lsqr(A, b)
    surf = surf[0]
    surf -= surf.min()

    out = np.zeros(mask.shape, np.float32)
    out[mask] = surf.ravel()
    return out

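# A hedged sketch of using ZfromN() on a tilted plane, under the assumption
# that Mx and My are N x N forward-difference operators over the row-major
# flattened grid (the original does not show how they are built, so
# forward_diff_operators below is a hypothetical helper). The system solved
# is nz * dx(s) = -nx and nz * dy(s) = -ny; for the plane s = a*x the unit
# normal is (-a, 0, 1) / sqrt(1 + a**2).
import numpy as np
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import lsqr as sparse_lsqr

def forward_diff_operators(H, W):
    """Hypothetical helper: forward differences along x (columns) and y (rows)."""
    N = H * W
    rx, cx, vx, ry, cy, vy = [], [], [], [], [], []
    for i in range(N):
        r, c = divmod(i, W)
        if c + 1 < W:
            rx += [i, i]; cx += [i, i + 1]; vx += [-1.0, 1.0]
        if r + 1 < H:
            ry += [i, i]; cy += [i, i + W]; vy += [-1.0, 1.0]
    Mx = coo_matrix((vx, (rx, cx)), shape=(N, N)).tocsr()
    My = coo_matrix((vy, (ry, cy)), shape=(N, N)).tocsr()
    return Mx, My

H = W = 8
a = 0.5
mask = np.ones((H, W), dtype=bool)
Mx, My = forward_diff_operators(H, W)
normals = np.tile([-a, 0.0, 1.0], (H * W, 1)) / np.sqrt(1 + a ** 2)
depth = ZfromN(normals, mask, Mx, My)  # ~ a * column index, up to an offset
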
def build(self, ligands, protein=None):
    """Builds descriptors for series of ligands

    Parameters
    ----------
    ligands: iterable of oddt.toolkit.Molecules or oddt.toolkit.Molecule
        A list or iterable of ligands to build the descriptor or a
        single molecule.

    protein: oddt.toolkit.Molecule or None (default=None)
        Default protein to use as reference
    """
    if protein:
        self.protein = protein
    if is_molecule(ligands):
        ligands = [ligands]
    out = []
    for mol in ligands:
        if self.protein is None:
            out.append(self.func(mol))
        else:
            out.append(self.func(mol, protein=self.protein))
    if self.sparse:
        # out = list(map(partial(sparse_to_csr_matrix, size=self.shape), out))
        return sparse_vstack(map(
            partial(sparse_to_csr_matrix, size=self.shape), out),
            format='csr')
    else:
        return np.vstack(out)

def test_sparse_densify():
    """FP densify"""
    sparse_fp = [0, 33, 49, 53, 107, 156, 161, 203, 215, 230, 251, 269, 299,
                 323, 331, 376, 389, 410, 427, 430, 450, 484, 538, 592, 593,
                 636, 646, 658, 698, 699, 702, 741, 753, 807, 850, 861, 882,
                 915, 915, 915, 969, 969, 1023]

    # count vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=True)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=True)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    assert_array_equal(sparse_fp, resparsed)
    assert_array_equal(sparse_fp, resparsed_csr)

    # bool vectors (duplicated bits collapse to unique values)
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=False)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=False)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    assert_array_equal(np.unique(sparse_fp), resparsed)
    assert_array_equal(np.unique(sparse_fp), resparsed_csr)

    # test stacking
    np.random.seed(0)
    sparse_fps = np.random.randint(0, 1024, size=(20, 100))
    # pass lists, not bare generators: np.vstack requires a sequence
    dense = np.vstack([sparse_to_dense(fp, size=1024) for fp in sparse_fps])
    csr = sparse_vstack([sparse_to_csr_matrix(fp, size=1024)
                         for fp in sparse_fps])
    assert_array_equal(dense, csr.toarray())

    # test exceptions
    with pytest.raises(ValueError):
        csr_matrix_to_sparse(np.array([1, 2, 3]))

def test_sparse_densify():
    """FP densify"""
    sparse_fp = [
        0, 33, 49, 53, 107, 156, 161, 203, 215, 230, 251, 269, 299, 323, 331,
        376, 389, 410, 427, 430, 450, 484, 538, 592, 593, 636, 646, 658, 698,
        699, 702, 741, 753, 807, 850, 861, 882, 915, 915, 915, 969, 969, 1023
    ]
    # count vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=True)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=True)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    assert_array_equal(sparse_fp, resparsed)

    # bool vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=False)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=False)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    assert_array_equal(np.unique(sparse_fp), resparsed)

    # test stacking
    np.random.seed(0)
    sparse_fps = np.random.randint(0, 1024, size=(20, 100))
    # pass lists, not bare generators: np.vstack requires a sequence
    dense = np.vstack([sparse_to_dense(fp, size=1024) for fp in sparse_fps])
    csr = sparse_vstack(
        [sparse_to_csr_matrix(fp, size=1024) for fp in sparse_fps])
    assert_array_equal(dense, csr.toarray())

def _run_interface(self, runtime):
    from scipy.sparse import vstack as sparse_vstack

    # Calculate the physical coordinates of target grid
    targetnii = nb.load(self.inputs.in_target)
    allmask = np.ones_like(targetnii.dataobj, dtype="uint8")

    weights = []
    coeffs = []
    for cname in self.inputs.in_coeff:
        coeff_nii = nb.load(cname)
        wmat = grid_bspline_weights(targetnii, coeff_nii)
        weights.append(wmat)
        coeffs.append(coeff_nii.get_fdata(dtype="float32").reshape(-1))

    data = np.zeros(targetnii.shape, dtype="float32")
    data[allmask == 1] = np.squeeze(
        np.vstack(coeffs).T) @ sparse_vstack(weights)

    hdr = targetnii.header.copy()
    hdr.set_data_dtype("float32")
    self._results["out_field"] = fname_presuffix(
        self.inputs.in_target, suffix="_field", newpath=runtime.cwd)
    targetnii.__class__(data, targetnii.affine, hdr).to_filename(
        self._results["out_field"])

    # Generate warp field
    phaseEncDim = "ijk".index(self.inputs.pe_dir[0])
    phaseEncSign = [1.0, -1.0][len(self.inputs.pe_dir) != 2]

    data *= phaseEncSign * self.inputs.ro_time

    fieldshape = tuple(list(data.shape[:3]) + [3])
    self._results["out_warp"] = fname_presuffix(
        self.inputs.in_target, suffix="_xfm", newpath=runtime.cwd)
    # Compose a vector field
    field = np.zeros((data.size, 3), dtype="float32")
    field[..., phaseEncDim] = data.reshape(-1)
    aff = targetnii.affine.copy()
    aff[:3, 3] = 0.0
    # Multiplying by the affine implicitly applies the voxel size to the shift map
    field = nb.affines.apply_affine(aff, field).reshape(fieldshape)
    warpnii = targetnii.__class__(
        field[:, :, :, np.newaxis, :].astype("float32"),
        targetnii.affine, None)
    warpnii.header.set_intent("vector", (), "")
    warpnii.header.set_xyzt_units("mm")
    warpnii.to_filename(self._results["out_warp"])
    return runtime

def fit(self, spatialimage):
    r"""
    Generate the interpolation matrix (and the VSM with it).

    Implements Eq. :math:`\eqref{eq:1}`, interpolating :math:`f(\mathbf{s})`
    for all voxels in the target-image's extent.

    Returns
    -------
    updated : :obj:`bool`
        ``True`` if the internal field representation was fit,
        ``False`` if cache was valid and will be reused.

    """
    # Calculate the physical coordinates of target grid
    if isinstance(spatialimage, (str, bytes, Path)):
        spatialimage = nb.load(spatialimage)

    if self.shifts is not None:
        newaff = spatialimage.affine
        newshape = spatialimage.shape

        if np.all(newshape == self.shifts.shape) and np.allclose(
            newaff, self.shifts.affine
        ):
            return False

    weights = []
    coeffs = []

    # Generate tensor-product B-Spline weights
    for level in listify(self.coeffs):
        self.xfm.reference = spatialimage
        moved_cs = level.__class__(
            level.dataobj, self.xfm.matrix @ level.affine, level.header
        )
        wmat = grid_bspline_weights(spatialimage, moved_cs)
        weights.append(wmat)
        coeffs.append(level.get_fdata(dtype="float32").reshape(-1))

    # Interpolate the VSM (voxel-shift map)
    vsm = np.zeros(spatialimage.shape[:3], dtype="float32")
    vsm = (np.squeeze(np.hstack(coeffs).T) @ sparse_vstack(weights)).reshape(
        vsm.shape
    )

    # Cache
    self.shifts = nb.Nifti1Image(vsm, spatialimage.affine, None)
    self.shifts.header.set_intent("estimate", name="Voxel shift")
    self.shifts.header.set_xyzt_units("mm")

    return True

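# A minimal sketch of the interpolation step above, outside the class: each
# B-Spline level contributes a sparse weights matrix with one row per
# coefficient and one column per target voxel, so stacking the levels and
# left-multiplying by the concatenated coefficient vector yields the field.
# Shapes and densities below are arbitrary stand-ins, not the real grids.
import numpy as np
from scipy.sparse import random as sparse_random, vstack as sparse_vstack

nvox = 64  # e.g. a flattened 4 x 4 x 4 target grid
w_coarse = sparse_random(8, nvox, density=0.4, format="csr")
w_fine = sparse_random(27, nvox, density=0.2, format="csr")
coeffs = [np.random.rand(8).astype("float32"),
          np.random.rand(27).astype("float32")]

vsm = (np.hstack(coeffs) @ sparse_vstack([w_coarse, w_fine])).reshape(4, 4, 4)
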
def compute_feature_matrix(df, vectorizer, combine=None):
    # .loc replaces the long-removed pandas .ix indexer
    fq1 = vectorizer.transform(df.loc[:, Fields.question1])
    fq2 = vectorizer.transform(df.loc[:, Fields.question2])
    combine = combine or 'diff'
    if combine == 'stack':
        return sparse_vstack([fq1, fq2])
    if combine == 'intersect':
        return fq1.multiply(fq2)
    if combine == 'diff':
        return abs(fq1 - fq2).tocsr()
    raise ValueError("unknown combine mode: %r" % combine)

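# A hedged usage sketch: the Fields namespace is not shown in the original,
# so a hypothetical stand-in with question1/question2 column names is
# assumed here, along with a TF-IDF vectorizer fitted on both columns.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

class Fields:  # hypothetical stand-in for the original Fields definition
    question1 = 'question1'
    question2 = 'question2'

df = pd.DataFrame({
    'question1': ['how do i learn python', 'what is a csr matrix'],
    'question2': ['how can i learn python', 'what is a sparse matrix'],
})
vec = TfidfVectorizer().fit(pd.concat([df['question1'], df['question2']]))
stacked = compute_feature_matrix(df, vec, combine='stack')  # 4 x n_features
diffed = compute_feature_matrix(df, vec, combine='diff')    # 2 x n_features
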
def get_upsampling(train_df, count_threshold, tfidf_vector):
    # reset index so that row i matches tfidf_vector.getrow(i)
    train_df_copy = train_df.copy().reset_index(drop=True)
    dfs_by_target = []
    sizes = train_df_copy.target.value_counts().to_dict()
    for target in train_df_copy.target.unique():
        df = train_df_copy[train_df_copy['target'] == target]
        if sizes[target] >= count_threshold:
            dfs_by_target.append(df)
        else:
            # upsample rare classes (with replacement) to the majority size
            dfs_by_target.append(df.sample(max(sizes.values()), replace=True))
    X_train = sparse_vstack(
        [tfidf_vector.getrow(i) for df in dfs_by_target for i in df.index])
    y_train = pd.concat(dfs_by_target, axis=0).target
    return X_train, y_train

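# A usage sketch on toy data: rows of the TF-IDF matrix line up with the
# reset DataFrame index, so the rare class 'b' is resampled with replacement
# up to the majority-class size.
import pandas as pd
from scipy.sparse import vstack as sparse_vstack
from sklearn.feature_extraction.text import TfidfVectorizer

train_df = pd.DataFrame({
    'text': ['spam offer now', 'spam deal now', 'spam buy now', 'hello friend'],
    'target': ['a', 'a', 'a', 'b'],
})
tfidf_vector = TfidfVectorizer().fit_transform(train_df['text'])
X_train, y_train = get_upsampling(train_df, count_threshold=2,
                                  tfidf_vector=tfidf_vector)
# class 'b' (1 sample) is upsampled to 3 rows, matching class 'a'
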
def restoreMaskedBins(self):
    """
    Puts back into the matrix the bins removed
    """
    if len(self.orig_bin_ids) == 0:
        return
    # the rows to add are created as an empty sparse matrix
    M = self.matrix.shape[0]
    N = len(self.orig_bin_ids) - M
    rows_mat = csr_matrix((N, M))
    # cols to add
    cols_mat = csr_matrix((M + N, N))

    # add the rows and cols at the end of the
    # current matrix
    self.matrix = sparse_vstack([self.matrix, rows_mat])
    self.matrix = sparse_hstack([self.matrix, cols_mat], format='csr')

    # the new matrix has the right number of cols and rows, now
    # they need to be reordered to be back in their original places
    rows = cols = np.argsort(self.orig_bin_ids)
    self.matrix = self.matrix[rows, :][:, cols]
    self.cut_intervals = [self.orig_cut_intervals[x] for x in rows]

    self.interval_trees, self.chrBinBoundaries = \
        self.intervalListToIntervalTree(self.cut_intervals)

    # set as nan_bins the masked bins that were restored
    self.nan_bins = self.orig_bin_ids[M:]

    if self.correction_factors is not None:
        # add missing values as nans at end of array
        self.correction_factors = np.concatenate(
            [self.correction_factors, np.repeat(np.nan, N)])
        # reorder array
        self.correction_factors = self.correction_factors[rows]

    # reset orig bins ids and cut intervals
    self.orig_bin_ids = []
    self.orig_cut_intervals = []

    log.info("masked bins were restored\n")

def testMultiModel(X, y, numModels):
    # rows with a nonzero label are the positives, shared by every sub-model
    activeIndexTuple = y.nonzero()
    activeIndexValues = activeIndexTuple[0]
    activeTotalCount = activeIndexValues.shape[0]
    X_active = X[activeIndexValues, :]

    fs = frozenset(activeIndexValues)
    allIndices = [k for k in range(len(y))]
    nonActiveIndices = list(filter(lambda q: q not in fs, allIndices))
    nonActiveIndexValues = np.array(nonActiveIndices, dtype=np.int64)
    X_nonActive = X[nonActiveIndexValues, :]

    # split the negatives into numModels disjoint slices
    modelRangeList = getRangeList(len(nonActiveIndices), numModels)
    returnList = []
    for modelIndex in range(numModels):
        currentZerosList = modelRangeList[modelIndex]
        currentZerosArray = np.array(currentZerosList, dtype=np.int64)
        X_nonActiveCurrent = X_nonActive[currentZerosArray, :]
        # X_model = np.append(X_active, X_nonActiveCurrent)
        X_model = sparse_vstack([X_active, X_nonActiveCurrent]).tolil()
        y_model = [1] * X_active.shape[0]
        y_model.extend([0] * X_nonActiveCurrent.shape[0])
        print("Sub model X = " + str(X_model.shape))
        print("Sub model y = " + str(len(y_model)))
        print("Constructing model #" + str(modelIndex))
        returnList.append((X_model, y_model))
    return returnList

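# A hedged sketch of exercising testMultiModel(): getRangeList is not shown
# in the original, so a hypothetical chunking helper is assumed here. Each
# sub-model gets every positive row plus a disjoint slice of the negatives.
import numpy as np
from scipy.sparse import csr_matrix, vstack as sparse_vstack

def getRangeList(n, k):  # hypothetical stand-in for the original helper
    return [list(chunk) for chunk in np.array_split(np.arange(n), k)]

X = csr_matrix(np.random.rand(10, 4))
y = np.array([1, 0, 0, 1, 0, 0, 0, 1, 0, 0])
submodels = testMultiModel(X, y, numModels=3)  # 3 (X_model, y_model) pairs
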
def cross_validation_fold(index, splits_in, splits_out):
    """
    k-fold cross-validation "fold": performs validation using exactly one of
    the splits as validation set and the rest of the dataset as training data.

    :param index: Index of the split to use as validation data
    :param splits_in: List of splits of the original dataset inputs
    :param splits_out: List of splits of the original dataset outputs
    :return: The accuracy score for a LinearSVC trained on all the splits
        except <index> and then validated on split <index>
    """
    validation_in = splits_in[index]
    validation_out = splits_out[index]
    cf = LabelPowerset(LinearSVC())
    # train on all splits except split <index>
    cf.fit(np.vstack(splits_in[:index] + splits_in[index + 1:]),
           sparse_vstack(splits_out[:index] + splits_out[index + 1:]))
    # validate on split <index>
    return validate(cf, validation_in, validation_out,
                    return_predictions=False)

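# A hedged sketch of running one fold: the validate() helper is not shown in
# the original, so a hypothetical accuracy-based stand-in is assumed here,
# along with synthetic multilabel data and equal-size splits.
import numpy as np
from scipy.sparse import csr_matrix, vstack as sparse_vstack
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from skmultilearn.problem_transform import LabelPowerset

def validate(cf, X, Y, return_predictions=False):  # hypothetical stand-in
    return accuracy_score(Y, cf.predict(X))

X, Y = make_multilabel_classification(n_samples=120, random_state=0)
idx = np.array_split(np.arange(len(X)), 4)
splits_in = [X[i] for i in idx]
splits_out = [csr_matrix(Y[i]) for i in idx]
score = cross_validation_fold(0, splits_in, splits_out)
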
def _run_interface(self, runtime):
    from sklearn import linear_model as lm
    from scipy.sparse import vstack as sparse_vstack

    # Load in the fieldmap
    fmapnii = nb.load(self.inputs.in_data)
    data = fmapnii.get_fdata(dtype="float32")
    mask = (
        nb.load(self.inputs.in_mask).get_fdata() > 0
        if isdefined(self.inputs.in_mask)
        else np.ones_like(data, dtype=bool)
    )
    bs_spacing = [np.array(sp, dtype="float32") for sp in self.inputs.bs_spacing]

    # Recenter the fieldmap
    if self.inputs.recenter == "mode":
        from scipy.stats import mode

        data -= mode(data[mask], axis=None)[0][0]
    elif self.inputs.recenter == "median":
        data -= np.median(data[mask])
    elif self.inputs.recenter == "mean":
        data -= np.mean(data[mask])

    # Calculate the spatial location of control points
    bs_levels = []
    ncoeff = []
    weights = None
    for sp in bs_spacing:
        level = bspline_grid(fmapnii, control_zooms_mm=sp)
        bs_levels.append(level)
        ncoeff.append(level.dataobj.size)
        weights = (
            gbsw(fmapnii, level)
            if weights is None
            else sparse_vstack((weights, gbsw(fmapnii, level)))
        )

    regressors = weights.T.tocsr()[mask.reshape(-1), :]

    # Fit the model
    model = lm.Ridge(alpha=self.inputs.ridge_alpha, fit_intercept=False)
    model.fit(regressors, data[mask])

    interp_data = np.zeros_like(data)
    interp_data[mask] = np.array(model.coef_) @ regressors.T  # Interpolation

    # Store outputs
    out_name = fname_presuffix(
        self.inputs.in_data, suffix="_field", newpath=runtime.cwd
    )
    hdr = fmapnii.header.copy()
    hdr.set_data_dtype("float32")
    fmapnii.__class__(interp_data, fmapnii.affine, hdr).to_filename(out_name)
    self._results["out_field"] = out_name

    index = 0
    self._results["out_coeff"] = []
    for i, (n, bsl) in enumerate(zip(ncoeff, bs_levels)):
        out_level = out_name.replace("_field.", f"_coeff{i:03}.")
        bsl.__class__(
            np.array(model.coef_, dtype="float32")[index : index + n].reshape(
                bsl.shape
            ),
            bsl.affine,
            bsl.header,
        ).to_filename(out_level)
        index += n
        self._results["out_coeff"].append(out_level)

    # Write out fitting-error map
    self._results["out_error"] = out_name.replace("_field.", "_error.")
    fmapnii.__class__(
        data * mask - interp_data, fmapnii.affine, fmapnii.header
    ).to_filename(self._results["out_error"])

    if not self.inputs.extrapolate:
        return runtime

    if np.all(mask):
        self._results["out_extrapolated"] = self._results["out_field"]
        return runtime

    extrapolators = weights.tocsc()[:, ~mask.reshape(-1)]
    interp_data[~mask] = np.array(model.coef_) @ extrapolators  # Extrapolation

    self._results["out_extrapolated"] = out_name.replace("_field.", "_extra.")
    fmapnii.__class__(interp_data, fmapnii.affine, hdr).to_filename(
        self._results["out_extrapolated"]
    )
    return runtime

def _generate_feats(self, data, mode):
    # lexical feats
    # if mode == "train":
    #     self.tfidf_vect = TfidfVectorizer(ngram_range=self.ngram_rng,
    #                                       min_df=self.min_df,
    #                                       use_idf=self.use_idf)
    #     # the x[1:-1] strips the initial and final [ and ] from the texts
    #     self.tfidf_vect.fit([x[1:-1] for x in list(data.text)])
    # feats = self.tfidf_vect.transform([x[1:-1] for x in list(data.text)])
    feats = self.transformer_model.encode(
        [x[1:-1] for x in list(data.text)])
    feats = np.array(feats)

    if self.use_utterance_feats:
        # utterance feats
        ut_feats = np.zeros((data.shape[0], 3))
        current_mid = data.iloc[0, 8]
        current_max_timestamp = max(
            data[data.meeting_id == current_mid].timestamp)
        for i in range(data.shape[0]):
            text = data.iloc[i, 2][1:-1]
            timestamp = data.iloc[i, 1]
            # The first condition handles the end of the data frame (last
            # utterance of the last meeting); the second handles the breaking
            # point between two meetings (last utterance of every meeting).
            # Without the second we would get e.g. 1853.2 as the last
            # timestamp of meeting X and 0.0 as the first of meeting Y, and
            # the negative difference would mess things up down the line.
            next_timestamp = data.iloc[i + 1, 1] if (
                i + 1 < data.shape[0]
                and data.iloc[i + 1, 8] == data.iloc[i, 8]) else None
            ut_feats[i, 0] = len(text.split(" "))  # length in words
            # 2.0 is just an arbitrary approximate value for the duration
            # of the last utterance of each meeting
            ut_feats[i, 1] = (next_timestamp - timestamp
                              if next_timestamp is not None else 2.0)
            ut_feats[i, 2] = timestamp / current_max_timestamp
            if next_timestamp is None and i + 1 < data.shape[0]:
                # breaking point between meetings: update the running
                # values for the next iteration
                current_mid = data.iloc[i + 1, 8]
                current_max_timestamp = max(
                    data[data.meeting_id == current_mid].timestamp)
        feats = csr_matrix(sparse_hstack([feats, csr_matrix(ut_feats)]))

    # expand all utterance level feats to include feats of the prev and
    # next utterances
    prev_context_feat_mats, next_context_feat_mats = [], []
    # prev context
    for offset in range(1, self.prev_context_len + 1):
        context_feats = feats[:-offset, :]
        padding = csr_matrix(np.zeros((offset, feats.shape[1])))
        final = sparse_vstack((padding, context_feats))
        prev_context_feat_mats.append(final)
    # next context
    for offset in range(1, self.next_context_len + 1):
        context_feats = feats[offset:, :]
        padding = csr_matrix(np.zeros((offset, feats.shape[1])))
        final = sparse_vstack((context_feats, padding))
        next_context_feat_mats.append(final)
    # feats = sparse_hstack([feats] + prev_context_feat_mats
    #                       + next_context_feat_mats)

    if self.do_scaling:
        if mode == "train":
            self.scaler = StandardScaler(with_mean=False)
            self.scaler.fit(feats)
        feats = self.scaler.transform(feats)

    return feats

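# A small sketch of the context-padding trick above: shifting the feature
# matrix by one row and zero-padding aligns each utterance with its
# neighbour's features, ready to be hstacked as extra context columns.
import numpy as np
from scipy.sparse import csr_matrix, vstack as sparse_vstack

feats = csr_matrix(np.arange(12, dtype=float).reshape(4, 3))
offset = 1
padding = csr_matrix((offset, feats.shape[1]))
prev_ctx = sparse_vstack((padding, feats[:-offset, :]))  # row i holds row i-1
next_ctx = sparse_vstack((feats[offset:, :], padding))   # row i holds row i+1
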
def restoreMaskedBins(self):
    """
    Puts back into the matrix the bins removed

    Examples
    --------
    >>> from scipy.sparse import coo_matrix
    >>> row, col = np.triu_indices(5)
    >>> cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
    ...                  ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    >>> hic = hiCMatrix()
    >>> hic.nan_bins = []
    >>> matrix = np.array([
    ... [ 0, 10,  5, 3, 0],
    ... [ 0,  0, 15, 5, 1],
    ... [ 0,  0,  0, 7, 3],
    ... [ 0,  0,  0, 0, 1],
    ... [ 0,  0,  0, 0, 0]], dtype=np.int32)

    make the matrix symmetric:
    >>> hic.matrix = csr_matrix(matrix + matrix.T)
    >>> hic.setMatrix(csr_matrix(matrix + matrix.T), cut_intervals)

    Mask some bins:
    >>> hic.maskBins([3])
    >>> hic.matrix.todense()
    matrix([[ 0, 10,  5,  0],
            [10,  0, 15,  1],
            [ 5, 15,  0,  3],
            [ 0,  1,  3,  0]], dtype=int32)
    >>> hic.cut_intervals
    [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1), ('b', 40, 50, 1)]

    >>> hic.restoreMaskedBins()
    >>> hic.matrix.todense()
    matrix([[ 0., 10.,  5.,  0.,  0.],
            [10.,  0., 15.,  0.,  1.],
            [ 5., 15.,  0.,  0.,  3.],
            [ 0.,  0.,  0.,  0.,  0.],
            [ 0.,  1.,  3.,  0.,  0.]])

    >>> hic.cut_intervals
    [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    """
    if len(self.orig_bin_ids) == 0:
        return
    # the rows to add are created as an empty sparse matrix
    M = self.matrix.shape[0]
    N = len(self.orig_bin_ids) - M
    rows_mat = csr_matrix((N, M))
    # cols to add
    cols_mat = csr_matrix((M + N, N))

    # add the rows and cols at the end of the
    # current matrix
    self.matrix = sparse_vstack([self.matrix, rows_mat])
    self.matrix = sparse_hstack([self.matrix, cols_mat], format='csr')

    # the new matrix has the right number of cols and rows, now
    # they need to be reordered to be back in their original places
    rows = cols = np.argsort(self.orig_bin_ids)
    self.matrix = self.matrix[rows, :][:, cols]
    self.cut_intervals = [self.orig_cut_intervals[x] for x in rows]

    self.interval_trees, self.chrBinBoundaries = \
        self.intervalListToIntervalTree(self.cut_intervals)

    # set as nan_bins the masked bins that were restored
    self.nan_bins = self.orig_bin_ids[M:]

    if self.correction_factors is not None:
        # add missing values as nans at end of array
        self.correction_factors = np.concatenate(
            [self.correction_factors, np.repeat(np.nan, N)])
        # reorder array
        self.correction_factors = self.correction_factors[rows]

    # reset orig bins ids and cut intervals
    self.orig_bin_ids = []
    self.orig_cut_intervals = []

    log.info("masked bins were restored\n")

lsi_model = gensim.models.LsiModel.load('exports/lsi.model')

lsi_releasenotes_vecs = np.zeros((1, num_topics))
lsi_reviews_vecs = np.zeros((1, num_topics))
# (the lda_* accumulators are presumably initialised earlier in the script)

count = 0
for doc in releasenotes:
    if gensim_tfidf[doc.id - 3717281]:
        lda_releasenotes_vecs = np.vstack(
            (lda_releasenotes_vecs,
             gensim.matutils.corpus2dense(
                 lda_model[[gensim_tfidf[doc.id - 3717281]]],
                 num_topics).T))
    else:
        lda_releasenotes_vecs = np.vstack(
            (lda_releasenotes_vecs, np.zeros((1, num_topics))))
    lsi_releasenotes_vecs = np.vstack(
        (lsi_releasenotes_vecs,
         gensim.matutils.corpus2dense(
             lsi_model[[gensim_tfidf[doc.id - 3717281]]],
             num_topics).T))
    if count == 0:
        tfidf_releasenotes_vecs = tfidf[doc.id - 3717281]
    else:
        tfidf_releasenotes_vecs = sparse_vstack(
            (tfidf_releasenotes_vecs, tfidf[doc.id - 3717281]))
    count += 1

# drop the all-zeros initialisation row
lda_releasenotes_vecs = np.delete(lda_releasenotes_vecs, 0, 0)
lsi_releasenotes_vecs = np.delete(lsi_releasenotes_vecs, 0, 0)

count = 0
for doc in reviews:
    if gensim_tfidf[doc.id - 3717281]:
        # we could send the whole corpus of this app_id to the model in one step
        lda_reviews_vecs = np.vstack(
            (lda_reviews_vecs,
             gensim.matutils.corpus2dense(
                 lda_model[[gensim_tfidf[doc.id - 3717281]]],
                 num_topics).T))
    else:
        lda_reviews_vecs = np.vstack(
            (lda_reviews_vecs, np.zeros((1, num_topics))))
    lsi_reviews_vecs = np.vstack(
        (lsi_reviews_vecs,
         gensim.matutils.corpus2dense(
             lsi_model[[gensim_tfidf[doc.id - 3717281]]],
             num_topics).T))
    if count == 0:
        tfidf_reviews_vecs = tfidf[doc.id - 3717281]

def vstack(vectors):
    # dispatch on the matrix type; scipy.sparse.issparse covers all sparse
    # formats (the original only matched coo_matrix exactly)
    if scipy.sparse.issparse(vectors[0]):
        return sparse_vstack(vectors)
    else:
        return np.vstack(vectors)

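# A quick check of the dispatcher on both input kinds.
import numpy as np
import scipy.sparse
from scipy.sparse import vstack as sparse_vstack

dense_stack = vstack([np.ones((2, 3)), np.zeros((1, 3))])        # ndarray, 3 x 3
sparse_stack = vstack([scipy.sparse.coo_matrix(np.eye(2))] * 2)  # sparse, 4 x 2
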