def tfidf_nmf(release_texts, n_components=10, max_features=None):
    '''
    Creates and fits tfidf and NMF models.

    INPUT:
    - n_components: number of latent features for the NMF model to find
    - max_features: max number of features (vocabulary size) for the tfidf model to consider
    OUTPUT:
    - tfidf_vectorizer: tfidf model object
    - tfidf_sparse: tfidf sparse matrix
    - nmf: NMF model object
    - W: feature matrix output from NMF factorization into W and H matrices
    '''
    # tfidf model
    custom_stop_words = make_stop_words()
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       stop_words=custom_stop_words,
                                       max_features=max_features)
    tfidf_sparse = tfidf_vectorizer.fit_transform(release_texts)

    # normalize row-wise so each row sums to one
    tfidf_sparse = normalize(tfidf_sparse, axis=1, norm='l1')

    # nmf model
    nmf = NMF(n_components=n_components, random_state=1)
    nmf.fit(tfidf_sparse)
    W = nmf.transform(tfidf_sparse)
    return tfidf_vectorizer, tfidf_sparse, nmf, W
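# A minimal usage sketch for tfidf_nmf, assuming make_stop_words() is
# importable from the same module and the corpus is large enough to satisfy
# min_df=2; the documents below are hypothetical.
docs = [
    "wheat prices rose on export news",
    "wheat prices fell after the report",
    "corn prices rose to a record",
    "corn harvest fell short of forecasts",
]
vectorizer, tfidf, nmf_model, W = tfidf_nmf(docs, n_components=2)
print(W.shape)  # (n_documents, n_components): topic weight per document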
def produceEncoding(trainX, nComponents):
    '''Produces an NMF encoding from the training data matrix'''
    model = NMF(n_components=nComponents, solver='cd',
                tol=1e-4, max_iter=200, alpha=0.0)
    model.fit(trainX)
    return model
def __Factorize_NMF(self, K):
    model = NMF(n_components=K, max_iter=self._iteration)
    # fit_transform both fits the model and returns W in a single pass,
    # avoiding the cost of fitting twice
    user_fmat = model.fit_transform(self._mat)
    item_fmat = model.components_.T
    return user_fmat, item_fmat
def plot_nmf_illustration():
    rnd = np.random.RandomState(5)
    X_ = rnd.normal(size=(300, 2))
    # Add 8 to make sure every point lies in the positive part of the space
    X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2) + 8

    nmf = NMF(random_state=0)
    nmf.fit(X_blob)
    X_nmf = nmf.transform(X_blob)

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0,
                    s=60, cmap='viridis')
    axes[0].set_xlabel("feature 1")
    axes[0].set_ylabel("feature 2")
    axes[0].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1],
                  width=.1, head_width=.3, color='k')
    axes[0].arrow(0, 0, nmf.components_[1, 0], nmf.components_[1, 1],
                  width=.1, head_width=.3, color='k')
    axes[0].set_aspect('equal')
    axes[0].set_title("NMF with two components")

    # second plot
    nmf = NMF(random_state=0, n_components=1)
    nmf.fit(X_blob)

    axes[1].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0,
                    s=60, cmap='viridis')
    axes[1].set_xlabel("feature 1")
    axes[1].set_ylabel("feature 2")
    axes[1].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1],
                  width=.1, head_width=.3, color='k')
    axes[1].set_aspect('equal')
    axes[1].set_title("NMF with one component")
def get_topics_nmf(urls, num_topics):
    '''Input: URLs linking to each document (pdf) in the corpus (e.g. arxiv)
    Output: the num_topics most important latent topics from the corpus (via NMF)
    '''
    article_info = []
    for url in urls:
        article_info.append(get_text(url))
    text = []
    for thing in article_info:
        text.extend(thing[0])
    text = clean_pdf_text(text)
    tfidf_math = TfidfVectorizer(max_features=100, stop_words=math_stop(),
                                 ngram_range=(1, 1), decode_error='ignore')
    M = tfidf_math.fit_transform(text)
    feature_names = tfidf_math.get_feature_names()
    feature_names = [WordNetLemmatizer().lemmatize(word)
                     for word in feature_names]

    nmf = NMF(n_components=num_topics)
    nmf.fit(M)
    topics = []
    for topic_idx, topic in enumerate(nmf.components_):
        topics.append(" ".join([feature_names[i]
                                for i in topic.argsort()[:-10 - 1:-1]]))
    return M, topics, text, urls
def fit_nmf(tfidf, tfidf_vectorizer, n_topics=10, n_top_words=10):
    '''takes in a tfidf sparse matrix and finds the top topics'''
    nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
    nmf.fit(tfidf)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    nmf_topic_dict = print_top_words(nmf, tfidf_feature_names, n_top_words)
    return nmf, nmf_topic_dict
class NMFReducer():
    def __init__(self, dataset, dataset_name, num_components=10):
        self.dataset = dataset
        self.dataset_name = dataset_name
        self.labels = dataset.target
        self.scaler = MinMaxScaler()
        self.data = self.scaler.fit_transform(dataset.data)
        self.n_samples, self.n_features = self.data.shape
        self.reducer = NMF(n_components=num_components, max_iter=5000)

    def reduce(self):
        self.reducer.fit(self.data)
        self.reduced = self.scaler.fit_transform(self.reducer.transform(self.data))
        return self.reduced

    def benchmark(self, estimator, name, data):
        t0 = time()
        sample_size = 300
        labels = self.labels

        estimator.fit(data)
        print('% 9s  %.2fs  %i  %.3f  %.3f  %.3f  %.3f  %.3f  %.3f'
              % (name, (time() - t0), estimator.inertia_,
                 metrics.homogeneity_score(labels, estimator.labels_),
                 metrics.completeness_score(labels, estimator.labels_),
                 metrics.v_measure_score(labels, estimator.labels_),
                 metrics.adjusted_rand_score(labels, estimator.labels_),
                 metrics.adjusted_mutual_info_score(labels, estimator.labels_),
                 metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)))

    def display_reduced_digits(self):
        sys.stdout = open('out/NMFReduceDigitsOutput.txt', 'w')
        print("NMF Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print(self.reduced)
        print("\nLength of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print(40 * '-')
        print(self.reducer.reconstruction_err_)

    def display_reduced_iris(self):
        sys.stdout = open('out/NMFReduceIrisOutput.txt', 'w')
        print("NMF Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print(self.reduced)
        print("\nLength of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print(40 * '-')
        print(self.reducer.reconstruction_err_)

    def reduce_crossvalidation_set(self, X_train, X_test):
        self.reducer.fit(X_train)
        # project both splits with the fitted NMF model (not the scaler)
        reduced_X_train = self.reducer.transform(X_train)
        reduced_X_test = self.reducer.transform(X_test)
        return reduced_X_train, reduced_X_test
def NMF_train(X_train, X_test, n):
    nmf_model = NMF(n_components=n)
    # fit_transform fits the model and returns W in a single pass
    W = nmf_model.fit_transform(X_train)
    H = nmf_model.components_
    W_test = nmf_model.transform(X_test)
    return H, W, W_test
def nmf_error(eta):
    '''
    Decompose the ETAs using nonnegative matrix factorization with 1 component.
    The reconstruction error is an estimate of the non-inspiratory activity.
    '''
    nmf = NMF(n_components=1)
    nmf.fit(eta)
    reconstruction_err = nmf.reconstruction_err_
    return float(reconstruction_err)
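# A quick check of what reconstruction_err_ measures, on synthetic data:
# with the default Frobenius beta-loss it is the Frobenius norm of X - WH,
# so the manual norm below should match it (both use the same W and H).
import numpy as np
from sklearn.decomposition import NMF

X = np.random.RandomState(0).rand(10, 6)
m = NMF(n_components=1, random_state=0)
W = m.fit_transform(X)
manual_err = np.linalg.norm(X - W @ m.components_)
print(m.reconstruction_err_, manual_err)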
def get_factorization(V, num_roles):
    """
    Obtains a nonnegative matrix factorization of the matrix V
    with num_roles intermediate roles.
    """
    model = NMF(n_components=num_roles, init='random', random_state=0)
    model.fit(V)
    node_roles = model.transform(V)
    role_features = model.components_
    return np.matrix(node_roles), np.matrix(role_features)
def performNMF(M, fragmentsLookupTable, fragmentsCount):
    if args.verbose:
        print("- %s START  : calculating NMF" % (timeStamp()), file=sys.stdout)
    t0 = time()
    # note: the 'sparseness' and 'beta' options only exist in old
    # scikit-learn releases; they were removed in later versions
    model = NMF(n_components=args.components, init='nndsvd', beta=10000.0,
                max_iter=1000, tol=5e-3, sparseness='components')
    model.fit(M)
    train_time = (time() - t0)
    components_ = model.components_
    N = model.transform(M)
    if args.verbose:
        print("- %s FINISH : calculating NMF" % (timeStamp()), file=sys.stdout)

    if args.verbose:
        print("- %s START  : mapping components" % (timeStamp()), file=sys.stdout)

    # convert components into locations
    for i in range(args.components):
        # open in text mode so formatted strings can be written directly
        output = gzip.open(args.outdir + "/NMF_component_" + str(i) + ".txt.gz", 'wt')
        if args.verbose:
            print("-           : processing component %d" % (i), file=sys.stdout)
        try:
            for j in range(model.components_[i].shape[0]):
                # if (model.components_[i][j] != 0):
                fragment1 = j // fragmentsCount
                fragment2 = j % fragmentsCount
                (chr1, midpoint1) = fragmentsLookupTable[fragment1]
                (chr2, midpoint2) = fragmentsLookupTable[fragment2]
                output.write("%s\t%i\t%s\t%i\t%f\n"
                             % (chr1, midpoint1, chr2, midpoint2,
                                model.components_[i][j]))
        finally:
            output.close()
    if args.verbose:
        print("- %s FINISH : mapping components" % (timeStamp()), file=sys.stdout)
    return (N, model)
def fit(self):
    nmf = NMF(**self.fit_parameters)
    nmf.fit(self.input_data)
    self.output_data = nmf.transform(self.input_data)
    self.mapper_data = nmf.components_
    self.model_attributes = {"n_topics": nmf.n_components, }
    self._log_model_results()
    return self
def _get(self, index, block, shape):
    offset = (asarray(index[1]) * asarray(shape))[1:]
    dims = block.shape[1:]
    max_size = prod(dims) / 2 if self.max_size == 'full' else self.max_size

    # reshape to t x spatial dimensions
    data = block.reshape(block.shape[0], -1)

    # build and apply NMF model to block
    model = SKNMF(self.k, max_iter=self.max_iter)
    model.fit(clip(data, 0, inf))

    # reconstruct sources as spatial objects in one array
    components = model.components_.reshape((self.k,) + dims)

    # convert from basis functions into shape
    # by median filtering (optional), applying a percentile threshold,
    # finding connected components and removing small objects
    combined = []
    for component in components:
        tmp = component > percentile(component, self.percentile)
        regions = remove_small_objects(label(tmp), min_size=self.min_size)
        ids = unique(regions)
        ids = ids[ids > 0]
        for ii in ids:
            r = regions == ii
            r = median_filter(r, 2)
            coords = asarray(where(r)).T + offset
            if (size(coords) > 0) and (size(coords) < max_size):
                combined.append(one(coords))

    # merge overlapping sources
    if self.overlap is not None:

        # iterate over source pairs and find a pair to merge
        def merge(sources):
            for i1, s1 in enumerate(sources):
                for i2, s2 in enumerate(sources[i1+1:]):
                    if s1.overlap(s2) > self.overlap:
                        return i1, i1 + 1 + i2
            return None

        # merge pairs until none left to merge
        pair = merge(combined)
        testing = True
        while testing:
            if pair is None:
                testing = False
            else:
                combined[pair[0]] = combined[pair[0]].merge(combined[pair[1]])
                del combined[pair[1]]
                pair = merge(combined)

    return combined
def get_latent_vector(X):
    # for language: n_components=150
    # for repo: n_components=?
    model = NMF(n_components=150, init='nndsvd', max_iter=1000,
                random_state=1126)
    print('NMF', model)
    model.fit(X)
    W = model.transform(X)
    H = model.components_
    normalized_matrix = normalize(W, axis=1, norm='l2')
    return normalized_matrix
def reduce_dimensions(total_mat, n_topics):
    """
    Calculates and returns the NMF document-topic matrix.
    Input is data matrix, shape (n_samples, n_features);
    returns W array, shape (n_samples, n_components)
    """
    nmf = NMF(n_components=n_topics, random_state=42, alpha=.2, l1_ratio=0.5)
    W = nmf.fit_transform(total_mat)
    return W
class MatrixFactorization:
    def __init__(self):
        self.nmf = NMF()

    def fit(self, X):
        self.nmf.fit(X)
        u = self.nmf.transform(X)
        return u.dot(self.nmf.components_)

    def predict(self, X):
        u = self.nmf.transform(X)
        return u.dot(self.nmf.components_)
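# Sketch of how MatrixFactorization can be used for low-rank reconstruction;
# the nonnegative matrix X below is synthetic, and NMF is assumed imported
# from sklearn.decomposition as in the class above.
import numpy as np

X = np.abs(np.random.RandomState(0).randn(20, 8))
mf = MatrixFactorization()
X_hat = mf.fit(X)            # reconstruction u @ H of the training matrix
X_hat_again = mf.predict(X)  # same projection, reusing the fitted factors
print(np.linalg.norm(X - X_hat))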
class DescriptionNMF(DescriptionVector):
    def __init__(self, dataset, n_topics=None, vocabulary_size=None):
        super(DescriptionNMF, self).__init__(dataset, vocabulary_size)
        self.n_topics = n_topics
        self.transformer = NMF(n_components=n_topics)
        self.transformer.fit(self.features)

    def get_components(self):
        words = self.vectorizer.get_feature_names()
        words = numpy.array(words, dtype=object)
        indices = numpy.argsort(-numpy.absolute(self.transformer.components_))
        return words[indices]
def nmf(X, n, binary, d):
    name = "nmf" + str(n)
    if binary:
        name = name + "_binary_" + d
    print(name)
    model = NMF(n_components=n)
    # fit_transform fits the model and returns Z in a single pass
    Z = model.fit_transform(X)
    A = model.components_
    A_T = A.transpose()

    def get_prob(i, j):
        return np.sum((Z[i, :] * A[:, j]))

    display_results(name, get_prob, binary, d)
def learnNMFDict(features, components=25):
    from sklearn.decomposition import NMF
    nmfHOG = NMF(n_components=components)
    nmfHOF = NMF(n_components=components)

    nmfHOG.fit(np.array([x['hog'] for x in features]).T)
    nmfHOF.fit(np.array([x['hof'] for x in features]).T)

    # basis vectors learned by each fitted NMF model
    hogComponents = nmfHOG.components_.T
    hofComponents = nmfHOF.components_.T

    return hogComponents, hofComponents
def nmf_new(mut_final, mut_diff, mut_mean_qn, mut_median_qn, n_components,
            init='nndsvdar', random_state=0):
    # Numerical solver to use:
    # 'pg' is a Projected Gradient solver (deprecated).
    # 'cd' is a Coordinate Descent solver (recommended).
    model = NMF(n_components=n_components, init=init, random_state=random_state)

    # TODO: run these four decompositions in a loop
    # fit_transform is more efficient than calling fit followed by transform
    patient_strat = np.argmax(model.fit_transform(mut_final), axis=1).copy()
    gene_comp = model.components_.copy()

    patient_strat_diff = np.argmax(model.fit_transform(mut_diff), axis=1).copy()
    gene_comp_diff = model.components_.copy()

    patient_strat_mean_qn = np.argmax(model.fit_transform(mut_mean_qn), axis=1).copy()
    gene_comp_mean_qn = model.components_.copy()

    patient_strat_median_qn = np.argmax(model.fit_transform(mut_median_qn), axis=1).copy()
    gene_comp_median_qn = model.components_.copy()

    return (gene_comp, patient_strat, gene_comp_diff, patient_strat_diff,
            gene_comp_mean_qn, patient_strat_mean_qn,
            gene_comp_median_qn, patient_strat_median_qn)
def nmf_old(mut_final, mut_diff, mut_mean_qn, mut_median_qn, n_components,
            init='nndsvdar', random_state=0):
    # fit followed by transform
    model = NMF(n_components=n_components, init=init, random_state=random_state)

    # TODO: run these four decompositions in a loop
    model.fit(mut_final)
    gene_comp = model.components_.copy()
    patient_strat = np.argmax(model.transform(mut_final), axis=1).copy()

    model.fit(mut_diff)
    gene_comp_diff = model.components_.copy()
    patient_strat_diff = np.argmax(model.transform(mut_diff), axis=1).copy()

    model.fit(mut_mean_qn)
    gene_comp_mean_qn = model.components_.copy()
    patient_strat_mean_qn = np.argmax(model.transform(mut_mean_qn), axis=1).copy()

    model.fit(mut_median_qn)
    gene_comp_median_qn = model.components_.copy()
    patient_strat_median_qn = np.argmax(model.transform(mut_median_qn), axis=1).copy()

    return (gene_comp, patient_strat, gene_comp_diff, patient_strat_diff,
            gene_comp_mean_qn, patient_strat_mean_qn,
            gene_comp_median_qn, patient_strat_median_qn)
def nmf_faces(X_train, X_test):
    # Build NMF models with 10, 50, 100 and 500 components
    # this list will hold the back-transformed test data
    reduced_images = []
    for n_components in [10, 50, 100, 500]:
        # build the NMF model
        nmf = NMF(n_components=n_components, random_state=0)
        nmf.fit(X_train)
        # transform the test data (afterwards it has n_components dimensions)
        X_test_nmf = nmf.transform(X_test)
        # back-transform the transformed test data
        # (afterwards it's in the original space again)
        X_test_back = np.dot(X_test_nmf, nmf.components_)
        reduced_images.append(X_test_back)
    return reduced_images
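# Usage sketch for nmf_faces, assuming X_train/X_test are row-flattened,
# nonnegative face images (e.g. the LFW people data used elsewhere in this
# collection) and numpy is imported as np.
backs = nmf_faces(X_train, X_test)
for k, X_back in zip([10, 50, 100, 500], backs):
    mse = np.mean((X_test - X_back) ** 2)
    print("%3d components: reconstruction MSE %.5f" % (k, mse))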
def test_nmf_fit_close(solver):
    rng = np.random.mtrand.RandomState(42)
    # Test that the fit is not too far away
    pnmf = NMF(5, solver=solver, init='nndsvdar', random_state=0, max_iter=600)
    X = np.abs(rng.randn(6, 5))
    assert_less(pnmf.fit(X).reconstruction_err_, 0.1)
def get_topics(n_components=10, n_top_words=15, print_output=True):
    custom_stop_words = make_stop_words(new_stop_words)
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       stop_words=custom_stop_words)
    tfidf = tfidf_vectorizer.fit_transform(release_texts)
    tfidf = row_normalize_tfidf(tfidf)

    nmf = NMF(n_components=n_components, random_state=1)
    # nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
    nmf.fit(tfidf)
    W = nmf.transform(tfidf)

    if print_output:
        print("\nTopics in NMF model:")
        tfidf_feature_names = tfidf_vectorizer.get_feature_names()
        print_top_words(nmf, tfidf_feature_names, n_top_words)
    return tfidf, nmf, W
def nmf(x, n_topics):
    print("Non Negative Matrix Factorization (NMF), topics={}".format(n_topics))
    nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
    x = to_ndarray(x)
    nmf_fitted = nmf.fit(x)
    return nmf_fitted.transform(x), nmf_fitted
def test_nmf_fit_close(solver):
    rng = np.random.mtrand.RandomState(42)
    # Test that the fit is not too far away
    pnmf = NMF(
        5,
        solver=solver,
        init="nndsvdar",
        random_state=0,
        max_iter=600,
    )
    X = np.abs(rng.randn(6, 5))
    assert pnmf.fit(X).reconstruction_err_ < 0.1
def compute_PCA_ICA_NMF(n_components=5):
    spec_mean = spectra.mean(0)

    # PCA: use randomized PCA for speed
    pca = RandomizedPCA(n_components - 1)
    pca.fit(spectra)
    pca_comp = np.vstack([spec_mean, pca.components_])

    # ICA treats sequential observations as related. Because of this, we need
    # to fit with the transpose of the spectra
    ica = FastICA(n_components - 1)
    ica.fit(spectra.T)
    ica_comp = np.vstack([spec_mean, ica.transform(spectra.T).T])

    # NMF requires all elements of the input to be nonnegative
    spectra[spectra < 0] = 0
    nmf = NMF(n_components)
    nmf.fit(spectra)
    nmf_comp = nmf.components_

    return pca_comp, ica_comp, nmf_comp
class NMFSpectrum(SparseApproxSpectrum):
    def __init__(self, **kwargs):
        SparseApproxSpectrum.__init__(self, **kwargs)

    def extract_codes(self, X, **kwargs):
        self.standardize = False
        self._extract_data_patches(X)
        kwargs.setdefault('sparseness', 'components')
        kwargs.setdefault('init', 'nndsvd')
        kwargs.setdefault('beta', 0.5)
        print("NMF...")
        self.model = NMF(n_components=self.n_components, **kwargs)
        self.model.fit(self.data)
        self.D = self.model
        return self

    def reconstruct_spectrum(self, w=None, randomize=False):
        if w is None:
            self.w = self.model.transform(self.data)
            w = self.w
        return SparseApproxSpectrum.reconstruct_spectrum(self, w=w,
                                                         randomize=randomize)
def plot_reconstruction_error(matrix, lower, upper):
    """Plotting function for reconstruction error of NMF models
    vs the number of components

    Parameters
    ----------
    matrix : pivoted input matrix for NMF fitting
    lower : lower bound on number of components
    upper : upper bound on number of components

    Returns
    -------
    saved figure
    """
    nmf_results = []
    for k in range(lower, upper + 1):
        model = NMF(n_components=k, init='random', random_state=0)
        model.fit(matrix)
        nmf_results.append((k, model.reconstruction_err_))
    ax = plt.scatter(*zip(*nmf_results))
    plt.xlabel('N Clusters')
    plt.ylabel('Reconstruction Error')
    plt.title('N Clusters Vs Associated Reconstruction Error')
    return ax
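# Sketch: an elbow scan with plot_reconstruction_error, assuming `pivot` is
# any nonnegative pivoted matrix (e.g. users x items) and matplotlib.pyplot
# is imported as plt, as in the function above.
ax = plot_reconstruction_error(pivot, lower=2, upper=15)
plt.savefig('nmf_reconstruction_error.png')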
class NMFFeatures(object):
    def __init__(self, num_components=128, features=None):
        self.nmf = None
        self.feature_extractor = None
        self.num_components = num_components
        self.features = features

    def fit(self, X, y=None):
        self.nmf = NMF(n_components=self.num_components, max_iter=50,
                       random_state=42)
        self.feature_extractor = TfidfVectorizer(tokenizer=utils.get_tokens,
                                                 ngram_range=(1, 1),
                                                 stop_words='english',
                                                 vocabulary=self.features)
        transformed_features = self.feature_extractor.fit_transform(X)
        self.nmf.fit(transformed_features)

    def transform(self, X):
        transformed_features = self.feature_extractor.transform(X)
        return self.nmf.transform(transformed_features)

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)
def segregate_topic(self, thresh=None):
    # number of topics: use self.topic unless an explicit thresh is given
    n_components = self.topic if thresh is None else thresh
    d = {}
    tfidf = TfidfVectorizer(max_df=0.96, min_df=2, stop_words="english")
    x = tfidf.fit_transform(self.df["cleaned_text"])
    nmf_model = NMF(n_components=n_components, random_state=21)
    nmf_model.fit(x)
    for index, topic in enumerate(nmf_model.components_):
        d[index] = [tfidf.get_feature_names()[i]
                    for i in topic.argsort()[-20:]]
    result = nmf_model.transform(x)
    y = result.argmax(axis=1)
    self.df["topic_label"] = y
    return (self.df, d)
def fit(self, num_factors=100, l1_ratio=0.5, solver="multiplicative_update",
        init_type="random", beta_loss="frobenius", verbose=False,
        random_seed=None):

    assert l1_ratio >= 0 and l1_ratio <= 1, \
        "{}: l1_ratio must be between 0 and 1, provided value was {}".format(
            self.RECOMMENDER_NAME, l1_ratio)

    if solver not in self.SOLVER_VALUES:
        raise ValueError("Value for 'solver' not recognized. Acceptable values are {}, provided was '{}'".format(
            self.SOLVER_VALUES.keys(), solver))

    if init_type not in self.INIT_VALUES:
        raise ValueError("Value for 'init_type' not recognized. Acceptable values are {}, provided was '{}'".format(
            self.INIT_VALUES, init_type))

    if beta_loss not in self.BETA_LOSS_VALUES:
        raise ValueError("Value for 'beta_loss' not recognized. Acceptable values are {}, provided was '{}'".format(
            self.BETA_LOSS_VALUES, beta_loss))

    start_time = time.time()
    self._print("Computing NMF decomposition...")

    nmf_solver = NMF(n_components=num_factors,
                     init=init_type,
                     solver=self.SOLVER_VALUES[solver],
                     beta_loss=beta_loss,
                     random_state=random_seed,
                     l1_ratio=l1_ratio,
                     shuffle=True,
                     verbose=verbose,
                     max_iter=500)

    nmf_solver.fit(self.URM_train)

    self.ITEM_factors = nmf_solver.components_.copy().T
    self.USER_factors = nmf_solver.transform(self.URM_train)

    new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() - start_time)
    self._print("Computing NMF decomposition... done in {:.2f} {}".format(
        new_time_value, new_time_unit))
def fit(self, data):
    """
    fit searches for the background profile using NMF decomposition

    - (N, M, M) image series --> (N, M**2) flattened images
    - (N, M**2) = (N, n_components) @ (n_components, M**2) NMF decomposition
    - background: (n_components, M**2) --> (important_components, M**2)

    The number of components for dimensionality reduction
    (self.n_components) and the number of components to account for
    (self.important_components) are attributes of the estimator.

    Parameters
    ----------
    data : np.ndarray
        Input data (series of 2D images, 3D total)

    Returns
    -------
    np.ndarray
        Background profile
    """
    X = data.reshape(data.shape[0], -1)
    nmf = NMF(
        n_components=self.n_components,
    )
    nmf.fit(X)
    coeffs = nmf.transform(X)
    bg_full = nmf.components_[: self.important_components, :].reshape(
        (-1, *data.shape[1:])
    )
    # memorize scale factors and background
    self._scales = coeffs[:, : self.important_components].reshape(1, -1)
    self._bg = bg_full
    return bg_full
def predict_new_input(mat, mv, mv_re, id_list, rate_list):
    # mv is the hashing dictionary
    n, m = mat.shape
    new_row = np.zeros(m).reshape((1, m))
    for id, rate in zip(id_list, rate_list):
        new_row[0, mv_re[id]] = rate
    mat = np.append(mat, new_row, axis=0)
    n = n + 1
    # check whether to discard the two lines below:
    known = [x for x in range(m) if mat[-1, x] != 0]
    # indices of all movies in the matrix whose ratings are to be predicted
    desired = [x for x in range(m) if mat[-1, x] == 0]
    predicts = dict()  # key is movie_id and value is the predicted rating
    T1 = mat[:, known]
    model = NMF(n_components=3, tol=0.005)
    W1 = model.fit_transform(T1)
    H1 = model.components_
    for i in desired:
        # each time focus on an even smaller matrix with only one desired
        # entry to predict; known + [i] leaves the `known` list unchanged
        cols = known + [i]
        T2 = mat[:-1, cols]
        model2 = NMF(n_components=3, tol=0.005)
        W2 = model2.fit_transform(T2)
        H2 = model2.components_
        predicts[mv[i]] = np.dot(H2[:, -1], W1[-1, :])
    return predicts
def denoise(self, data, center, radius=45):
    """
    nmf_denoise performs NMF-decomposition based denoising

    - (N, M, M) image series --> (N, M**2) flattened images
    - (N, M**2) = (N, n_components) @ (n_components, M**2) NMF decomposition
    - background: (n_components, M**2) --> (important_components, M**2)
    - scales: (N, n_components) --> (N, important_components)
    - scaled_background = scales @ background
    - return arr - scaled_background

    The number of components for dimensionality reduction
    (self.n_components) and the number of components to account for
    (self.important_components) are attributes of the estimator.

    Parameters
    ----------
    data : np.ndarray
        Input data (series of 2D images, 3D total)
    center : tuple
        (corner_x, corner_y) tuple
    radius : int, optional
        mask radius passed to apply_mask, by default 45

    Returns
    -------
    np.ndarray
        Denoised data
    """
    img_shape = data.shape[1:]
    X = data.reshape(data.shape[0], -1)
    nmf = NMF(n_components=self.n_components)
    nmf.fit(X)
    coeffs = nmf.transform(X)
    bg_full = nmf.components_
    bg_scaled = (coeffs[:, :self.important_components]
                 @ bg_full[:self.important_components, :]).reshape(
                     data.shape[0], *img_shape)
    return apply_mask(data - bg_scaled, radius=radius, center=center)
def get_NMF(fileData, normalized_axis=1, norm='l2'):
    data = np.load(fileData)
    n_components = np.size(np.unique(data['labels']))
    features = data['data']
    print(features)
    features = as_float_array(features)
    if normalized_axis is not None:
        features = normalize(features, norm=norm, axis=normalized_axis)
    model = NMF(n_components=n_components)
    print(n_components)
    print(model)
    model.fit(np.transpose(features))
    G = model.components_
    print(G)
    # assign each sample to its strongest component
    labels_pred = np.zeros(features.shape[0])
    for i in range(G.shape[1]):
        labels_pred[i] = np.argmax(G[:, i])
    labels_true = data['labels']
    return labels_true, labels_pred, features
def in36():
    from sklearn.datasets import fetch_lfw_people
    people = fetch_lfw_people(min_faces_per_person=20, resize=0.7)
    mask = np.zeros(people.target.shape, dtype=bool)  # np.bool is deprecated
    for target in np.unique(people.target):
        mask[np.where(people.target == target)[0][:50]] = 1
    x_people = people.data[mask]
    y_people = people.target[mask]
    x_people = x_people / 255

    from sklearn.neighbors import KNeighborsClassifier
    x_train, x_test, y_train, y_test = train_test_split(
        x_people, y_people, stratify=y_people, random_state=0)

    from sklearn.decomposition import NMF
    nmf = NMF(n_components=15, random_state=0)
    nmf.fit(x_train)
    x_train_nmf = nmf.transform(x_train)
    x_test_nmf = nmf.transform(x_test)
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(x_train_nmf, y_train)
    print(knn.score(x_test_nmf, y_test))
def retrain_nmf():
    # this function retrains the NMF model periodically;
    # it should be trained on the latest user-ratings matrix available
    R = np.array(session.query(umr).all()).T
    # create a model and set the hyperparameters
    # the model assumes R ~ PQ'
    model = NMF(
        n_components=10,
        init='random',
        random_state=10,
    )
    model.fit(R)
    Q = model.components_  # movie-genre matrix
    P = model.transform(R)  # user-genre matrix
    error = model.reconstruction_err_  # reconstruction error
    nR = np.dot(P, Q)  # the reconstructed matrix
    # pickle the model
    list_pickle_path = os.path.dirname(os.path.abspath(__file__)) + '/nmf.pkl'
    nmf_pickle = open(list_pickle_path, 'wb')
    picklerick.dump(model, nmf_pickle)
    nmf_pickle.close()
    return
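# Sketch of the matching load step for the model pickled by retrain_nmf,
# assuming picklerick is this module's alias for the standard pickle module.
import os

path = os.path.dirname(os.path.abspath(__file__)) + '/nmf.pkl'
with open(path, 'rb') as f:
    model = picklerick.load(f)
print(model.n_components)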
class NMFImpl:
    def __init__(
        self,
        n_components=None,
        init=None,
        solver="cd",
        beta_loss="frobenius",
        tol=0.0001,
        max_iter=200,
        random_state=None,
        alpha=0.0,
        l1_ratio=0.0,
        verbose=0,
        shuffle=False,
    ):
        self._hyperparams = {
            "n_components": n_components,
            "init": init,
            "solver": solver,
            "beta_loss": beta_loss,
            "tol": tol,
            "max_iter": max_iter,
            "random_state": random_state,
            "alpha": alpha,
            "l1_ratio": l1_ratio,
            "verbose": verbose,
            "shuffle": shuffle,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
def __init__(self, HAll, NComp):
    HAll = np.vstack(HAll)
    # Fit the NMF model
    nmf_model = NMF(n_components=NComp, random_state=1, alpha=.1, l1_ratio=.5)
    nmf_model.fit(HAll)
    self.U = nmf_model.transform(HAll)
    self.L = nmf_model.components_
def plot_nmf_illustration():
    rnd = np.random.RandomState(5)
    X_ = rnd.normal(size=(300, 2))
    # Add 8 to make sure every point lies in the positive part of the space
    X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2) + 8

    nmf = NMF(random_state=0, n_components=2)
    nmf.fit(X_blob)
    X_nmf = nmf.transform(X_blob)

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    axes[0].set_xlim([0, 12])
    axes[0].set_ylim([0, 12])
    axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0,
                    s=60, cmap='viridis')
    axes[0].set_xlabel("feature 1")
    axes[0].set_ylabel("feature 2")
    axes[0].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1],
                  width=.1, head_width=.3, color='k')
    axes[0].arrow(0, 0, nmf.components_[1, 0], nmf.components_[1, 1],
                  width=.1, head_width=.3, color='k')
    axes[0].set_aspect('equal')
    axes[0].set_title("NMF with two components")

    # second plot
    nmf = NMF(random_state=0, n_components=1)
    nmf.fit(X_blob)

    axes[1].set_xlim([0, 12])
    axes[1].set_ylim([0, 12])
    axes[1].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0,
                    s=60, cmap='viridis')
    axes[1].set_xlabel("feature 1")
    axes[1].set_ylabel("feature 2")
    axes[1].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1],
                  width=.1, head_width=.3, color='k')
    axes[1].set_aspect('equal')
    axes[1].set_title("NMF with one component")
def nmf(DATA, nTOPICS, nWORDS):
    # ----- Topic Modelling using Non-negative Matrix Factorisation (NMF)
    # Instantiate Tfidf model
    tfidf = TfidfVectorizer(max_df=0.9, min_df=2, stop_words='english')
    # Create document-term matrix with tfidf model
    dtm = tfidf.fit_transform(DATA)
    # Instantiate NMF model
    nmf_model = NMF(n_components=nTOPICS)
    # Apply non-negative matrix factorisation to the document-term matrix
    nmf_model.fit(dtm)
    # nmf_model.transform() returns a matrix with coefficients that show how
    # much each document belongs to a topic
    topicResults = nmf_model.transform(dtm)

    # Store top nWORDS for each topic. These are the words that describe the topic.
    topics = {}
    for i, t in enumerate(nmf_model.components_):
        # Negating an array causes the highest value to be the lowest value and vice versa
        topWordsIndex = (-t).argsort()[:nWORDS]
        topics[i] = [tfidf.get_feature_names()[j] for j in topWordsIndex]
    return topicResults, topics
def df_with_clean_text(data, number):
    df = pd.read_json(data, orient='split')
    df['clean_text'] = df['Text'].apply(lambda x: clean_text2(x))

    # use tfidf, removing tokens that don't appear in at least 3 documents
    vect = TfidfVectorizer(min_df=3, stop_words='english')
    X = vect.fit_transform(df.clean_text)

    # NMF
    model = NMF(n_components=number, random_state=5)
    # Fit the model to TF-IDF
    model.fit(X)
    components_df = pd.DataFrame(model.components_,
                                 columns=vect.get_feature_names())
    print(components_df)
    # label each component row with a 1-based topic number
    components_df["Topic"] = [i + 1 for i in range(number)]
    return components_df.to_json(date_format='iso', orient='split')
def grid_search_nmf_ncomponents(tfidf, folds, low, high):
    mse_min = 99
    mse_min_ncomponents = -1
    for i in range(low, high + 1):
        print('Fitting n_components = %d ...' % i)
        mse_arr = []
        for j in range(1, folds + 1):
            print('Testing fold # %d' % j)
            test_size = 1. / folds
            A_train, A_test = tfidf_traintestsplit(tfidf, test_size=test_size)
            nmf_temp = NMF(n_components=i, random_state=1)
            nmf_temp.fit(A_train)
            W = nmf_temp.transform(A_train)
            H = nmf_temp.components_
            tfidf_pred = np.dot(W, H)
            mse_fold = mean_squared_error(A_test.toarray(), tfidf_pred)
            mse_arr.append(mse_fold)
        mse_temp = np.mean(mse_arr)
        y_mse_tts.append((i, mse_temp))  # module-level results list
        if mse_temp < mse_min:
            mse_min = mse_temp
            mse_min_ncomponents = i
        print('MSE of n_components = %d: %.10f' % (i, mse_temp))
        print('-------------------------------')
    return mse_min_ncomponents
def Quick_nmf(self, k=5, top=10, tfidf=None, print_tops=True, stop_words=[]):
    text = self.text
    labels = self.beer_names
    if tfidf is None:
        stopwords = set(list(ENGLISH_STOP_WORDS) + stop_words)
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_df=.8,
                                min_df=.2,
                                stop_words=stopwords,
                                max_features=10000)
    X = tfidf.fit_transform(text)
    bag = np.array(tfidf.get_feature_names())
    self.bag_of_words = bag
    nmf = NMF(n_components=k)
    # nmf = TruncatedSVD(n_components=k)
    nmf.fit(X)
    W = nmf.transform(X)  # (len(beers), k)
    H = nmf.components_   # (k, len(bag))
    all_words = []
    for group in range(k):
        # indices of the top `top` words for each group
        i_words = np.argsort(H[group, :])[::-1][:top]
        words = bag[i_words]
        all_words.append(words)
        i_label = np.argsort(W[:, group])[::-1][:top]
        if print_tops:
            print('-' * 10)
            print('Group:', group)
            print('WORDS')
            for word in words:
                print('-->', word)
            print('LABELS')
            for i in i_label:
                print('==>', labels[i])
    return W, H, nmf, tfidf, all_words
def fit(self, num_factors=10, l1_ratio=0.5, solver="multiplicative_update",
        init_type="random", beta_loss="frobenius"):
    print('|{}| training |'.format(self.NAME))

    assert l1_ratio >= 0 and l1_ratio <= 1, \
        "{}: l1_ratio must be between 0 and 1, provided value was {}".format(
            self.RECOMMENDER_NAME, l1_ratio)

    if solver not in self.SOLVER_VALUES:
        raise ValueError("Value for 'solver' not recognized. Acceptable values are {}, provided was '{}'".format(
            self.SOLVER_VALUES.keys(), solver))

    if init_type not in self.INIT_VALUES:
        raise ValueError("Value for 'init_type' not recognized. Acceptable values are {}, provided was '{}'".format(
            self.INIT_VALUES, init_type))

    if beta_loss not in self.BETA_LOSS_VALUES:
        raise ValueError("Value for 'beta_loss' not recognized. Acceptable values are {}, provided was '{}'".format(
            self.BETA_LOSS_VALUES, beta_loss))

    print("|{}| Computing NMF decomposition ... |".format(self.NAME))

    nmf_solver = NMF(n_components=num_factors,
                     init=init_type,
                     solver=self.SOLVER_VALUES[solver],
                     beta_loss=beta_loss,
                     random_state=None,
                     l1_ratio=l1_ratio,
                     shuffle=True,
                     verbose=True,
                     max_iter=500)

    nmf_solver.fit(self.URM_train)

    self.ITEM_factors = nmf_solver.components_.copy().T
    self.USER_factors = nmf_solver.transform(self.URM_train)

    # USER_factors is (n_users, k) and ITEM_factors is (n_items, k),
    # so the predicted-ratings matrix needs the item factors transposed
    self.r_hat = self.USER_factors.dot(self.ITEM_factors.T)

    print("|{}| Done |".format(self.NAME))
def run_nmf(X, vectorizer, n_topics=4, print_top_words=False):
    '''
    INPUT: Vectorized word array, vectorizer object, number of latent
    features to uncover, whether to print the top words from each latent feature
    OUTPUT: Saves pickled NMF model, returns latent weights matrix that can be
    concatenated with our dataset as additional features
    '''
    nmf = NMF(n_components=n_topics)
    nmf.fit(X)
    cPickle.dump(nmf, open('models/nmf.pkl', 'wb'))
    H = nmf.transform(X)
    if print_top_words:
        feature_names = vectorizer.get_feature_names()
        n_top_words = 10
        for topic_idx, topic in enumerate(nmf.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([feature_names[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]]))
            print()
    return H
def setup_nmf(all_ratings=None, engine=None, number_of_genres=10):
    print('Start loading NMF')
    # Create a sparse matrix of user, movie ids and their ratings
    # (User Movie Ratings, UMR)
    user_movie_ratings = pd.pivot_table(all_ratings, values='rating',
                                        index='userId', columns='movieId')
    # Fill the sparse matrix's NaNs with 0 to make it dense
    user_movie_id_ratings_matrix = user_movie_ratings.fillna(0)
    # Create and fit a NMF model
    m = NMF(n_components=number_of_genres)
    m.fit(user_movie_id_ratings_matrix)
    # Q: movie-genre matrix. Each genre (row) has a coefficient for each
    # movie (column). The number of genres is set by the NMF hyperparameter
    # n_components=number_of_genres.
    genre_movie_matrix = m.components_
    # P: the user-genre submatrix
    # P = m.transform(user_movie_id_ratings_matrix)
    print('Done loading NMF')
    return m, genre_movie_matrix, user_movie_id_ratings_matrix
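# Sketch: turning the factorization from setup_nmf into a dense matrix of
# predicted ratings; ratings_df is a hypothetical DataFrame with userId,
# movieId and rating columns.
m, Q, R = setup_nmf(all_ratings=ratings_df, number_of_genres=10)
P = m.transform(R)  # user-genre matrix
R_hat = P @ Q       # predicted ratings for every (user, movie) pair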
def ncimages(images, n_components=10, n_eigen=10):
    """Non-negative images: apply NMF

    Arguments:
        images {List[Image]} -- list of images

    Keyword Arguments:
        n_components {number} -- number of components (default: {10})
        n_eigen {number} -- number of eigen images (default: {10})
    """
    size = images[0].size
    mode = images[0].mode
    data = PIL_ext.tomatrix(images, 'col')
    nmf = NMF(n_components=n_components)
    nmf.fit(data)
    eigens = nmf.transform(data)
    eigens *= 256
    return [PIL_ext.toimage(eigens[:, i], size, mode=mode)
            for i in range(n_eigen)]
def extract(self, block):
    from numpy import clip, inf, percentile, asarray, where, size, prod
    from sklearn.decomposition import NMF
    from skimage.measure import label
    from skimage.morphology import remove_small_objects

    # get dimensions
    n = self.componentsPerBlock
    dims = block.shape[1:]

    # handle maximum size
    if self.maxArea == "block":
        maxArea = prod(dims) / 2
    else:
        maxArea = self.maxArea

    # reshape to be t x all spatial dimensions
    data = block.reshape(block.shape[0], -1)

    # build and apply NMF model to block
    model = NMF(n, max_iter=self.maxIter)
    model.fit(clip(data, 0, inf))

    # reconstruct sources as spatial objects in one array
    comps = model.components_.reshape((n, ) + dims)

    # convert from basis functions into shape
    # by finding connected components and removing small objects
    combined = []
    for c in comps:
        tmp = c > percentile(c, self.percentile)
        shape = remove_small_objects(label(tmp), min_size=self.minArea)
        coords = asarray(where(shape)).T
        if (size(coords) > 0) and (size(coords) < maxArea):
            combined.append(Source(coords))

    return combined
def var_embedding(a, b, df=None, n_components=2, ret_b_emb=False, log_cts=False):
    """
    Get embeddings of a wrt b.
    """
    if df is None:
        df = conf.df
    sdf = wcooc.pairs_to_cooc_sparse_df(
        df[[a, b]].itertuples(index=False, name=None))
    if log_cts:
        sdf = np.log10(sdf + 1)
    X = sdf.sparse.to_coo()
    nmf = NMF(n_components=n_components)
    # fit once so that both embeddings come from the same factorization
    W = nmf.fit_transform(X)
    emb_a = pd.DataFrame(
        nmf.components_.T,
        index=sdf.columns,
        columns=[f"n{i}" for i in range(1, n_components + 1)],
    )
    emb_a.index.name = a
    emb_a.columns.name = b
    if ret_b_emb:
        emb_b = pd.DataFrame(
            W,
            index=sdf.index,
            columns=[f"n{i}" for i in range(1, n_components + 1)],
        )
        emb_b.index.name = b
        emb_b.columns.name = a
        return emb_a, emb_b
    return emb_a
def my_cross_val_score(CZ, n_comp=3, model='pca', k=5, random_state=False,
                       shuffle=True):
    if shuffle:
        save = np.copy(CZ)
        np.random.shuffle(CZ)
    kf = KFold(n_splits=k)
    cv = []
    pca = decomposition.PCA(n_components=n_comp, random_state=random_state)
    nmf = NMF(n_components=n_comp, random_state=random_state)
    kmeans = KMeans(n_clusters=n_comp, random_state=random_state)
    ksvd = ApproximateKSVD(n_components=n_comp)
    for train, test in kf.split(CZ):
        cz_test = CZ[test]
        cz_train = CZ[train]
        if model == 'pca':
            pca.fit(cz_train)
            CZ_reconstructed = pca.inverse_transform(pca.transform(cz_test))
        elif model == 'nmf':
            nmf.fit(cz_train)
            CZ_reconstructed = np.dot(nmf.transform(cz_test), nmf.components_)
        elif model == 'k_means':
            kmeans.fit(cz_train)
            CZ_reconstructed = kmeans.cluster_centers_[kmeans.predict(cz_test)]
        elif model == 'k_svd':
            meantr = np.mean(cz_train, axis=1)[:, np.newaxis]
            meantest = np.mean(cz_test, axis=1)[:, np.newaxis]
            dictionary = ksvd.fit(cz_train - meantr).components_
            gamma = ksvd.transform(cz_test - meantest)
            CZ_reconstructed = gamma.dot(dictionary) + meantest
        cv.append(mean_squared_error(CZ_reconstructed, cz_test))
    if shuffle:
        CZ = save
    return cv
def zero_nmf_model(ratings):
    # Create a sparse matrix of all user ratings for all movies
    reviews = pd.pivot_table(ratings, 'rating', 'userid', 'movieid')
    reviews = reviews.fillna(0)
    # instantiate the NMF model
    model = NMF(n_components=42, init='random', random_state=42)
    model.fit(reviews)
    # Note: R matrix is reviews
    Q = model.components_         # movie-feature matrix
    P = model.transform(reviews)  # user-feature matrix
    # dot product of P and Q is our Rhat (R predictions):
    # one row per user, one column per movie
    Rhat = np.dot(P, Q)
    # model.reconstruction_err_ is only useful to compare against other models
    return Rhat, model
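# Sketch: reading the top-5 recommendations for one user out of Rhat,
# assuming `ratings` is the same long-format DataFrame passed to
# zero_nmf_model and numpy is imported as np.
Rhat, model = zero_nmf_model(ratings)
user_row = 0
top5_cols = np.argsort(Rhat[user_row])[::-1][:5]  # columns with the highest predicted ratings
print(top5_cols)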
def run_nmf(X, vectorizer, n_topics=4, print_top_words=False):
    '''
    INPUT: Vectorized word array, vectorizer object, number of latent
    features to uncover, whether to print the top words from each latent feature
    OUTPUT: Saves pickled NMF model, returns latent weights matrix that can be
    concatenated with our dataset as additional features
    '''
    nmf = NMF(n_components=n_topics)
    nmf.fit(X)
    cPickle.dump(nmf, open('../models/nmf.pkl', 'wb'))
    H = nmf.transform(X)
    if print_top_words:
        feature_names = vectorizer.get_feature_names()
        n_top_words = 15
        for topic_idx, topic in enumerate(nmf.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([feature_names[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]]))
            print()
    return H
def decomp(mix_met, component=0, method=2, comps=3):
    if method == 0:
        nmf = NMF(n_components=8, init='nndsvd')  # , random_state=0)
    elif method == 1:
        nmf = FastICA(n_components=comps, random_state=0)
    if method in (0, 1):
        nmf_1 = nmf.fit_transform(mix_met)
        # FastICA exposes mixing_; for NMF the analogue is components_.T
        A = nmf.mixing_ if method == 1 else nmf.components_.T
        A1 = A[:, 0].reshape(len(A[:, 0]), 1)
        A2 = A[:, 1].reshape(len(A[:, 0]), 1)
        if comps > 2:
            A3 = A[:, 2].reshape(len(A[:, 0]), 1)
    else:
        nmf = PCA(n_components=comps)
        nmf.fit(mix_met)
        A = nmf.components_
        A1 = A[0, :].reshape(len(A[0, :]), 1)
        A2 = A[1, :].reshape(len(A[1, :]), 1)
        A3 = A[2, :].reshape(len(A[2, :]), 1)
        nmf_1 = nmf.fit_transform(mix_met)
    sygnal = nmf_1[:, component]
    frst_comp = np.dot(nmf_1[:, 0].reshape(len(nmf_1), 1), A1.T)
    scnd_comp = np.dot(nmf_1[:, 1].reshape(len(nmf_1), 1), A2.T)
    if comps > 2:
        thrd_comp = np.dot(nmf_1[:, 2].reshape(len(nmf_1), 1), A3.T)
    if comps > 2 and len(nmf_1[:, 0]) < len(A1):
        return sygnal, np.array([frst_comp, scnd_comp, thrd_comp]), np.array(
            [nmf_1[:, 0], nmf_1[:, 1], nmf_1[:, 2]]), np.array([A1, A2, A3])
    elif comps > 2 and len(nmf_1[:, 0]) > len(A1):
        return sygnal, np.array([frst_comp, scnd_comp, thrd_comp]), np.array(
            [A1, A2, A3]), np.array([nmf_1[:, 0], nmf_1[:, 1], nmf_1[:, 2]])
    elif comps == 2 and len(nmf_1[:, 0]) > len(A1):
        return sygnal, np.array([frst_comp, scnd_comp]), np.array(
            [A1, A2]), np.array([nmf_1[:, 0], nmf_1[:, 1]])
    else:
        return sygnal, np.array([frst_comp, scnd_comp]), np.array(
            [nmf_1[:, 0], nmf_1[:, 1]]), np.array([A1, A2])
def nonnegative_matrix_factorization(parameters, X_train, X_val, y_train,
                                     characterize=False):
    vec = StemmedTfidfVectorizer(**filter_parameters(parameters, 'vec'))
    vec.fit(X_train, y_train)
    X_train = vec.transform(X_train)
    X_val = vec.transform(X_val)

    nmf = NMF(random_state=0, **filter_parameters(parameters, 'nmf'))
    nmf.fit(X_train, y_train)
    X_train = nmf.transform(X_train)
    X_val = nmf.transform(X_val)

    if characterize:
        characterize_topics(nmf, vec.get_feature_names())

    model = parameters['model'](**filter_parameters(parameters, 'clf'))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return model, y_pred