def featureSelectAndParamTuning(nmf_components):
    """Reduce ``data`` with NMF, then grid-search an XGBoost classifier on it.

    The function reads the module-level ``data`` (feature matrix) and ``y``
    (labels); they are not passed in as arguments.

    Steps:
      1. Factorise ``data`` with NMF using ``50 * nmf_components`` components
         (random init, ``random_state=0``).
      2. Hold out 20% of the factorised samples (``random_state=42``).
      3. Grid-search ``colsample_bylevel`` for an ``XGBClassifier`` with a
         shuffled 10-fold ``StratifiedKFold`` (``random_state=7``), using all
         available cores (``n_jobs=-1``).
      4. Print the best estimator, its CV score, the hold-out accuracy and
         the elapsed time.

    Parameters
    ----------
    nmf_components : int
        Multiplier for the number of NMF components; the factorisation uses
        ``50 * nmf_components`` components.

    Returns
    -------
    None
        All results are reported through ``print``.
    """
    n_components = 50 * nmf_components
    # Single-argument print(...) calls produce the same text on Python 2 and
    # Python 3; the former print statements were a SyntaxError on Python 3.
    print("trying with NMF n_components= %s" % n_components)
    model = NMF(n_components=n_components, init='random', random_state=0,
                verbose=0)
    boost_input = model.fit_transform(data, y=y)
    print("Shape of NMF outPut is: %s" % (boost_input.shape,))

    X_train, X_test, y_train, y_test = train_test_split(
        boost_input, y, test_size=0.2, random_state=42)

    model = XGBClassifier(silent=True, seed=42)
    # Only colsample_bylevel is searched. Other knobs that were tried at some
    # point and then disabled: subsample, scale_pos_weight, colsample_bytree.
    param_grid = {
        'colsample_bylevel': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    }

    # The timer covers the grid search and the hold-out prediction only,
    # not the NMF factorisation above.
    start_time = timeit.default_timer()
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
    grid_search = GridSearchCV(model, param_grid, n_jobs=-1, cv=kfold)
    grid_result = grid_search.fit(X_train, y_train)

    # Report the estimator refitted with the best parameters.
    print(grid_result.best_estimator_)
    # NOTE(review): '%3f' means "minimum width 3, default precision";
    # '%.3f' may have been intended. Left unchanged to keep the output stable.
    print('CV Accuracy of best parameters: %3f' % grid_result.best_score_)

    model = grid_result.best_estimator_
    y_pred = model.predict(X_test)
    print("Accuracy Rate, which is calculated by accuracy_score() is: %f"
          % accuracy_score(y_test, y_pred))

    elapsed = timeit.default_timer() - start_time
    print(elapsed)
# Vectorise the documents with TF-IDF, normalise, and densify so that the
# estimators and the scatter plots below can index columns directly.
tf_idf = tf_idf_vectorizor.fit_transform(tf_data)
tf_idf_norm = normalize(tf_idf)
tf_idf_array = tf_idf_norm.toarray()

# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); support both so the preview works either way.
if hasattr(tf_idf_vectorizor, "get_feature_names_out"):
    feature_names = tf_idf_vectorizor.get_feature_names_out()
else:
    feature_names = tf_idf_vectorizor.get_feature_names()
pd.DataFrame(tf_idf_array, columns=feature_names).head()

# Fit a 4-cluster KMeans and a 4-component NMF on the same matrix.
KM = KMeans(4)
NNMF = NMF(4)
fittedKM = KM.fit(tf_idf_array)
fittedNNMF = NNMF.fit(tf_idf_array)
featuresNNMF = NNMF.transform(tf_idf_array)
print(featuresNNMF)

predictionKM = KM.predict(tf_idf_array)
# NMF is a decomposition and has no predict(); the previous call raised
# AttributeError. Assign each document to its strongest component instead.
predictionNNMF = featuresNNMF.argmax(axis=1)

# Documents coloured by KMeans cluster, viewed in feature columns 2 and 3.
plt.scatter(tf_idf_array[:, 2],
            tf_idf_array[:, 3],
            c=predictionKM,
            s=50,
            cmap='viridis')

# `fitted` was never defined (NameError); the fitted KMeans is `fittedKM`.
centers2 = fittedKM.cluster_centers_
# Draw the centres in the same two feature columns as the points above;
# columns 0/1 were used before, which put them in a different plane.
plt.scatter(centers2[:, 2], centers2[:, 3], c='black', s=300, alpha=0.6)

# Candidate models for comparing cluster counts 1..6.
number_clusters = range(1, 7)
kmeans = [KMeans(n_clusters=i, max_iter=600) for i in number_clusters]
kmeans
# Report the validation accuracy recorded for the last epoch.
final_val_accuracy = hist.history['val_accuracy'][-1]
print('val acc: ', final_val_accuracy)

# %matplotlib inline
import matplotlib.pyplot as plt

# Validation-loss curve over the training epochs.
val_loss_per_epoch = hist.history['val_loss']
plt.plot(val_loss_per_epoch)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

# - Model evaluation (Test Model)
test_inputs = [test_df.userId, test_df.movieId]
test_loss = model.evaluate(test_inputs, test_df.rating)
print('test loss: ', test_loss)

# - ## Prediction using the trained model
pd.options.display.float_format = '{:.2f}'.format  # set output formatting

# Notebook-style inspection expressions (results are not stored).
ratings_df[(ratings_df['userId'] == 249) & (ratings_df['movieId'] == 70)]
movies_df['movieId'].head(575)
ratings_df.loc[7000]

userId = 31  # 1 ~ 610
# 1 ~ 193609; ids are sparse and not every one has a match in ratings_df
movieId = 165

# Look up the title of the chosen movie (first matching row).
matching_movies = movies_df[movies_df['movieId'] == movieId]
movie_title = list(matching_movies.title)[0]

# Map the raw ids to model indices and add a leading axis of length 1.
user_v = np.expand_dims(userid2idx[userId], 0)
movie_v = np.expand_dims(movieid2idx[movieId], 0)
predict = model.predict([user_v, movie_v])

predicted_rating = predict[0][0]
print('영화 {} 에 대한 사용자 ID {}님의 예상 별점은 {:.1f} 입니다.'.format(
    movie_title, userId, predicted_rating))
class mlexplorer:
    """use machine learning algorithms from scikit learn to explore spectroscopic datasets

    Performs automatic scaling and train/test split before NMF or PCA fit.

    The train/test split and the scaler are set up once, in the constructor,
    from the keyword arguments. Changing ``test_size``, ``scaler`` or
    ``X_test`` on an existing instance therefore has no effect; pass them to
    the constructor instead. ``algorithm``, ``nb_compo`` and ``scaling`` are
    read when ``fit()`` is called and may be changed beforehand.

    Attributes
    ----------
    x : {array-like, sparse matrix}, shape = (n_samples, n_features)
        Spectra; n_features = n_frequencies.
    X_test : {array-like, sparse matrix}, shape = (n_samples, n_features)
        spectra organised in rows (1 row = one spectrum) that you want to use
        as a testing dataset. Those spectra should not be present in the x
        (training) dataset. The spectra should share a common X axis.
    algorithm : String,
        "PCA", "NMF", default = "PCA"
    scaling : Bool
        True or False. If True, data will be scaled prior to fitting (see below),
    scaler : String
        the type of scaling performed. Choose between MinMaxScaler or
        StandardScaler, see
        http://scikit-learn.org/stable/modules/preprocessing.html for details.
        Default = "MinMaxScaler".
    test_size : float
        the fraction of the dataset to use as a testing dataset; only used if
        X_test is not provided. Default = 0.3.
    rand_state : int
        the random seed that is used for reproducibility of the train/test
        split. Default = 42.
    nb_compo : int
        number of components of the decomposition; set through the
        ``n_components`` keyword argument. Default = 2.
    model : Scikit learn model
        A Scikit Learn object model, see scikit learn library documentation.
        Only available after ``fit()`` has been called.

    Remarks
    -------

    For details on hyperparameters of each algorithms, please directly consult
    the documentation of SciKit Learn at: http://scikit-learn.org/stable/

    Results for machine learning algorithms can vary from run to run. A way to
    solve that is to fix the random_state.

    Example
    -------

    Given an array X of n samples by m frequencies

    >>> explo = rampy.mlexplorer(X, algorithm="NMF", n_components=2,
    ...                          test_size=0.3, scaler="MinMaxScaler")
    >>> explo.fit() # fitting!
    >>> W = explo.model.transform(explo.X_train_sc) # getting the mixture array
    >>> H = explo.X_scaler.inverse_transform(explo.model.components_) # components in the original space
    >>> plt.plot(H.T) # plot the two components

    """

    def __init__(self, x, **kwargs):
        """
        Parameters
        ----------
        x : array{Float64}
            the spectra organised in rows (1 row = one spectrum). The spectra
            should share a common X axis.
        **kwargs
            Optional ``X_test``, ``algorithm``, ``test_size``, ``scaling``,
            ``scaler``, ``rand_state`` and ``n_components``; see the class
            docstring for their meaning and defaults.

        Raises
        ------
        ValueError
            If ``X_test`` does not have the same number of columns as ``x``,
            or if ``scaler`` is neither "MinMaxScaler" nor "StandardScaler".
        """
        self.x = x
        #
        # Kwargs extractions
        #
        self.X_test = kwargs.get("X_test", None)
        self.algorithm = kwargs.get("algorithm", "PCA")
        self.test_size = kwargs.get("test_size", 0.3)
        self.scaling = kwargs.get("scaling", True)
        self.scaler = kwargs.get("scaler", "MinMaxScaler")
        self.rand_state = kwargs.get("rand_state", 42)
        self.nb_compo = kwargs.get("n_components", 2)

        # No test set given: split x. The one-element 1-D value [0.0] used to
        # be the "not provided" default and is still honoured, but a genuine
        # 2-D test set with a single spectrum is no longer mistaken for it.
        legacy_sentinel = (self.X_test is not None
                           and np.ndim(self.X_test) == 1
                           and len(self.X_test) == 1)
        if self.X_test is None or legacy_sentinel:
            self.X_train, self.X_test = sklearn.model_selection.train_test_split(
                self.x, test_size=self.test_size, random_state=self.rand_state)
        else:
            if not hasattr(self.X_test, "shape"):
                self.X_test = np.asarray(self.X_test)
            same_width = (len(self.X_test.shape) == 2
                          and self.X_test.shape[1] == self.x.shape[1])
            if not same_width:
                # This exception used to be built but never raised.
                raise ValueError(
                    "You tried to provide a testing dataset that has a different number of features (in columns) than the training set. Please correct this."
                )
            self.X_train = np.copy(self.x)

        # initialising the preprocessor scaler
        if self.scaler == "StandardScaler":
            self.X_scaler = sklearn.preprocessing.StandardScaler()
        elif self.scaler == "MinMaxScaler":
            self.X_scaler = sklearn.preprocessing.MinMaxScaler()
        else:
            raise ValueError(
                "Choose the scaler between MinMaxScaler and StandardScaler")

        # fitting scaler
        self.X_scaler.fit(self.X_train)

        # scaling the data in all cases, it may not be used during the fit later
        self.X_train_sc = self.X_scaler.transform(self.X_train)
        self.X_test_sc = self.X_scaler.transform(self.X_test)

    def fit(self):
        """Create the model for ``self.algorithm`` and train it.

        Do not forget to tune the hyperparameters.

        Raises
        ------
        ValueError
            If ``self.algorithm`` is neither "PCA" nor "NMF".
        """
        if self.algorithm == "PCA":
            self.model = PCA(n_components=self.nb_compo)
        elif self.algorithm == "NMF":
            self.model = NMF(n_components=self.nb_compo, init="nndsvd")
        else:
            raise ValueError(
                'algorithm must be "PCA" or "NMF", got %r' % (self.algorithm,))

        self.refit()

    def refit(self):
        """Train the existing ``self.model`` again without re-creating it.

        Uses the scaled training data when ``self.scaling`` is True and the
        raw training data otherwise. Do not forget to tune the hyperparameters.
        """
        # `== True` is kept on purpose: values such as "yes"/"no" have always
        # been treated as "no scaling".
        if self.scaling == True:
            self.model.fit(self.X_train_sc)
        else:
            self.model.fit(self.X_train)

    def predict(self, X):
        """Apply the fitted model to new spectra.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array
            ``self.model.predict(...)`` when the model provides ``predict``;
            otherwise ``self.model.transform(...)``, which is what PCA and
            NMF offer.

        Remark
        ------
        if self.scaling == True, X is scaled with ``self.X_scaler`` first.
        The result is returned as produced by the model: this class has no
        target scaler, so nothing is inverse-transformed.
        """
        if self.scaling == True:
            X_in = self.X_scaler.transform(X)
        else:
            # Use the argument; the old code read a non-existent self.X here.
            X_in = X

        if hasattr(self.model, "predict"):
            return self.model.predict(X_in)
        return self.model.transform(X_in)
class Topicmodel():
    '''
    Wrapper class for different topic models

    Supported ``modeltype`` values:
      'kmeans'      KMeans on the tf-idf matrix
      'kpcakmeans'  RBF KernelPCA followed by KMeans
      'nmf'         NMF; a document is assigned to its strongest component

    The bag-of-words pipeline is loaded from
    ``<folder>/BoW_transformer.pickle`` and must be a mapping with the keys
    'count_vectorizer' and 'tfidf_transformer'.
    '''

    def __init__(self, folder='model', modeltype='kmeans', topics=100,
                 topwords=10):
        '''
        loads the BoW transformer and creates the (untrained) topic model

        INPUT
        folder      directory holding BoW_transformer.pickle; the topic
                    statistics written by fit() are stored here as well
        modeltype   'kmeans', 'kpcakmeans' or 'nmf'
        topics      number of clusters / components
        topwords    number of top words kept per topic

        Raises ValueError for an unknown modeltype.
        '''
        # Local import so the class works on Python 2 (cPickle) and 3 (pickle).
        try:
            import cPickle as pickle
        except ImportError:
            import pickle

        # the classifier, which also contains the trained BoW transformer
        # NOTE(security): unpickling can execute arbitrary code; only load
        # pickle files from a trusted source.
        with open(folder + '/BoW_transformer.pickle', 'rb') as handle:
            self.bow = pickle.load(handle)
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics
        self.topwords = topwords

        # Strings are compared with ==; `is` tests object identity and only
        # matched by accident of string interning.
        if self.modeltype == 'kmeans':
            from sklearn.cluster import KMeans
            self.model = KMeans(n_clusters=topics, n_init=50)
        elif self.modeltype == 'kpcakmeans':
            from sklearn.cluster import KMeans
            from sklearn.decomposition import KernelPCA
            self.model = {'kpca': KernelPCA(kernel='rbf', gamma=.1),
                          'kmeans': KMeans(n_clusters=topics, n_init=50)}
        elif self.modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)
        else:
            raise ValueError('Unknown modeltype: %r' % (modeltype,))

    def fit(self, X):
        '''
        fits a topic model

        Refits the count vectorizer and tf-idf transformer on X, trains the
        model, stores per-topic statistics in self.topicstats, prints one
        summary line per topic and writes the statistics as JSON to
        ``<folder>/topicmodel-<modeltype>-<timestamp>.json``.

        INPUT
        X   list of strings
        '''

        # transform list of strings into sparse BoW matrix
        counts = self.bow['count_vectorizer'].fit_transform(X)
        X = self.bow['tfidf_transformer'].fit_transform(counts)

        # transform word to BoW index into reverse lookup table
        vocabulary = self.bow['count_vectorizer'].vocabulary_
        self.idx2word = dict((idx, word) for word, idx in vocabulary.items())

        # depending on the model, train
        if self.modeltype == 'kmeans':
            Xc = self.model.fit_predict(X)
        elif self.modeltype == 'kpcakmeans':
            Xc = self.model['kpca'].fit_transform(X)
            Xc = self.model['kmeans'].fit_predict(Xc)
        elif self.modeltype == 'nmf':
            # fit_transform gives one row per document and one column per
            # topic, so the per-document topic is the argmax along axis 1
            # (axis 0 picked the top document per topic instead).
            Xc = self.model.fit_transform(X).argmax(axis=1)
        else:
            raise ValueError('Unknown modeltype: %r' % (self.modeltype,))

        # for each cluster/topic compute covariance of word with cluster label
        # this measure is indicative of the importance of the word for the topic
        self.topicstats = []
        for cluster in range(self.topics):
            # this is a binary vector, true if a data point was in this cluster
            y = (Xc == cluster).astype(float)
            # this is the covariance of the data with the cluster label
            Xcov = X.T.dot(y)
            # find the most strongly covarying (with the cluster label) words,
            # strongest first
            top_idx = Xcov.argsort()[-self.topwords:][::-1]
            ranked = [(self.idx2word[idx], Xcov[idx]) for idx in top_idx]
            topicwords = dict(ranked)
            self.topicstats.append({'assignments': y.sum(),
                                    'clusterid': cluster,
                                    'words': topicwords})
            # Words are printed in ranked order, taken from the list rather
            # than from the dict, whose order is arbitrary on Python 2.
            print('Topic %d: %3d Assignments ' % (cluster, y.sum())
                  + 'Topwords: ' + ' '.join([word for word, _ in ranked[:10]]))

        datestr = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        fn = self.folder + '/topicmodel-%s-' % self.modeltype + datestr + '.json'
        print("Saving model stats to " + fn)
        # json.dumps returns text, so the file is opened in text mode and
        # closed deterministically.
        with open(fn, 'w') as handle:
            handle.write(json.dumps(self.topicstats))

    def predict(self, X):
        '''
        predicts cluster assignment from list of strings

        INPUT
        X   list of strings (a single string is accepted and wrapped)

        Returns one cluster/topic index per input string.
        '''
        # `X is not list` compared against the type object and was always
        # true, so lists were wrapped a second time.
        if not isinstance(X, (list, tuple)):
            X = [X]
        X = self.bow['tfidf_transformer'].transform(
            self.bow['count_vectorizer'].transform(X))
        if self.modeltype == 'kmeans':
            return self.model.predict(X)
        if self.modeltype == 'kpcakmeans':
            return self.model['kmeans'].predict(
                self.model['kpca'].transform(X))
        if self.modeltype == 'nmf':
            # one row per document: strongest component per row
            return self.model.transform(X).argmax(axis=1)
        raise ValueError('Unknown modeltype: %r' % (self.modeltype,))
class mlexplorer:
    """use machine learning algorithms from scikit learn to explore spectroscopic datasets

    Performs automatic scaling and train/test split before NMF or PCA fit.

    The train/test split and the scaler are set up once, in the constructor,
    from the keyword arguments. Changing ``test_size``, ``scaler`` or
    ``X_test`` on an existing instance therefore has no effect; pass them to
    the constructor instead. ``algorithm``, ``nb_compo`` and ``scaling`` are
    read when ``fit()`` is called and may be changed beforehand.

    Attributes
    ----------
    x : {array-like, sparse matrix}, shape = (n_samples, n_features)
        Spectra; n_features = n_frequencies.
    X_test : {array-like, sparse matrix}, shape = (n_samples, n_features)
        spectra organised in rows (1 row = one spectrum) that you want to use
        as a testing dataset. Those spectra should not be present in the x
        (training) dataset. The spectra should share a common X axis.
    algorithm : String,
        "PCA", "NMF", default = "PCA"
    scaling : Bool
        True or False. If True, data will be scaled prior to fitting (see below),
    scaler : String
        the type of scaling performed. Choose between MinMaxScaler or
        StandardScaler, see
        http://scikit-learn.org/stable/modules/preprocessing.html for details.
        Default = "MinMaxScaler".
    test_size : float
        the fraction of the dataset to use as a testing dataset; only used if
        X_test is not provided. Default = 0.3.
    rand_state : int
        the random seed that is used for reproducibility of the train/test
        split. Default = 42.
    nb_compo : int
        number of components of the decomposition; set through the
        ``n_components`` keyword argument. Default = 2.
    model : Scikit learn model
        A Scikit Learn object model, see scikit learn library documentation.
        Only available after ``fit()`` has been called.

    Remarks
    -------

    For details on hyperparameters of each algorithms, please directly consult
    the documentation of SciKit Learn at: http://scikit-learn.org/stable/

    Results for machine learning algorithms can vary from run to run. A way to
    solve that is to fix the random_state.

    Example
    -------

    Given an array X of n samples by m frequencies

    >>> explo = rampy.mlexplorer(X, algorithm="NMF", n_components=2,
    ...                          test_size=0.3, scaler="MinMaxScaler")
    >>> explo.fit() # fitting!
    >>> W = explo.model.transform(explo.X_train_sc) # getting the mixture array
    >>> H = explo.X_scaler.inverse_transform(explo.model.components_) # components in the original space
    >>> plt.plot(H.T) # plot the two components

    """

    def __init__(self, x, **kwargs):
        """
        Parameters
        ----------
        x : array{Float64}
            the spectra organised in rows (1 row = one spectrum). The spectra
            should share a common X axis.
        **kwargs
            Optional ``X_test``, ``algorithm``, ``test_size``, ``scaling``,
            ``scaler``, ``rand_state`` and ``n_components``; see the class
            docstring for their meaning and defaults.

        Raises
        ------
        ValueError
            If ``X_test`` does not have the same number of columns as ``x``,
            or if ``scaler`` is neither "MinMaxScaler" nor "StandardScaler".
        """
        self.x = x
        #
        # Kwargs extractions
        #
        self.X_test = kwargs.get("X_test", None)
        self.algorithm = kwargs.get("algorithm", "PCA")
        self.test_size = kwargs.get("test_size", 0.3)
        self.scaling = kwargs.get("scaling", True)
        self.scaler = kwargs.get("scaler", "MinMaxScaler")
        self.rand_state = kwargs.get("rand_state", 42)
        self.nb_compo = kwargs.get("n_components", 2)

        # No test set given: split x. The one-element 1-D value [0.0] used to
        # be the "not provided" default and is still honoured, but a genuine
        # 2-D test set with a single spectrum is no longer mistaken for it.
        legacy_sentinel = (self.X_test is not None
                           and np.ndim(self.X_test) == 1
                           and len(self.X_test) == 1)
        if self.X_test is None or legacy_sentinel:
            self.X_train, self.X_test = sklearn.model_selection.train_test_split(
                self.x, test_size=self.test_size, random_state=self.rand_state)
        else:
            if not hasattr(self.X_test, "shape"):
                self.X_test = np.asarray(self.X_test)
            same_width = (len(self.X_test.shape) == 2
                          and self.X_test.shape[1] == self.x.shape[1])
            if not same_width:
                # This exception used to be built but never raised.
                raise ValueError(
                    "You tried to provide a testing dataset that has a different number of features (in columns) than the training set. Please correct this."
                )
            self.X_train = np.copy(self.x)

        # initialising the preprocessor scaler
        if self.scaler == "StandardScaler":
            self.X_scaler = sklearn.preprocessing.StandardScaler()
        elif self.scaler == "MinMaxScaler":
            self.X_scaler = sklearn.preprocessing.MinMaxScaler()
        else:
            raise ValueError(
                "Choose the scaler between MinMaxScaler and StandardScaler")

        # fitting scaler
        self.X_scaler.fit(self.X_train)

        # scaling the data in all cases, it may not be used during the fit later
        self.X_train_sc = self.X_scaler.transform(self.X_train)
        self.X_test_sc = self.X_scaler.transform(self.X_test)

    def fit(self):
        """Create the model for ``self.algorithm`` and train it.

        Do not forget to tune the hyperparameters.

        Raises
        ------
        ValueError
            If ``self.algorithm`` is neither "PCA" nor "NMF".
        """
        if self.algorithm == "PCA":
            self.model = PCA(n_components=self.nb_compo)
        elif self.algorithm == "NMF":
            self.model = NMF(n_components=self.nb_compo, init="nndsvd")
        else:
            raise ValueError(
                'algorithm must be "PCA" or "NMF", got %r' % (self.algorithm,))

        self.refit()

    def refit(self):
        """Train the existing ``self.model`` again without re-creating it.

        Uses the scaled training data when ``self.scaling`` is True and the
        raw training data otherwise. Do not forget to tune the hyperparameters.
        """
        # `== True` is kept on purpose: values such as "yes"/"no" have always
        # been treated as "no scaling".
        if self.scaling == True:
            self.model.fit(self.X_train_sc)
        else:
            self.model.fit(self.X_train)

    def predict(self, X):
        """Apply the fitted model to new spectra.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array
            ``self.model.predict(...)`` when the model provides ``predict``;
            otherwise ``self.model.transform(...)``, which is what PCA and
            NMF offer.

        Remark
        ------
        if self.scaling == True, X is scaled with ``self.X_scaler`` first.
        The result is returned as produced by the model: this class has no
        target scaler, so nothing is inverse-transformed.
        """
        if self.scaling == True:
            X_in = self.X_scaler.transform(X)
        else:
            # Use the argument; the old code read a non-existent self.X here.
            X_in = X

        if hasattr(self.model, "predict"):
            return self.model.predict(X_in)
        return self.model.transform(X_in)
# Marker/line style for this model's cross-validation curve.
color = 'm->'

W = data_column
sz = W.shape

# The first 20% of the rows are used for training, the rest for testing.
split_at = int(sz[0] * 0.2)
train, test = W[:split_at], W[split_at:]

X = train
y = class_column[:split_at]
test_X = test
test_Y = class_column[split_at:]

# Fit on the training slice and predict the held-out rows.
model = model.fit(X, y)
y_hat = model.predict(test_X)

# 10-fold cross-validation on the training slice; one score per fold.
scores = cross_val_score(model, X, y, cv=10)
dim = np.arange(len(scores))
ax.plot(scores, color, mfc='none')
#ax.tight_layout()

#print ("Accuracy Rate, which is calculated by accuracy_score() is: %f" % accuracy_score(test_Y, y_hat))
#print ("Hamming loss: ", hamming_loss(test_Y, y_hat))
#print ("Average precision score:", precision_score(test_Y, y_hat, average='macro'))
#print ("Confusion matrix:\n", confusion_matrix(test_Y, y_hat))

# Axis decoration; the legend lists all five curves drawn on this axis.
ax.legend(['LR', 'LDA', 'NB', 'MLP', 'SVM'])
ax.set_xlabel('Iteration')
ax.set_ylabel('Precision')
ax.set_xticks(dim)
class Topicmodel():
    '''
    Wrapper class for different topic models

    Supported ``modeltype`` values:
      'kmeans'      KMeans on the tf-idf matrix
      'kpcakmeans'  RBF KernelPCA followed by KMeans
      'nmf'         NMF; a document is assigned to its strongest component

    The bag-of-words pipeline is loaded from
    ``<folder>/BoW_transformer.pickle`` and must be a mapping with the keys
    'count_vectorizer' and 'tfidf_transformer'.
    '''

    def __init__(self, folder='model', modeltype='kmeans', topics=100,
                 topwords=10):
        '''
        loads the BoW transformer and creates the (untrained) topic model

        INPUT
        folder      directory holding BoW_transformer.pickle; the topic
                    statistics written by fit() are stored here as well
        modeltype   'kmeans', 'kpcakmeans' or 'nmf'
        topics      number of clusters / components
        topwords    number of top words kept per topic

        Raises ValueError for an unknown modeltype.
        '''
        # Local import so the class works on Python 2 (cPickle) and 3 (pickle).
        try:
            import cPickle as pickle
        except ImportError:
            import pickle

        # the classifier, which also contains the trained BoW transformer
        # NOTE(security): unpickling can execute arbitrary code; only load
        # pickle files from a trusted source.
        with open(folder + '/BoW_transformer.pickle', 'rb') as handle:
            self.bow = pickle.load(handle)
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics
        self.topwords = topwords

        # Strings are compared with ==; `is` tests object identity and only
        # matched by accident of string interning.
        if self.modeltype == 'kmeans':
            from sklearn.cluster import KMeans
            self.model = KMeans(n_clusters=topics, n_init=50)
        elif self.modeltype == 'kpcakmeans':
            from sklearn.cluster import KMeans
            from sklearn.decomposition import KernelPCA
            self.model = {'kpca': KernelPCA(kernel='rbf', gamma=.1),
                          'kmeans': KMeans(n_clusters=topics, n_init=50)}
        elif self.modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)
        else:
            raise ValueError('Unknown modeltype: %r' % (modeltype,))

    def fit(self, X):
        '''
        fits a topic model

        Refits the count vectorizer and tf-idf transformer on X, trains the
        model, stores per-topic statistics in self.topicstats, prints one
        summary line per topic and writes the statistics as JSON to
        ``<folder>/topicmodel-<modeltype>-<timestamp>.json``.

        INPUT
        X   list of strings
        '''

        # transform list of strings into sparse BoW matrix
        counts = self.bow['count_vectorizer'].fit_transform(X)
        X = self.bow['tfidf_transformer'].fit_transform(counts)

        # transform word to BoW index into reverse lookup table
        vocabulary = self.bow['count_vectorizer'].vocabulary_
        self.idx2word = dict((idx, word) for word, idx in vocabulary.items())

        # depending on the model, train
        if self.modeltype == 'kmeans':
            Xc = self.model.fit_predict(X)
        elif self.modeltype == 'kpcakmeans':
            Xc = self.model['kpca'].fit_transform(X)
            Xc = self.model['kmeans'].fit_predict(Xc)
        elif self.modeltype == 'nmf':
            # fit_transform gives one row per document and one column per
            # topic, so the per-document topic is the argmax along axis 1
            # (axis 0 picked the top document per topic instead).
            Xc = self.model.fit_transform(X).argmax(axis=1)
        else:
            raise ValueError('Unknown modeltype: %r' % (self.modeltype,))

        # for each cluster/topic compute covariance of word with cluster label
        # this measure is indicative of the importance of the word for the topic
        self.topicstats = []
        for cluster in range(self.topics):
            # this is a binary vector, true if a data point was in this cluster
            y = (Xc == cluster).astype(float)
            # this is the covariance of the data with the cluster label
            Xcov = X.T.dot(y)
            # find the most strongly covarying (with the cluster label) words,
            # strongest first
            top_idx = Xcov.argsort()[-self.topwords:][::-1]
            ranked = [(self.idx2word[idx], Xcov[idx]) for idx in top_idx]
            topicwords = dict(ranked)
            self.topicstats.append({'assignments': y.sum(),
                                    'clusterid': cluster,
                                    'words': topicwords})
            # Words are printed in ranked order, taken from the list rather
            # than from the dict, whose order is arbitrary on Python 2.
            print('Topic %d: %3d Assignments ' % (cluster, y.sum())
                  + 'Topwords: ' + ' '.join([word for word, _ in ranked[:10]]))

        datestr = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        fn = self.folder + '/topicmodel-%s-' % self.modeltype + datestr + '.json'
        print("Saving model stats to " + fn)
        # json.dumps returns text, so the file is opened in text mode and
        # closed deterministically.
        with open(fn, 'w') as handle:
            handle.write(json.dumps(self.topicstats))

    def predict(self, X):
        '''
        predicts cluster assignment from list of strings

        INPUT
        X   list of strings (a single string is accepted and wrapped)

        Returns one cluster/topic index per input string.
        '''
        # `X is not list` compared against the type object and was always
        # true, so lists were wrapped a second time.
        if not isinstance(X, (list, tuple)):
            X = [X]
        X = self.bow['tfidf_transformer'].transform(
            self.bow['count_vectorizer'].transform(X))
        if self.modeltype == 'kmeans':
            return self.model.predict(X)
        if self.modeltype == 'kpcakmeans':
            return self.model['kmeans'].predict(
                self.model['kpca'].transform(X))
        if self.modeltype == 'nmf':
            # one row per document: strongest component per row
            return self.model.transform(X).argmax(axis=1)
        raise ValueError('Unknown modeltype: %r' % (self.modeltype,))