import os

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from dbn.models import UnsupervisedDBN


def main():
    # 0. Read the data and split it into 80% for training and 20% for testing.
    items = pd.read_csv('input/items.csv', sep=';', encoding='ISO-8859-1')
    print(items.shape)
    items_train, items_test = train_test_split(items, train_size=0.8,
                                               random_state=0)
    print(items_train.shape, items_test.shape)

    # 1. Train the tf-idf model and save it, together with the resulting
    #    item-feature matrix, under model/tfidf_model.pickle.
    if not os.path.isfile('model/tfidf_model.pickle'):
        print('training tf-idf model ...')
        tfidf_model = TfidfVectorizer(norm='l2', min_df=0, use_idf=True,
                                      max_features=5000, smooth_idf=False,
                                      sublinear_tf=True, tokenizer=tokeniser)
        item_feature_matrix = tfidf_model.fit_transform(
            items_train['movie desription'].values.astype('U'))
        print('#1. dimension of the item-feature matrix',
              item_feature_matrix.shape)

        # 1.1. Save the tf-idf model.
        print('Saving tf-idf model ...')
        save_model('model/tfidf_model.pickle', tfidf_model)

        if not os.path.isfile('result/item_feature_matrix.pickle'):
            # 1.2. Save the tf-idf matrix result.
            print('Saving tf-idf matrix result ...')
            save_model('result/item_feature_matrix.pickle',
                       item_feature_matrix)

    # 2. Train the DBN model and save it into model/dbn-model.pkl.
    # 2.1. Load the tf-idf result.
    print('loading item feature matrix ...')
    item_feature_matrix = load_model('result/item_feature_matrix.pickle')

    if not os.path.isfile('model/dbn-model.pkl'):
        dbn = UnsupervisedDBN(hidden_layers_structure=[5000, 400],
                              batch_size=10,
                              learning_rate_rbm=0.06,
                              n_epochs_rbm=20,
                              activation_function='sigmoid')
        # 2.2. Fit the DBN model on the dense tf-idf matrix.
        dbn.fit(item_feature_matrix.A)
        # 2.3. Save the DBN model.
        print('saving DBN model ...')
        dbn.save('model/dbn-model.pkl')

    print('Loading DBN model')
    dbn = UnsupervisedDBN.load('model/dbn-model.pkl')

    # 3. Cluster with k-means and save the model and results.
    if not os.path.isfile('model/kmeans-model.pkl'):
        kmeans = KMeans(n_clusters=5, random_state=0).fit(
            dbn.transform(item_feature_matrix.A))
        print('saving k-means model ...')
        save_model('model/kmeans-model.pkl', kmeans)
    else:
        kmeans = load_model('model/kmeans-model.pkl')
    print(kmeans.labels_)
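# The script above calls three helpers that are not defined in this file:
# save_model, load_model, and tokeniser. A minimal sketch, assuming plain
# pickle persistence and a simple regex tokenizer (hypothetical: the project's
# actual implementations may differ):
import pickle
import re


def save_model(path, obj):
    """Persist any Python object (vectorizer, matrix, estimator) via pickle."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def load_model(path):
    """Inverse of save_model: unpickle and return the object stored at path."""
    with open(path, 'rb') as f:
        return pickle.load(f)


def tokeniser(text):
    """Hypothetical tokenizer: lowercase the text, keep alphabetic tokens."""
    return re.findall(r'[a-z]+', text.lower())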
import io
import pickle

from dbn.models import UnsupervisedDBN

# X_train and X_test are assumed to have been prepared earlier in the script.
print('X_test size : {0} \n'.format(X_test.shape))

# Models we will use
dbn = UnsupervisedDBN(hidden_layers_structure=[256, 512],
                      batch_size=32,
                      learning_rate_rbm=0.06,
                      learning_rate_backprop=1e-3,
                      n_epochs_rbm=50,
                      n_epochs_fine_tune=500,
                      activation_function='sigmoid',
                      contrastive_divergence_iter=1)

###############################################################################
# Training RBM-Logistic Pipeline
dbn.fit(X_train)

# Save the layer-wise training errors, one pickle file per RBM layer.
for index, layer_wise_error in enumerate(dbn.layer_wise_error):
    with io.open("layer_" + str(index), 'wb') as f:
        pickle.dump(layer_wise_error, f)

# Fine-tune the DBN using the reconstruction MSE (over pixels).
recon_error_test, recon_error_train = dbn.fine_tune(X_train, X_test)

# Save the fine-tuning reconstruction errors.
with io.open("test_recon_finetune", 'wb') as f:
    pickle.dump(recon_error_test, f)
with io.open("train_recon_finetune", 'wb') as f:
    pickle.dump(recon_error_train, f)
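# A minimal sketch of reading those metrics back for inspection, assuming one
# pickled error curve per file exactly as written above (the file names and
# the two-layer count follow from the script):
import pickle

layer_errors = []
for index in range(2):  # two RBM layers: [256, 512]
    with open("layer_" + str(index), 'rb') as f:
        layer_errors.append(pickle.load(f))

with open("train_recon_finetune", 'rb') as f:
    train_curve = pickle.load(f)
with open("test_recon_finetune", 'rb') as f:
    test_curve = pickle.load(f)

print(len(layer_errors), 'layer-wise curves loaded')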
import numpy as np

import AE  # autoencoder module providing dA and dA_params (import path assumed)
from dbn.models import UnsupervisedDBN


class DBN_AE:
    """Feature mapper (DBN) followed by an anomaly-detecting autoencoder."""

    def __init__(self, n, max_autoencoder_size=10, FM_grace_period=None,
                 AD_grace_period=10000, dbn_layers=[100, 75, 50, 35, 16],
                 learning_rate=0.1, hidden_ratio=0.75, feature_map=None):
        self.AD_grace_period = AD_grace_period
        self.FM_grace_period = FM_grace_period
        self.lr = learning_rate
        self.hr = hidden_ratio
        self.n = n
        self.n_trained = 0   # number of training instances seen so far
        self.n_executed = 0  # number of executed instances seen so far
        self.dbn_batch = 10000
        self.dbn_layers = dbn_layers
        self.AE_dim = dbn_layers[-1]
        self.__createDBN__()
        self.__createAE__()
        self.fvs = []
        print("Deep Belief Network: train-mode, Auto-Encoder: off-mode")

    def __createAE__(self):
        params = AE.dA_params(self.AE_dim, n_hidden=0, lr=self.lr,
                              corruption_level=0, gracePeriod=0,
                              hiddenRatio=self.hr)
        self.AE = AE.dA(params)

    def __createDBN__(self):
        self.FM = UnsupervisedDBN(hidden_layers_structure=self.dbn_layers,
                                  batch_size=512,
                                  learning_rate_rbm=0.3,
                                  n_epochs_rbm=64,
                                  activation_function='sigmoid',
                                  verbose=False)

    def process(self, x):
        # Train during the grace periods, then switch to scoring.
        if self.n_trained < self.FM_grace_period + self.AD_grace_period:
            self.train(x)
            return 0.0
        else:
            return self.execute(x)

    def train_FM(self, x):
        # Buffer feature vectors and fit the DBN feature mapper per batch.
        self.fvs.append(x)
        if len(self.fvs) == self.dbn_batch:
            xx = np.array(self.fvs)
            self.FM.fit(xx)
            self.fvs.clear()

    def train(self, x):
        if self.n_trained < self.FM_grace_period:
            self.train_FM(x)
        else:
            S_l1 = self.FM.transform(x)
            self.AE.train(S_l1)
        self.n_trained += 1
        if self.n_trained == self.AD_grace_period + self.FM_grace_period:
            print("Deep Belief Network: execute-mode, Auto-Encoder: train-mode")

    def execute(self, x):
        self.n_executed += 1
        S_l1 = self.FM.transform(x)
        return self.AE.execute(S_l1)
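# A hypothetical usage sketch of the streaming interface: feed one feature
# vector at a time; process() returns 0.0 during the grace periods and an
# autoencoder reconstruction score afterwards. The stream below is synthetic.
import numpy as np

rng = np.random.default_rng(0)
stream = rng.random((25000, 100))  # 25,000 synthetic 100-dim feature vectors

detector = DBN_AE(n=100, FM_grace_period=10000, AD_grace_period=10000)
scores = [detector.process(x) for x in stream]
print('max score on the post-training tail:', max(scores[20000:]))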
                        header=None, encoding='ISO-8859-1')
u_item_DF['movie desription'] = [val[2] for i, val in data_new.iterrows()]

sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True,
                                max_features=5000, smooth_idf=False,
                                sublinear_tf=True, tokenizer=tokeniser)
item_feature_matrix = sklearn_tfidf.fit_transform(
    u_item_DF['movie desription'].values.astype('U'))
print('dimension of the item-feature matrix', item_feature_matrix.shape)

# Train DBN model
from dbn.models import UnsupervisedDBN

# [4604, 2000, 4000, 3000, 1000]
dbn = UnsupervisedDBN(hidden_layers_structure=[5000, 400],
                      batch_size=10,
                      learning_rate_rbm=0.06,
                      n_epochs_rbm=20,
                      activation_function='sigmoid')
dbn.fit(item_feature_matrix.A)

# Save the model
print('Saving Model ...')
dbn.save('model-1.pkl')
print('Model Saved')
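# A follow-up sketch for consuming the saved model, assuming
# UnsupervisedDBN.load (used elsewhere in this project) and the fitted
# sklearn_tfidf vectorizer are in scope; new_descriptions is a placeholder.
from dbn.models import UnsupervisedDBN

dbn = UnsupervisedDBN.load('model-1.pkl')

new_descriptions = ['a detective hunts a serial killer',
                    'a romantic comedy set in Paris']
features = sklearn_tfidf.transform(new_descriptions)  # 5000-dim tf-idf rows
embeddings = dbn.transform(features.A)                # 400-dim DBN codes
print(embeddings.shape)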
def main(tfidfModel=None, tfidfMatrix=None, dbn_model=None, kmeans_model=None):
    # 0. Read the data and build the train/test description lists from the
    #    u1.base / u1.test split.
    items_info = pd.read_csv('input/items.csv', sep=';', encoding='ISO-8859-1')
    u_base1 = pd.read_csv('input/u1.base', sep='\t', header=None)
    train = pd.DataFrame(u_base1[1].drop_duplicates())
    u_test1 = pd.read_csv('input/u1.test', sep='\t', header=None)
    test = pd.DataFrame(u_test1[1].drop_duplicates())

    train_desc = [
        items_info[items_info['movie id'] == df[1]]['movie desription'].values[0]
        for i, df in train.iterrows()
    ]
    test_desc = [
        items_info[items_info['movie id'] == df[1]]['movie desription'].values[0]
        for i, df in test.iterrows()
    ]

    # 1. Train the tf-idf model and save it under model/tfidf_model.pickle,
    #    together with the resulting item-feature matrix.
    if not tfidfModel:
        print('training tf-idf model ...')
        tfidf_model = TfidfVectorizer(norm='l2', min_df=0, use_idf=True,
                                      max_features=5000, smooth_idf=False,
                                      sublinear_tf=True, tokenizer=tokeniser)
        tfidf_model.fit(train_desc)
        print('- Saving tf-idf model ...')
        save_model('model/tfidf_model.pickle', tfidf_model)
    else:
        print('# Loading tf-idf model ...')
        tfidf_model = load_model(tfidfModel)

    if not tfidfMatrix:
        item_feature_matrix = tfidf_model.transform(train_desc)
        # 1.2. Save the tf-idf matrix result.
        print('- Saving tf-idf matrix result ...')
        save_model('result/item_feature_matrix.pickle', item_feature_matrix)
    else:
        print('# Loading tf-idf matrix result ...')
        item_feature_matrix = load_model(tfidfMatrix)

    # 2. Train the DBN model and save it into model/dbn-model.pkl.
    if not dbn_model:
        dbn = UnsupervisedDBN(hidden_layers_structure=[5000, 1000, 1000, 500],
                              batch_size=10,
                              learning_rate_rbm=0.06,
                              n_epochs_rbm=20,
                              activation_function='sigmoid')
        # 2.2. Fit the DBN model.
        dbn.fit(item_feature_matrix.A)
        # 2.3. Save the DBN model.
        print('saving DBN model ...')
        dbn.save('model/dbn-model.pkl')
    else:
        print('Loading DBN model')
        dbn = UnsupervisedDBN.load(dbn_model)

    # 3. Cluster with k-means and save the model and results.
    if not kmeans_model:
        kmeans = KMeans(n_clusters=5, random_state=0).fit(
            dbn.transform(item_feature_matrix.A))
        print('saving k-means model ...')
        save_model('model/kmeans-model.pkl', kmeans)
    else:
        print('loading k-means model ...')
        kmeans = load_model(kmeans_model)

    print("Done!")
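# Usage sketch: a first run trains and caches everything; later runs can skip
# retraining by passing the paths the function saved above.
if __name__ == '__main__':
    # First run: train tf-idf, DBN, and k-means from scratch.
    main()
    # Subsequent runs: reuse the cached artifacts.
    main(tfidfModel='model/tfidf_model.pickle',
         tfidfMatrix='result/item_feature_matrix.pickle',
         dbn_model='model/dbn-model.pkl',
         kmeans_model='model/kmeans-model.pkl')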