Example #1
import os

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from dbn.models import UnsupervisedDBN

# save_model, load_model and tokeniser are project helpers defined elsewhere.


def main():
    # 0. read data and split it into 80% for training and 20% for testing
    items = pd.read_csv('input/items.csv', sep=';', encoding='ISO-8859-1')

    print(items.shape)

    items_train, items_test = train_test_split(items, train_size=0.8, random_state=0)

    print(items_train.shape, items_test.shape)


    # 1. train tf-idf model and save it under model/tfidf_model.pickle with the result
    if not os.path.isfile('model/tfidf_model.pickle'):
        print('training tf-idf model ...')
        tfidf_model = TfidfVectorizer(norm='l2',
                                      min_df=0,
                                      use_idf=True,
                                      max_features=5000,
                                      smooth_idf=False,
                                      sublinear_tf=True,
                                      tokenizer=tokeniser)
        item_feature_matrix = tfidf_model.fit_transform(
            items_train['movie desription'].values.astype('U'))
        print('#1. dimension of the item-feature matrix', item_feature_matrix.shape)

        # 1.1 saving tf-idf model
        print('Saving tf-idf model ...')
        save_model('model/tfidf_model.pickle', tfidf_model)
    else:
        # reuse the saved model so item_feature_matrix is always defined below
        tfidf_model = load_model('model/tfidf_model.pickle')
        item_feature_matrix = tfidf_model.transform(
            items_train['movie desription'].values.astype('U'))

    if not os.path.isfile('result/item_feature_matrix.pickle'):
        # 1.2. saving tf-idf matrix result
        print('Saving tf-idf matrix result ...')
        save_model('result/item_feature_matrix.pickle', item_feature_matrix)

    # 2. train dbn model and save it into model/dbn-model.pkl
    # 2.1. load tf-idf result
    print('loading item feature matrix ...')
    item_feature_matrix = load_model('result/item_feature_matrix.pickle')
    
    if not os.path.isfile('model/dbn-model.pkl'):
        dbn = UnsupervisedDBN(hidden_layers_structure=[5000, 400],
                              batch_size=10,
                              learning_rate_rbm=0.06,
                              n_epochs_rbm=20,
                              activation_function='sigmoid')
        # 2.2. fit dbn model
        dbn.fit(item_feature_matrix.A)
        # 2.3. save dbn model
        print('saving DBN model ...')
        dbn.save('model/dbn-model.pkl')
        
    print('Loading DBN model')
    dbn = UnsupervisedDBN.load('model/dbn-model.pkl')

    # 3. Clustering with k-means and save model and results
    if not os.path.isfile('model/kmeans-model.pkl'):
        kmeans = KMeans(n_clusters=5, random_state=0).fit(dbn.transform(item_feature_matrix.A))
        print('saving k-means model ...')
        save_model('model/kmeans-model.pkl', kmeans)
    else:
        kmeans = load_model('model/kmeans-model.pkl')
    
    print(kmeans.labels_)
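The snippet above calls save_model, load_model and tokeniser, which are project helpers not shown in the listing. A minimal sketch of what they could look like, assuming plain pickle wrappers and a simple regex tokenizer (the names and behavior here are assumptions, not the project's actual code):

import pickle
import re

def save_model(path, model):
    # serialize any Python object (vectorizer, matrix, estimator) to disk
    with open(path, 'wb') as f:
        pickle.dump(model, f)

def load_model(path):
    # inverse of save_model: restore the pickled object
    with open(path, 'rb') as f:
        return pickle.load(f)

def tokeniser(text):
    # lowercase word tokenizer; the real project may stem or filter stopwords
    return re.findall(r'[a-z]+', text.lower())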
Example #2
import io
import pickle

from dbn.models import UnsupervisedDBN

# X_train and X_test are prepared earlier in the original script.
print('X_test size : {0} \n'.format(X_test.shape))

# Models we will use
dbn = UnsupervisedDBN(hidden_layers_structure=[256, 512],
                      batch_size=32,
                      learning_rate_rbm=0.06,
                      learning_rate_backprop=1e-3,
                      n_epochs_rbm=50,
                      n_epochs_fine_tune=500,
                      activation_function='sigmoid',
                      contrastive_divergence_iter=1)

###############################################################################

# Training RBM-Logistic Pipeline
dbn.fit(X_train)
# Save the training metrics
for index, layer_wise_error in enumerate(dbn.layer_wise_error):
    with io.open("layer_" + str(index), 'wb') as f:
        pickle.dump(layer_wise_error, f)

# Fine tune the DBN using the reconstruction MSE (over pixels)
recon_error_test, recon_error_train = dbn.fine_tune(X_train, X_test)

# Save fine tuned parameters
with io.open("test_recon_finetune", 'wb') as f:
    pickle.dump(recon_error_test, f)

with io.open("train_recon_finetune", 'wb') as f:
    pickle.dump(recon_error_train, f)
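To inspect the saved metrics later, the pickled files can be read back the same way they were written (assuming each pickle holds a per-epoch error sequence):

import pickle

# reload the per-layer RBM training errors and the fine-tuning curves
with open('layer_0', 'rb') as f:
    layer0_error = pickle.load(f)

with open('test_recon_finetune', 'rb') as f:
    recon_error_test = pickle.load(f)

print(len(layer0_error), len(recon_error_test))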
Example #3
import numpy as np

from dbn.models import UnsupervisedDBN
# AE provides dA_params/dA (a KitNET-style denoising autoencoder);
# its import path is project-specific.
import AE


class DBN_AE:
    def __init__(self,
                 n,
                 max_autoencoder_size=10,
                 FM_grace_period=None,
                 AD_grace_period=10000,
                 dbn_layers=[100, 75, 50, 35, 16],
                 learning_rate=0.1,
                 hidden_ratio=0.75,
                 feature_map=None):
        self.AD_grace_period = AD_grace_period
        self.FM_grace_period = FM_grace_period
        self.lr = learning_rate
        self.hr = hidden_ratio
        self.n = n

        self.n_trained = 0  # the number of training instances so far
        self.n_executed = 0  # the number of executed instances so far
        self.dbn_batch = 10000
        self.dbn_layers = dbn_layers
        self.AE_dim = dbn_layers[-1]
        self.__createDBN__()
        self.__createAE__()
        self.fvs = []
        print("Deep Belief Network: train-mode, Auto-Encoder: off-mode")

    def __createAE__(self):
        params = AE.dA_params(self.AE_dim,
                              n_hidden=0,
                              lr=self.lr,
                              corruption_level=0,
                              gracePeriod=0,
                              hiddenRatio=self.hr)
        self.AE = AE.dA(params)

    def __createDBN__(self):
        self.FM = UnsupervisedDBN(hidden_layers_structure=self.dbn_layers,
                                  batch_size=512,
                                  learning_rate_rbm=0.3,
                                  n_epochs_rbm=64,
                                  activation_function='sigmoid',
                                  verbose=False)

    def process(self, x):
        # keep training during the grace periods, then score each instance;
        # FM_grace_period must be set to an integer for this comparison to work
        if self.n_trained < self.FM_grace_period + self.AD_grace_period:
            self.train(x)
            return 0.0
        else:
            return self.execute(x)

    def train_FM(self, x):
        # buffer feature vectors and fit the DBN once a full batch is collected
        self.fvs.append(x)
        if len(self.fvs) == self.dbn_batch:
            xx = np.array(self.fvs)
            self.FM.fit(xx)
            self.fvs.clear()

    def train(self, x):
        if self.n_trained < self.FM_grace_period:
            self.train_FM(x)
        else:
            S_l1 = self.FM.transform(x)
            self.AE.train(S_l1)
        self.n_trained += 1
        if self.n_trained == self.AD_grace_period + self.FM_grace_period:
            print(
                "Deep Belief Network: execute-mode, Auto-Encoder: train-mode")

    def execute(self, x):
        self.n_executed += 1
        S_l1 = self.FM.transform(x)
        return self.AE.execute(S_l1)
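A sketch of how this class could be driven over a stream of feature vectors (the dimensionality and instance counts below are made up for illustration):

import numpy as np

n_features = 100
detector = DBN_AE(n=n_features, FM_grace_period=10000, AD_grace_period=10000)

# scores are 0.0 while both grace periods run (DBN, then autoencoder training);
# afterwards each call returns the autoencoder's reconstruction error
for _ in range(25000):
    x = np.random.rand(n_features)
    score = detector.process(x)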
Example #4
File: test_.py Project: gaaragots/e
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# tokeniser is a project helper (see Example #1).
# The read_csv call is truncated in the source; a plausible opening, assuming
# the same items file as the other examples with the plot text in column 2:
data_new = pd.read_csv('input/items.csv',
                       sep=';',
                       header=None,
                       encoding='ISO-8859-1')
u_item_DF = pd.DataFrame()
u_item_DF['movie desription'] = [val[2] for i, val in data_new.iterrows()]

sklearn_tfidf = TfidfVectorizer(norm='l2',
                                min_df=0,
                                use_idf=True,
                                max_features=5000,
                                smooth_idf=False,
                                sublinear_tf=True,
                                tokenizer=tokeniser)
item_feature_matrix = sklearn_tfidf.fit_transform(
    u_item_DF['movie desription'].values.astype('U'))
print('dimension of the item-feature matrix', item_feature_matrix.shape)

# Train DBN model
from dbn.models import UnsupervisedDBN

#[4604, 2000, 4000, 3000, 1000]
dbn = UnsupervisedDBN(hidden_layers_structure=[5000, 400],
                      batch_size=10,
                      learning_rate_rbm=0.06,
                      n_epochs_rbm=20,
                      activation_function='sigmoid')
dbn.fit(item_feature_matrix.A)

# Save the model
print('Saving Model ...')
dbn.save('model-1.pkl')
print('Model Saved')
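The saved model can be restored later with the library's load class method (the same API Example #1 uses) and applied to new data:

from dbn.models import UnsupervisedDBN

# restore the trained network and project items into the 400-d hidden space
dbn = UnsupervisedDBN.load('model-1.pkl')
item_embeddings = dbn.transform(item_feature_matrix.A)
print(item_embeddings.shape)  # expected: (n_items, 400)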
Example #5
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

from dbn.models import UnsupervisedDBN

# save_model, load_model and tokeniser are project helpers defined elsewhere.


def main(tfidfModel=None, tfidfMatrix=None, dbn_model=None, kmeans_model=None):
    # 0. read data; u1.base / u1.test already provide the 80% / 20% train/test split
    items_info = pd.read_csv('input/items.csv', sep=';', encoding='ISO-8859-1')
    u_base1 = pd.read_csv('input/u1.base', sep='\t', header=None)
    train = pd.DataFrame(u_base1[1].drop_duplicates())
    u_test1 = pd.read_csv('input/u1.test', sep='\t', header=None)
    test = pd.DataFrame(u_test1[1].drop_duplicates())

    train_desc = [
        items_info[items_info['movie id'] == df[1]]
        ['movie desription'].values[0] for i, df in train.iterrows()
    ]
    test_desc = [
        items_info[items_info['movie id'] == df[1]]
        ['movie desription'].values[0] for i, df in test.iterrows()
    ]

    # 1. train tf-idf model and save it under model/tfidf_model.pickle with the result
    if not tfidfModel:
        print('training tf-idf model ...')
        tfidf_model = TfidfVectorizer(norm='l2',
                                      min_df=0,
                                      use_idf=True,
                                      max_features=5000,
                                      smooth_idf=False,
                                      sublinear_tf=True,
                                      tokenizer=tokeniser)
        tfidf_model.fit(train_desc)
        print('- Saving tf-idf model ...')
        save_model('model/tfidf_model.pickle', tfidf_model)
    else:
        print('# Loading tf-idf model ...')
        tfidf_model = load_model(tfidfModel)

    if not tfidfMatrix:
        item_feature_matrix = tfidf_model.transform(train_desc)
        # 1.2. saving tf-idf matrix result
        print('- Saving tf-idf matrix result ...')
        save_model('result/item_feature_matrix.pickle', item_feature_matrix)
    else:
        print('# Loading tf-idf matrix result ...')
        item_feature_matrix = load_model(tfidfMatrix)

    if not dbn_model:
        dbn = UnsupervisedDBN(hidden_layers_structure=[5000, 1000, 1000, 500],
                              batch_size=10,
                              learning_rate_rbm=0.06,
                              n_epochs_rbm=20,
                              activation_function='sigmoid')
        # 2.2. fit dbn model
        dbn.fit(item_feature_matrix.A)
        # 2.3. save dbn model
        print('saving DBN model ...')
        dbn.save('model/dbn-model.pkl')
    else:
        print('Loading DBN model')
        dbn = UnsupervisedDBN.load(dbn_model)

    # 3. Clustering with k-means and save model and results
    if not kmeans_model:
        kmeans = KMeans(n_clusters=5, random_state=0).fit(
            dbn.transform(item_feature_matrix.A))
        print('saving k-means model ...')
        save_model('model/kmeans-model.pkl', kmeans)
    else:
        print('loading k-means model ...')
        kmeans = load_model(kmeans_model)

    print("Done!")