Example #1
File: classify.py Project: wavefier/wdfml
    def PreprocessingRBM(self, components, MNE_coefficients, N_neighbors):
        """
        :type MNE_coefficients: int

        :param MNE_coefficients: number of coefficnents for mns projection

        :param N_neighbors: number of neighbors for embedding
        """
        self.MNE_coefficients = MNE_coefficients

        self.N_neighbors = N_neighbors

        self.rbm = neural_network.BernoulliRBM(n_components=components,
                                               learning_rate=0.05,
                                               batch_size=10,
                                               n_iter=100,
                                               verbose=0,
                                               random_state=0)

        self.Embedding = manifold.SpectralEmbedding(
            n_components=self.MNE_coefficients,
            affinity='nearest_neighbors',
            gamma=None,
            random_state=0,
            n_neighbors=self.N_neighbors)
        self.X_rbm = self.rbm.fit_transform(self.Waves_Coefficients)
        self.X_red = self.Embedding.fit_transform(self.X_rbm)
        return self.X_red
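A minimal usage sketch for the method above; the constructor name and input array are assumptions, since only this method of classify.py is shown:

clf = Classifier()              # hypothetical: whichever class in classify.py defines PreprocessingRBM
clf.Waves_Coefficients = X      # assumed (n_samples, n_features) array, scaled to [0, 1] for the RBM
X_red = clf.PreprocessingRBM(components=64, MNE_coefficients=3, N_neighbors=10)
print(X_red.shape)              # (n_samples, 3): one column per MNE coefficient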
Example #2
        out_log.flush()
        #
        # looping over all parameters combinations
        for n_hidden in n_hidden_values:
            for l_rate in learning_rate_values:
                for batch_size in batch_size_values:
                    for n_iters in n_iter_values:

                        logging.info('Learning RBM for {} {} {} {}'.format(
                            n_hidden, l_rate, batch_size, n_iters))
                        #
                        # learning
                        rbm = neural_network.BernoulliRBM(
                            n_components=n_hidden,
                            learning_rate=l_rate,
                            batch_size=batch_size,
                            n_iter=n_iters,
                            verbose=args.verbose - 1,
                            random_state=rand_gen)
                        fit_s_t = perf_counter()
                        rbm.fit(train)
                        fit_e_t = perf_counter()
                        logging.info('Trained in {} secs'.format(fit_e_t -
                                                                 fit_s_t))

                        #
                        # evaluating training
                        eval_s_t = perf_counter()
                        train_plls = rbm.score_samples(train)
                        eval_e_t = perf_counter()
                        train_avg_pll = numpy.mean(train_plls)
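The four nested loops above can be flattened with itertools.product; a sketch of the same sweep, assuming the *_values lists defined earlier in the script:

import itertools

for n_hidden, l_rate, batch_size, n_iters in itertools.product(
        n_hidden_values, learning_rate_values, batch_size_values, n_iter_values):
    # same body as above: build the BernoulliRBM, fit, time, and score it
    ...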
Example #3
# In Sample R2
ens7_insample_pred = ens7.predict(df)
print(r2_score(train.y, ens7_insample_pred))  #

# Predict
ens7_pred = ens7.predict(df_test) # LB:

submission         = pd.read_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/input/sample_submission.csv')
submission.y       = ens7_pred
submission.id      = id
submission.columns = ['ID', 'y']
submission.to_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/output/layer2_nn_mlp.csv', index=False)

print("Ensemble Model 8: neural_network.BernoulliRBM(")
ens8  = neural_network.BernoulliRBM(n_components=256, learning_rate=0.1, batch_size=10, n_iter=10, verbose=0,
                                    random_state=None)

ens8.fit(df, train.y)

# In Sample R2
ens8_insample_pred = ens8.predict(df)
print(r2_score(train.y, ens8_insample_pred))  #

# Predict
ens8_pred = ens8.predict(df_test) # LB:

submission         = pd.read_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/input/sample_submission.csv')
submission.y       = ens8_pred
submission.id      = id
submission.columns = ['ID', 'y']
submission.to_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/output/layer2_nn_rbm.csv', index=False)
Example #4
def predict(train_list, train_result, test_list, method_list, **kwargs):
    def fit_predict_each_output(model, target):
        __predict_result = []
        for idx in range(np.size(target, 1)):
            model.fit(train_list, target[:, idx])
            __predict_result.append(model.predict(test_list))
        return np.transpose(np.asarray(__predict_result))

    def fit_predict(model, target):
        model.fit(train_list, target)
        return model.predict(test_list)

    from_bins_idx = kwargs["from_bins_idx"]
    to_bins_idx = kwargs["to_bins_idx"]
    _binned_train_result = to_bins_idx(train_result)

    _predict_result = []
    if "current" in method_list:
        rbm = neural_network.BernoulliRBM(n_components=512, verbose=False, n_iter=100, learning_rate=1e-2, random_state=0)
        rbm.fit(train_list)
        rbm.fit(test_list)
        _predict_result.append(np.transpose(np.asarray(__predict_result)))
    elif "knn" in method_list:
        _ = knn_predict(train_list, _binned_train_result, test_list, k=kwargs["k"])
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "dt" in method_list:
        _ = fit_predict(tree.DecisionTreeClassifier(max_depth=kwargs["max_depth"]), _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "rf" in method_list:
        _ = fit_predict(ensemble.RandomForestClassifier(n_estimators=kwargs["n_estimators"], max_depth=kwargs["max_depth"], n_jobs=kwargs["n_jobs"]), _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "average" in method_list:
        _ = average_predict(train_result, test_list)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "adaboost" in method_list:
        _ = fit_predict_each_output(ensemble.AdaBoostClassifier(), _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "ridge" in method_list:
        _ = fit_predict_each_output(linear_model.RidgeClassifier(), _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "linear" in method_list:
        _predict_result.append(fit_predict_each_output(linear_model.LinearRegression(), train_result))
    elif "huber" in method_list:
        _predict_result.append(fit_predict_each_output(linear_model.HuberRegressor(), train_result))
    elif "theilsen" in method_list:
        _predict_result.append(fit_predict_each_output(linear_model.TheilSenRegressor(), train_result))
    elif "lasso" in method_list:
        _predict_result.append(fit_predict_each_output(linear_model.Lasso(), train_result))
    elif "par" in method_list:
        _predict_result.append(fit_predict_each_output(linear_model.PassiveAggressiveRegressor(C=kwargs["par_C"], epsilon=kwargs["par_eps"]), train_result))
    elif "ridge_reg" in method_list:
        _predict_result.append(fit_predict_each_output(linear_model.Ridge(), train_result))
    elif "dt_reg" in method_list:
        _predict_result.append(fit_predict(tree.DecisionTreeRegressor(max_depth=kwargs["max_depth"]), train_result))
    elif "rf_reg" in method_list:
        _predict_result.append(fit_predict(ensemble.RandomForestRegressor(max_depth=kwargs["max_depth"], n_jobs=kwargs['n_jobs'], n_estimators=kwargs['n_estimators']), train_result))
    elif "xgboost" in method_list:
        _predict_result.append(fit_predict_each_output(xgb.XGBClassifier(max_depth=kwargs["max_depth"], n_estimators=kwargs['n_estimators'], nthread=kwargs["nthread"]), _binned_train_result))
    elif "xgboost_reg" in method_list:
        _predict_result.append(fit_predict_each_output(xgb.XGBRegressor(max_depth=kwargs["max_depth"], n_estimators=kwargs['n_estimators'], nthread=kwargs["nthread"]), train_result))
    elif "svr" in method_list:
        _predict_result.append(fit_predict_each_output(svm.SVR(C=kwargs["C"], epsilon=kwargs["epsilon"]), train_result))
    elif "linear_svr" in method_list:
        _predict_result.append(fit_predict_each_output(svm.LinearSVR(C=kwargs["C"], epsilon=kwargs["epsilon"]), train_result))
    else:
        assert False, "invalid method"
    return np.asarray(_predict_result)
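A hypothetical invocation of the "rf_reg" branch above. Since from_bins_idx and to_bins_idx are read unconditionally, the regression paths still require them, so identity functions are passed; X_train, y_train, and X_test are assumed arrays:

identity = lambda a: a          # binning helpers are required kwargs even for regression methods
y_hat = predict(X_train, y_train, X_test, ["rf_reg"],
                from_bins_idx=identity, to_bins_idx=identity,
                max_depth=8, n_jobs=4, n_estimators=100)
print(y_hat.shape)              # (1, ...): one stacked entry per selected method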
Example #5
def main(K, numfeatures, sample_file, num_display_words, outputfile):
    K_clusters = K
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 max_features=numfeatures,
                                 min_df=2,
                                 stop_words='english',
                                 ngram_range=(1, 2),
                                 use_idf=True)

    text = []
    with open(sample_file, 'r') as f:
        text = f.readlines()

    labels = []

    with open('hygiene.dat.labels', 'r') as f:
        labels = f.readlines()

    labels = [label.strip() for label in labels]  # a list, not a lazy map, so it can be sliced below

    text_reference = []

    #with open ('hygiene.data.additional', 'r') as f:
    text_reference = pd.read_csv(
        'hygiene.dat.additional',
        names=['cuisines', 'zip_code', 'number_of_reviews', 'average_rating'],
        header=None)

    label_encoder = preprocessing.LabelEncoder()

    transformed_zip_codes = label_encoder.fit_transform(
        text_reference['zip_code'])

    cuisines_dict_list = []

    for cuisine_list in text_reference['cuisines']:
        a_cuisines_dict = {}
        for a_cuisine in cuisine_list.replace('[', '').replace(
                ']', '').strip().split(','):
            a_cuisines_dict[a_cuisine.strip()] = 1
        cuisines_dict_list.append(a_cuisines_dict)

    vec = DictVectorizer()

    test_cuisine_features = vec.fit_transform(cuisines_dict_list).toarray()

    test_cuisine_features = pd.DataFrame(test_cuisine_features)

    total_cuisine_features = len(vec.get_feature_names())
    cuisines_positive_corr = {}
    for i in range(total_cuisine_features):
        #pearsons_coeff=np.corrcoef(map(int,test_cuisine_features[i][:546]),map(int,labels[:546]))[0, 1]
        # pearsonr returns a (coefficient, p-value) pair; compare the coefficient only
        pearsons_coeff, _ = scipy.stats.pearsonr(
            [int(v) for v in test_cuisine_features[i][:546]],
            [int(v) for v in labels[:546]])
        if pearsons_coeff > 0:
            print(vec.get_feature_names()[i])
            cuisines_positive_corr[vec.get_feature_names()[i]] = 1

    #Now consider cuisines with only positive correlation
    cuisines_dict_list = []

    for cuisine_list in text_reference['cuisines']:
        a_cuisines_dict = {}
        for a_cuisine in cuisine_list.replace('[', '').replace(
                ']', '').strip().split(','):
            if a_cuisine.strip() in cuisines_positive_corr:
                a_cuisines_dict[a_cuisine.strip()] = 1
        cuisines_dict_list.append(a_cuisines_dict)

    transformed_cuisines_features = vec.fit_transform(
        cuisines_dict_list).toarray()

    #logging.basicConfig(format='%asctime)s: %(levelname)s : %(message)s',level=logging.INFO)

    #t0 = time()
    print(
        "Extracting features from the training dataset using a sparse vectorizer"
    )
    X = vectorizer.fit_transform(text)
    #print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)

    # mapping from feature id to actual word
    id2words = {}
    for i, word in enumerate(vectorizer.get_feature_names()):
        id2words[i] = word

    #t0 = time()
    print("Applying topic modeling, using LDA")
    print(str(K_clusters) + " topics")
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)
    lda = models.ldamodel.LdaModel(corpus,
                                   num_topics=K_clusters,
                                   id2word=id2words,
                                   iterations=1000)
    #print("done in %fs" % (time() - t0))
    doc_topics_list = lda.get_document_topics(corpus)

    doc_topics = sparse.csr_matrix((13299, K_clusters))

    for i, doc_a in enumerate(doc_topics_list):
        for (my_topic_a, weight_a) in doc_a:
            doc_topics[i, my_topic_a] = weight_a
        #doc_topics[i,K_clusters] =  transformed_zip_codes[i]
        #doc_topics[i,(K_clusters+1)] = text_reference['number_of_reviews'][i]
        #doc_topics[i,(K_clusters+2)] = text_reference['average_rating'][i]

    #doc_topics=sparse.hstack([doc_topics,transformed_cuisines_features]).todense()
    zip_codes_sparse = sparse.csr_matrix(transformed_zip_codes).transpose()

    num_of_reviews_sparse = sparse.csr_matrix(
        text_reference['number_of_reviews']).transpose()

    avg_rating_sparse = sparse.csr_matrix(
        text_reference['average_rating']).transpose()

    doc_topics_features = sparse.hstack([
        doc_topics, transformed_cuisines_features, zip_codes_sparse,
        num_of_reviews_sparse, avg_rating_sparse
    ]).todense()

    #output_text = []
    #for i, item in enumerate(lda.show_topics(num_topics=K_clusters, num_words=num_display_words, formatted=False)):
    #   output_text.append("Topic: " + str(i))
    #   for weight,term in item:
    #       output_text.append( term + " : " + str(weight) )

    #print "writing topics to file:", outputfile
    #with open ( outputfile, 'w' ) as f:
    #    f.write('\n'.join(output_text))

    # BernoulliRBM alone has no predict() and cannot classify; feed its
    # features into logistic regression (the standard RBM + logistic pairing;
    # assumes `from sklearn.pipeline import Pipeline` and sklearn's
    # linear_model are available).
    clf = Pipeline([('rbm', neural_network.BernoulliRBM()),
                    ('logistic', linear_model.LogisticRegression())])
    #start = time.time()
    train_docs_vectors = doc_topics_features[:546]
    train_labels = labels[:546]
    train_labels = [int(label) for label in train_labels]
    #cols=train_docs_vectors.columns[1:]

    #predicted_label=['CG']
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_docs_vectors, train_labels, test_size=0.3, random_state=0)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    print(cm)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print('Normalized confusion matrix')
    print(cm_normalized)

    print(
        cross_validation.cross_val_score(clf,
                                         train_docs_vectors,
                                         train_labels,
                                         cv=20,
                                         scoring="f1_macro").mean())
    #print(metrics.accuracy_score(train_labels, predicted))

    #print(f1_score(y_test, y_pred, average='binary'))
    #lb = preprocessing.LabelBinarizer()
    #y_test=lb.fit(y_test)
    #y_pred=lb.fit(y_pred)
    #print(cross_validation.cross_val_score(clf,y_test,y_pred,scoring="f1"))

    test_docs_vectors = doc_topics_features[546:]

    predicted_labels = (clf.predict(test_docs_vectors)).tolist()

    output_text = ["CG"]
    for label in predicted_labels:
        output_text.append(str(label))

    print "writing predicted labels to file: competition.txt"
    with open('competition.txt', 'w') as f:
        f.writelines("%s\n" % item for item in output_text)
Example #6
                                title="Non-Hinge dir")
    images = getImages(hingeDir, nonhingeDir)  #Getting training data
    print "Got the images!"

    # print zip(*images[1])
    shuffle(images)
    # print zip(*images[1])
    # Split images into training and testing halves
    trainingImages = images[len(images) // 2:]
    testingImages = images[:len(images) // 2]
    # classifier = svm.SVC(gamma=userGamma, tol=tolerence)
    print "Initializing NN!"
    # classifier = neural_network.MLPClassifier(hidden_layer_sizes=(400, 300, 200, 100), solver="lbfgs", max_iter=10000, alpha=.0001, activation="tanh", verbose=False)
    # classifier = tree.DecisionTreeClassifier(criterion="gini", splitter="best", max_features="auto")
    logistic = linear_model.LogisticRegression()
    rbm = neural_network.BernoulliRBM()
    print("Starting to fit, hold on tight!")
    # print zip(*trainingImages)[1]
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    rbm.learning_rate = 0.03
    rbm.n_iter = 15
    # More components tend to give better prediction performance, but larger
    # fitting time
    rbm.n_components = 100
    logistic.C = 6000.0

    # Training RBM-Logistic Pipeline (zip objects are not subscriptable in
    # Python 3, so unpack the (image, label) pairs first)
    X_train, y_train = zip(*trainingImages)
    classifier.fit(X_train, y_train)

    # Training Logistic regression
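The snippet cuts off after the final comment; in the classic RBM-plus-logistic recipe, that step fits a plain logistic regression on the raw inputs as a baseline for the pipeline above. A sketch of how it might continue (the C value is an assumption):

    # Hypothetical continuation: raw-input logistic baseline for comparison
    logistic_raw = linear_model.LogisticRegression(C=100.0)
    logistic_raw.fit(X_train, y_train)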
Example #7
    def dtc(self):

        # Convert y to 1-D form: self.y_train, self.y_test
        self.y01_train = list()
        self.y01_test = list()
        for a in range(len(self.y_train)):
            self.y01_train.append(self.y_train[a][0])
        for b in range(len(self.y_test)):
            self.y01_test.append(self.y_test[b][0])

        # Get n_components
        if self.nc_edit.text().strip():
            nc = int(self.nc_edit.text())
        else:
            nc = 256
        # Get learning_rate
        if self.le_edit.text().strip():
            le = float(self.le_edit.text())
        else:
            le = 0.1
        # Get batch_size
        if self.ba_edit.text().strip():
            ba = int(self.ba_edit.text())
        else:
            ba = 10
        # Get n_iter
        if self.ni_edit.text().strip():
            ni = int(self.ni_edit.text())
        else:
            ni = 10
        # Get verbose
        if self.ve_edit.text().strip():
            ve = int(self.ve_edit.text())
        else:
            ve = 0
        # Build the BernoulliRBM model
        self.clf = neural_network.BernoulliRBM(batch_size=ba,
                                               learning_rate=le,
                                               n_components=nc,
                                               n_iter=ni,
                                               random_state=None,
                                               verbose=ve)
        # Train the model
        self.clf.fit(self.x_train)
        # Transform x of the training set
        self.train_x = self.clf.transform(self.x_train)
        # Transform x of the test set
        self.test_x = self.clf.transform(self.x_test)
        '''
        This block configures the dtable_train table to display the training-set results
        '''
        # Set the row and column counts of the table
        self.dtable_train.setRowCount(len(self.train_x))
        self.dtable_train.setColumnCount(len(self.train_x[0]))

        for s in range(len(self.train_x)):
            if s % 2 == 0:  # highlight even-numbered rows
                for s01 in range(len(self.train_x[0])):
                    self.dtable_train.setItem(
                        s, s01,
                        QtGui.QTableWidgetItem(str(self.train_x[s][s01])))
                    self.dtable_train.item(s, s01).setBackgroundColor(
                        QtGui.QColor(214, 71, 0))
            else:
                for s01 in range(len(self.train_x[0])):
                    self.dtable_train.setItem(
                        s, s01,
                        QtGui.QTableWidgetItem(str(self.train_x[s][s01])))
        '''
        This block configures the dtable_test table to display the test-set results
        '''
        # Set the row and column counts of the table
        self.dtable_test.setRowCount(len(self.test_x))
        self.dtable_test.setColumnCount(len(self.test_x[0]))

        for s in range(len(self.test_x)):
            if s % 2 == 0:  # highlight even-numbered rows
                for s01 in range(len(self.test_x[0])):
                    self.dtable_test.setItem(
                        s, s01,
                        QtGui.QTableWidgetItem(str(self.test_x[s][s01])))
                    self.dtable_test.item(s, s01).setBackgroundColor(
                        QtGui.QColor(214, 71, 0))
            else:
                for s01 in range(len(self.test_x[0])):
                    self.dtable_test.setItem(
                        s, s01,
                        QtGui.QTableWidgetItem(str(self.test_x[s][s01])))
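For reference, BernoulliRBM.transform returns the hidden-unit representation, so each table above has nc columns. A minimal standalone sketch of the same step, assuming x_train is a (n_samples, n_features) array scaled to [0, 1]:

from sklearn import neural_network

rbm = neural_network.BernoulliRBM(n_components=256, learning_rate=0.1,
                                  batch_size=10, n_iter=10, random_state=0)
hidden = rbm.fit_transform(x_train)   # shape: (n_samples, 256)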
Example #8
    #load params, transform data_train, data_test

else:
    print('extracting features with SGVB auto-encoder, default is 10 iterations...')
    encoder = sgvb.SGVB(verbose=True)
    encoder.fit(x_train)
    SGVB_train_features = encoder.transform(data_train)
    SGVB_test_features = encoder.transform(data_test)
    print('done')

    print('extracting features with RBM...')
    n_components = 200
    learning_rate = 0.01
    batch_size = 100
    # pass the hyperparameters by keyword (recent scikit-learn releases make
    # BernoulliRBM's constructor arguments keyword-only)
    rbm = neural_network.BernoulliRBM(n_components=n_components,
                                      learning_rate=learning_rate,
                                      batch_size=batch_size,
                                      verbose=True)
    rbm.fit(x_train)
    RBM_train_features = rbm.transform(data_train)
    RBM_test_features = rbm.transform(data_test)
    print('done')

print('performing logistic regression on raw data...')
LogReg_raw = linear_model.LogisticRegression()
LogReg_raw.fit(data_train, t_train)
raw_score = LogReg_raw.score(data_test, t_test)
print('Test score on raw data = ', raw_score)

print('performing logistic regression on RBM features...')
LogReg_RBM = linear_model.LogisticRegression()
LogReg_RBM.fit(RBM_train_features, t_train)
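The excerpt stops just before the RBM features are scored; mirroring the raw-data evaluation above, a plausible continuation is:

RBM_score = LogReg_RBM.score(RBM_test_features, t_test)
print('Test score on RBM features = ', RBM_score)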