예제 #1
0
def training_dbn(train_dataset, n_targets=2, learn_rates=0.3,
                 learn_rate_decays=0.9, epochs=1000, n_hidden_layers=5,
                 n_hidden_layer_nodes=100, verbose=True):
    """Train a DBN classifier and measure how long the fit takes.

    Args:
        train_dataset: ``(X_train, y_train)`` pair; ``X_train.shape[1]`` is
            used as the width of the input layer.
        n_targets: Width of the output layer.
        learn_rates: Forwarded to ``DBN``.
        learn_rate_decays: Forwarded to ``DBN``.
        epochs: Forwarded to ``DBN``.
        n_hidden_layers: Number of hidden layers.
        n_hidden_layer_nodes: Width of every hidden layer.
        verbose: Forwarded to ``DBN``.

    Returns:
        ``(clf, exec_time)``: the fitted classifier and the wall-clock
        duration of ``clf.fit`` in seconds.
    """
    # Every hidden layer has the same width.
    hidden_sizes = (np.ones(n_hidden_layers, dtype=int) * n_hidden_layer_nodes).tolist()
    print(hidden_sizes)

    X_train, y_train = train_dataset
    topology = [X_train.shape[1]] + hidden_sizes + [n_targets]

    # Dropout is fixed at 0.1; everything else comes from the arguments.
    clf = DBN(topology,
              learn_rates=learn_rates,
              learn_rate_decays=learn_rate_decays,
              epochs=epochs,
              dropouts=0.1,
              verbose=verbose)

    # Only the fit itself is timed.
    started = time.time()
    clf.fit(X_train, y_train)
    exec_time = time.time() - started

    print('Exec time was {} secs'.format(exec_time))
    return clf, exec_time
def benchmark(k, epochs):
    """Reduce the data to ``k`` SVD components, fit a DBN and print its metrics.

    Reads the module-level ``X_train``, ``Y_train``, ``X_test`` and ``Y_test``.
    When ``CREATE_SUBMISSION`` is truthy, ``X_submit`` is also transformed and
    predicted, and the result is handed to ``dump_csv``.

    Args:
        k: Number of TruncatedSVD components; also the hidden-layer width.
        epochs: Number of DBN training epochs.
    """
    print("*" * 80)
    print("k: %d, epochs: %d\n" % (k, epochs))

    # Dimensionality reduction is fitted on the training data only.
    reducer = TruncatedSVD(n_components=k)
    train_features = reducer.fit_transform(X_train, Y_train)
    test_features = reducer.transform(X_test)

    print('done truncating')

    layer_sizes = [train_features.shape[1], k, 4]
    model = DBN(layer_sizes, learn_rates=0.3, learn_rate_decays=0.9,
                epochs=epochs, verbose=1)
    model.fit(train_features, Y_train)
    test_predictions = model.predict(test_features)

    if CREATE_SUBMISSION:
        submit_features = reducer.transform(X_submit)
        submit_predictions = model.predict(submit_features)
        dump_csv(submit_predictions, k, epochs)

    f1 = metrics.f1_score(Y_test, test_predictions)
    print("f1-score:   %0.3f" % f1)

    print("classification report:")
    print(metrics.classification_report(Y_test, test_predictions))

    print("confusion matrix:")
    print(metrics.confusion_matrix(Y_test, test_predictions))
def train_model(data_set_path='/home/devin.fisher/Kingdoms/treadstone/_samples/still_data/still_training_data.pkl'):
    """Fit a DBN on a pickled data set, save it, and print a classification report.

    The unpickled object is indexed with ``'data'`` and ``'target'``.  The
    fitted model is written to ``digit_model.pkl`` with joblib.

    Note that no hold-out split is made: the report is computed on the very
    samples the model was fitted on.

    Args:
        data_set_path: Path of the pickle file to load.
    """
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point this at trusted data.
    with open(data_set_path, 'rb') as source:
        data_set = pickle.load(source)

    # Training and evaluation deliberately share the same samples.
    train_x = test_x = data_set['data']
    train_y = test_y = data_set['target']

    dbn = DBN([-1, 300, -1],
              learn_rates=0.3,
              learn_rate_decays=0.9,
              epochs=60,
              verbose=1)
    dbn.fit(train_x, train_y)

    joblib.dump(dbn, 'digit_model.pkl', compress=9)

    # Predict on the evaluation samples and print the per-class report.
    preds = dbn.predict(test_x)
    print(classification_report(test_y, preds))
예제 #4
0
def main():
    """Read the train/valid/test sets, fit a DBN, report on it and pickle it.

    The fitted model is written to ``dbn.pickle`` in the working directory.
    """
    data_id = 'B'
    data_path = '/broad/compbio/maxwshen/data/1-MAKETRAINTEST/complete/golf/'

    print('train... %s' % (datetime.datetime.now(),))
    train_set = readin(data_id, 'train', data_path)
    print('valid... %s' % (datetime.datetime.now(),))
    valid_set = readin(data_id, 'valid', data_path)
    print('test... %s' % (datetime.datetime.now(),))
    test_set = readin(data_id, 'test', data_path)

    # NOTE(review): xtrain, dat_train, y_train, dat_test and y_test are not
    # defined in this function, and train_set / valid_set / test_set are never
    # used.  They presumably should be derived from the readin() results --
    # confirm against readin()'s return value.

    # Input layer -> 300 hidden nodes -> 2 output nodes.
    dbn = DBN(
        [xtrain.shape[1], 300, 2],
        learn_rates=5,
        learn_rate_decays=0.9,
        epochs=31,
        verbose=1)
    dbn.fit(dat_train, y_train)

    preds = dbn.predict(dat_test)
    print(classification_report(y_test, preds))

    # pickle needs the open file object (not its name) and a binary stream.
    out_fn = 'dbn.pickle'
    with open(out_fn, 'wb') as f:
        pickle.dump(dbn, f)
예제 #5
0
파일: nn.py 프로젝트: larisahax/Dialect
def train(X, Y, alphabet):
    """Fit and return a DBN whose output layer has one node per alphabet entry.

    Args:
        X: Training samples passed to ``DBN.fit``.
        Y: Training targets passed to ``DBN.fit``.
        alphabet: Sized collection; ``len(alphabet)`` is the output width.

    Returns:
        The fitted ``DBN`` instance.
    """
    # 13 inputs, one hidden layer of 1000 nodes.
    layer_sizes = [13, 1000, len(alphabet)]
    model = DBN(layer_sizes,
                learn_rates=0.3,
                learn_rate_decays=0.9,
                epochs=10,
                verbose=1)
    model.fit(X, Y)
    return model
예제 #6
0
def run():
    """Augment the training data, fit a DBN and write test predictions to CSV."""
    features, targets = load_training_data()

    # Augmentation: rotated copies first, then nudged copies.
    features, targets = rotate_dataset(features, targets, 8)
    features, targets = nudge_dataset(features, targets)

    # Input width comes from the data; 8000 hidden nodes; 10 output classes.
    layer_sizes = [features.shape[1], 8000, 10]
    classifier = DBN(layer_sizes,
                     learn_rates=0.4,
                     learn_rate_decays=0.9,
                     epochs=75,
                     verbose=1)
    classifier.fit(features, targets)

    test_data = get_test_data_set()
    write_predictions_to_csv(classifier.predict(test_data))
예제 #7
0
파일: clf.py 프로젝트: BellyWong/redigit
    def __init__(self):
        """Load ``data/train.csv`` and fit ``self.clf`` on a small sample of it.

        Column 0 of the CSV is used as the target and the remaining columns,
        divided by 255, as the features.  Only 0.5% of the rows are kept for
        fitting (``test_size=0.995``).
        """
        # Other classifiers that were tried in place of the DBN:
        #   RandomForestClassifier(), SGDClassifier(), LinearSVC(),
        #   KNeighborsClassifier(n_neighbors=13)

        train = pd.read_csv("data/train.csv")
        # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
        # equivalent that works on old and new versions alike.
        data_train = train.values
        values_train = data_train[:, 0]
        images_train = data_train[:, 1:]
        trainX, _trainX, trainY, _trainY = train_test_split(
            images_train / 255., values_train, test_size=0.995)

        # Neural network: input width from the data, 300 hidden, 10 outputs.
        self.clf = DBN([trainX.shape[1], 300, 10],
                       learn_rates=0.3,
                       learn_rate_decays=0.9,
                       epochs=10,
                       verbose=1)

        # Training
        self.clf.fit(trainX, trainY)
예제 #8
0
def train_clf(dim, X, y, classificator):
    """Build the requested classifier, fit it on ``(X, y)`` and return it.

    Args:
        dim: Layer sizes for the DBN; ``dim[2]`` is reported as the number of
            classes.
        X: Training samples.
        y: Training targets.
        classificator: ``"DBN"`` or ``"GaussianNB"``.

    Returns:
        The fitted classifier.

    Raises:
        ValueError: If ``classificator`` is not one of the supported names.
    """
    print("Training for {} classes".format(dim[2]))
    if classificator == "DBN":
        # Hyper-parameters come from the module-level dbn_* settings.
        clf = DBN(dim,
                  learn_rates=dbn_learn_rates,
                  learn_rate_decays=dbn_learn_rate_decays,
                  epochs=dbn_epochs,
                  minibatch_size=dbn_minibatch_size,
                  verbose=dbn_verbose,
                  dropouts=dbn_dropouts)
    elif classificator == "GaussianNB":
        clf = GaussianNB()
    else:
        # Previously this fell through and failed with an UnboundLocalError.
        raise ValueError(
            "Unknown classificator: {!r}".format(classificator))

    clf.fit(X, y)

    return clf
예제 #9
0
def train_dbn_dataset(dataset, x_test, y_test, alpha, nhidden, epochs, batch_size, noises=None):
    """Fit a single-hidden-layer DBN and evaluate it on the given test data.

    Args:
        dataset: ``(x_train, y_train)`` pair.
        x_test: Test samples.
        y_test: Test targets; the number of distinct values sets the output
            layer width.
        alpha: Learning rate.
        nhidden: Width of the hidden layer.
        epochs: Number of training epochs.
        batch_size: Minibatch size.
        noises: Forwarded to ``DBN`` as ``noises``; defaults to an empty list.

    Returns:
        ``(y_pred, auc)``: predictions for ``x_test`` and their ROC AUC score
        against ``y_test``.
    """
    from nolearn.dbn import DBN
    # roc_auc_score is used below, so it must be imported here as well.
    from sklearn.metrics import classification_report, roc_auc_score

    if noises is None:
        noises = []

    num_classes = len(set(y_test))
    print("Number of classes %s" % (num_classes,))
    x_train, y_train = dataset
    dbn_model = DBN([x_train.shape[1], nhidden, num_classes],
                    learn_rates=alpha,
                    learn_rate_decays=0.9,
                    epochs=epochs,
                    verbose=1,
                    nesterov=False,
                    minibatch_size=batch_size,
                    noises=noises)

    dbn_model.fit(x_train, y_train)

    y_pred = dbn_model.predict(x_test)
    print(classification_report(y_test, y_pred))  # per-class report
    # Compute the score once and reuse it for printing and returning.
    auc = roc_auc_score(y_test, y_pred)
    print(auc)
    return y_pred, auc
예제 #10
0
def dbn_clf(X, y, hidden_sizes=(300,), num_epochs=10):
    """Train a deep belief network on a 75/25 split and score it by log loss.

    Args:
        X: Samples.
        y: Targets.
        hidden_sizes: Sequence of hidden-layer widths (any list or tuple).
        num_epochs: Number of training epochs.

    Returns:
        ``(dbn, score)``: the fitted network and its log loss on the held-out
        25% of the data.
    """
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=0)
    # The output width is the number of entries in the saved column-name file.
    output_categories = np.load(os.path.join(loaddir, 'submit_col_name.npy'))

    print('Start training Neural Network...')

    # list() lets callers pass a tuple as well as a list.
    layer_sizes = [Xtrain.shape[1]] + list(hidden_sizes) + [len(output_categories)]
    dbn = DBN(
        layer_sizes,
        learn_rates=0.3,
        learn_rate_decays=0.9,
        epochs=num_epochs,
        verbose=1)
    dbn.fit(Xtrain, ytrain)

    ypred = dbn.predict_proba(Xtest)
    score = log_loss(ytest, ypred)
    print('Log loss = {}'.format(score))

    return dbn, score
예제 #11
0
	 def test(self):
                 """Fit a DBN on a split of the instance data and print evaluation metrics.

                 Uses ``self.dataMat`` as samples and ``self.labelMat`` as targets,
                 holding out 60% of them for evaluation.
                 """
                 #iris = datasets.load_iris()
                 #X, y = iris.data, iris.target
                 X, y = self.dataMat,self.labelMat
                 # 40% train / 60% test, with a fixed seed for repeatability.
                 X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.6, random_state=12)
                 #clf = RandomForestClassifier(max_depth=6,min_samples_split=9,min_samples_leaf=15,n_estimators=5)
                 #clf = DBN([X.shape[1], 24, 2],scales=0.5,learn_rates=0.02,learn_rate_decays = 0.95, learn_rate_minimums =0.001,epochs=500,l2_costs = 0.02*0.031, dropouts=0.2,verbose=0)
                 #cvnum = ShuffleSplit(2013,n_iter=10,test_size=0.6,train_size=0.4,random_state=0)
                 # NOTE(review): `scal` is only printed; the DBN below is always built
                 # with scales=0.5. Presumably scales=scal was intended -- confirm.
                 for scal in arange(4.5, 5.0, 0.5):
                     print "**************************************************************"
                     print "DBN scal=",scal
                     clf = DBN([X.shape[1], 24,48, 2],scales=0.5,learn_rates=0.01,learn_rate_decays = 0.95, learn_rate_minimums =0.001,epochs=50,l2_costs = 0.02*0.001, dropouts=0.0,verbose=0)
                     clf.fit(X_train, y_train);
                     # 3-fold cross-validation over the full data set, scored by ROC AUC.
                     scores = cross_val_score(clf,X,y,cv=3,scoring='roc_auc')
                     y_pred = clf.predict(X_test);
                     y_predprob = clf.predict_proba(X_test);
                     # NOTE(review): prf is computed but never used.
                     prf=precision_recall_fscore_support(y_test, y_pred, average='binary')
                     # The values labelled "Accuracy" here are the ROC AUC cross-validation scores.
                     print ("Accuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))
                     print  classification_report(y_test,y_pred)
                     print 'The accuracy is: ', accuracy_score(y_test,y_pred)
                     print 'The log loss is:', log_loss(y_test, y_predprob)
                     # Column 1 of the predicted probabilities is passed as the score.
                     print 'The ROC score is:', roc_auc_score(y_test,y_predprob[:,1])
예제 #12
0
파일: dbn.py 프로젝트: maxwshen/Kellis
def main():
    """Load the tab-delimited data, fit a DBN, print a report and pickle the model.

    The fitted model is written to ``dbn.pickle`` in the working directory.
    """
    data_fn = "/home/ec2-user/Kellis/data/bravo.formatted/dat.all.txt"
    blacklist_fn = "/home/ec2-user/Kellis/data/bravo.formatted/dat.blacklist.txt"
    y_fn = "/home/ec2-user/Kellis/data/bravo.formatted/dat.y.txt"

    data = read_delimited_txt(data_fn, "\t")
    # NOTE(review): the blacklist is read but never used below.
    blacklist = read_delimited_txt(blacklist_fn, "\t")
    y = read_delimited_txt(y_fn, "\t")

    # Skip the header row and drop the first field (the row number) of each
    # remaining row.
    data = [row[1:] for row in data[1:]]

    # Skip the header row and keep only the last field of each row.
    y = convert_y_binary([row[-1] for row in y[1:]])

    # Normalizes column-wise so all values are between 0 and 1
    data = normalize_0_1(data)

    # Split into training, testing
    xtrain, xtest, ytrain, ytest = train_test_split(data, y, test_size=0.2, random_state=1)

    # Input to 300 node RBM to 2 node output
    dbn = DBN([xtrain.shape[1], 300, 2], learn_rates=5, learn_rate_decays=0.9, epochs=501, verbose=1)
    dbn.fit(xtrain, ytrain)

    preds = dbn.predict(xtest)
    print(classification_report(ytest, preds))

    # pickle needs the open file object (not its name) and a binary stream.
    out_fn = "dbn.pickle"
    with open(out_fn, "wb") as f:
        pickle.dump(dbn, f)
예제 #13
0
def runOfflineML(y, X, classifiers, savemodel=False):
    """Train and evaluate every classifier on one 80/20 split of ``(X, y)``.

    Args:
        y: Targets; cast with ``astype("int0")`` before splitting.
        X: Samples.
        classifiers: Mapping of name -> estimator.  The entry named ``"DBN"``
            is not used as given; a fresh ``DBN`` is built instead.
        savemodel: When true, each fitted model is pickled to
            ``<name>.dat`` and loaded back before evaluation.

    Returns:
        The statistics mapping from ``initClsStats``, filled per classifier
        with sample counts, timings, accuracy, confusion matrix, precision,
        recall, f-score and support.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y.astype("int0"), test_size=0.20, random_state=0)
    data = dict(x_train=X_train, x_test=X_test, y_train=y_train, y_test=y_test)
    cls_stats = initClsStats(classifiers)
    for cls_name, estimator in classifiers.items():
        stats = cls_stats[cls_name]
        stats["n_train"] = data["x_train"].shape[0]
        stats["n_test"] = data["x_test"].shape[0]
        stats["n_features"] = data["x_train"].shape[1]

        tick = time.time()
        if cls_name == "DBN":
            # NOTE(review): `data` is rebound here, so every classifier that
            # is iterated after "DBN" also sees the normalised data.
            data = dataNormalise(data)
            clf = DBN([data["x_train"].shape[1], 300, 2], learn_rates=0.3, learn_rate_decays=0.9, epochs=10, verbose=1)
            clf.fit(data["x_train"], data["y_train"])
        else:
            clf = estimator.fit(data["x_train"], data["y_train"])
        if savemodel:
            # Pickle streams are binary; use context managers so the handles
            # are closed before the file is read back.
            model_path = cls_name + ".dat"
            with open(model_path, "wb") as fh:
                pickle.dump(clf, fh)
            with open(model_path, "rb") as fh:
                clf = pickle.load(fh)
        stats["training_time"] += time.time() - tick

        # Check the accuracy on the held-out test set.
        tick = time.time()
        predicted = clf.predict(data["x_test"])
        stats["testing_time"] += time.time() - tick

        acc = metrics.accuracy_score(data["y_test"], predicted)
        stats["accuracy"] = acc
        print("%s accuracy is: %s" % (cls_name, acc))

        stats["conf_matrix"] = metrics.confusion_matrix(data["y_test"], predicted)

        precision, recall, fscore, support = metrics.precision_recall_fscore_support(data["y_test"], predicted)
        stats["precision"] = precision
        stats["recall"] = recall
        stats["fscore"] = fscore
        stats["support"] = support
    return cls_stats
예제 #14
0
    def fit(self, X, y, X_pretrain=None):
        """Build a DBN with a linear output layer and fit it to ``(X, y)``.

        Every instance attribute except ``n_units`` and ``n_hidden_layers``
        is forwarded to ``DBN`` as a keyword argument.

        Args:
            X: Training samples; ``X.shape[1]`` sets the input-layer width.
            y: Targets.  A 1-D array is treated as a single output column.
            X_pretrain: Forwarded to ``DBN.fit``.
        """
        from nolearn.dbn import DBN

        if y.ndim == 2:
            n_outputs = y.shape[1]
        else:
            y = y[:, np.newaxis]
            n_outputs = 1

        params = dict(self.__dict__)
        # A previous fit() left the trained network in self.dbn; it is not a
        # DBN constructor argument, so forwarding it would break refitting.
        params.pop('dbn', None)
        from gdbn.activationFunctions import Linear
        params['output_act_funct'] = Linear()

        n_units = params.pop('n_units')
        n_hidden_layers = params.pop('n_hidden_layers')
        if isinstance(n_units, int):
            # One width shared by every hidden layer.
            units = [n_units] * n_hidden_layers
        else:
            # Explicit per-layer widths; list() also accepts a tuple.
            units = list(n_units)
        units = [X.shape[1]] + units + [n_outputs]
        self.dbn = DBN(units, **params)
        print(X.shape)
        self.dbn.fit(X, y, X_pretrain=X_pretrain)
예제 #15
0
class DBNRegressor(BaseEstimator, RegressorMixin):
    """Regressor that wraps ``nolearn.dbn.DBN`` with a linear output layer.

    ``n_hidden_layers`` and ``n_units`` describe the hidden topology; every
    other constructor argument is stored on the instance and forwarded to
    ``DBN`` when ``fit`` is called.
    """

    def __init__(self, n_hidden_layers=2, n_units=1000, epochs=100,
                 epochs_pretrain=0, scales=0.05,
                 real_valued_vis=True,
                 use_re_lu=False,
                 uniforms=False,
                 learn_rates_pretrain=0.1,
                 learn_rates=0.1,
                 learn_rate_decays=1.0,
                 learn_rate_minimums=0.0,
                 momentum=0.9,
                 momentum_pretrain=0.9,
                 l2_costs=0.0001,
                 l2_costs_pretrain=0.0001,
                 dropouts=None,
                 minibatch_size=64,
                 verbose=2,
                 fine_tune_callback=None,
                 nest_compare=True,
                 nest_compare_pretrain=None,
                 fan_outs=None,
                 nesterov=False,
                 ):
        """Store the hyper-parameters.

        When ``dropouts`` is ``None`` it defaults to 0.2 followed by 0.5 for
        each hidden layer.
        """
        self.n_hidden_layers = n_hidden_layers
        self.n_units = n_units
        self.epochs = epochs
        self.epochs_pretrain = epochs_pretrain
        self.learn_rates_pretrain = learn_rates_pretrain
        self.learn_rates = learn_rates
        self.learn_rate_decays = learn_rate_decays
        self.learn_rate_minimums = learn_rate_minimums
        self.l2_costs_pretrain = l2_costs_pretrain
        self.l2_costs = l2_costs
        self.momentum = momentum
        self.momentum_pretrain = momentum_pretrain
        self.verbose = verbose
        self.real_valued_vis = real_valued_vis
        self.use_re_lu = use_re_lu
        # Previously accepted but never stored, so the argument was silently
        # ignored and missing from the instance attributes.
        self.uniforms = uniforms
        self.scales = scales
        self.minibatch_size = minibatch_size
        # NOTE(review): the default is resolved here, so it does not follow a
        # later change of n_hidden_layers.
        if dropouts is None:
            dropouts = [0.2] + [0.5] * n_hidden_layers
        self.dropouts = dropouts
        self.fine_tune_callback = fine_tune_callback
        self.nest_compare = nest_compare
        self.nest_compare_pretrain = nest_compare_pretrain
        self.fan_outs = fan_outs
        self.nesterov = nesterov

    def fit(self, X, y, X_pretrain=None):
        """Build a DBN with a linear output layer and fit it to ``(X, y)``.

        Args:
            X: Training samples; ``X.shape[1]`` sets the input-layer width.
            y: Targets.  A 1-D array is treated as a single output column.
            X_pretrain: Forwarded to ``DBN.fit``.
        """
        from nolearn.dbn import DBN

        if y.ndim == 2:
            n_outputs = y.shape[1]
        else:
            y = y[:, np.newaxis]
            n_outputs = 1

        params = dict(self.__dict__)
        # A previous fit() left the trained network in self.dbn; it is not a
        # DBN constructor argument, so forwarding it would break refitting.
        params.pop('dbn', None)
        from gdbn.activationFunctions import Linear
        params['output_act_funct'] = Linear()

        n_units = params.pop('n_units')
        n_hidden_layers = params.pop('n_hidden_layers')
        if isinstance(n_units, int):
            # One width shared by every hidden layer.
            units = [n_units] * n_hidden_layers
        else:
            # Explicit per-layer widths; list() also accepts a tuple.
            units = list(n_units)
        units = [X.shape[1]] + units + [n_outputs]
        self.dbn = DBN(units, **params)
        print(X.shape)
        self.dbn.fit(X, y, X_pretrain=X_pretrain)

    def predict(self, X):
        """Return ``self.dbn.chunked_decision_function(X)`` for the fitted network."""
        return self.dbn.chunked_decision_function(X)
                                 :class:`~gdbn.activationFunctions.Sigmoid`,
                                 :class:`~.gdbn.activationFunctions.Linear`,
                                 :class:`~.gdbn.activationFunctions.Softmax`
                                 from the
                                 :mod:`gdbn.activationFunctions`
                                 module.  Defaults to
                                 :class:`~.gdbn.activationFunctions.Softmax`.
"""

dbn = DBN(
    layer_sizes = [trainX.shape[1], 800, 10], # trainX.shape[1] is the input layer, 10 is the output layer,
                                            # 800 is the hidden layer.
                                            # Use -1 as the last layer if the number of labels
                                            # is not known.
    # NOTE(review): other call sites pass an activation-function instance
    # (e.g. Sigmoid()); confirm that a plain string is accepted here.
    output_act_funct = "Softmax",
    dropouts = 0.5,
    use_re_lu=True,
    l2_costs=0.0001,
    learn_rates = 0.3,
    learn_rate_decays = 0.9,
    epochs = 10,
    loss_funct = None, # if not specified, the built-in default is the percentage of wrong labels
    verbose = 1)


##### Below is a trick for replacing the score function used to evaluate accuracy. The original
    # program offers no option other than the plain percentage of correct outputs,
    # so here one may supply a scoring function of one's own.
import new

def _score(self, X, y):
    # Replacement score function meant to be attached to a DBN instance.
    # NOTE(review): the body appears to be truncated -- `outputs` is computed
    # but never used, `y` is never read, and nothing is returned.
    outputs = self.predict_proba(X)
def do_operation_(X_train,X_test,y_train,y_test,l_r,d_r):
    """Fit a DBN with the given learning rate and decay, and count correct predictions.

    Args:
        X_train, y_train: Training samples and targets.
        X_test, y_test: Evaluation samples and targets.
        l_r: Learning rate.
        d_r: Learning-rate decay.

    Returns:
        ``(result, l_r, d_r)`` where ``result`` is the number of test
        predictions equal to ``y_test``.
    """
    n_inputs = np.shape(X_train)[1]
    clf = DBN([n_inputs, 300, 10],
              learn_rates=l_r,
              learn_rate_decays=d_r,
              epochs=30,
              verbose=1)
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_test)
    n_correct = np.sum(y_test == predictions)
    return (n_correct, l_r, d_r)
예제 #18
0
def main():
    """Pick a DBN learning rate over K folds, then train and report a final model.

    Each fold is trained with a different learning rate taken from an evenly
    spaced grid; the rate of the best-scoring fold is used to train the final
    model on the whole training split, which is then evaluated on the
    held-out 10%.
    """

    from sklearn.cross_validation import KFold

    set_verbosity(3)

    overlap_df = get_data("./vectors/google_overlap.csv")
    #overlap_df = get_data("./vectors/freebase_overlap.csv")

    # Drop the rows carrying any of these NER tags in one pass.
    excluded_tags = ['O',
                     'I-FAC', 'B-FAC',
                     'I-LOC', 'B-LOC',
                     'I-WEA', 'B-WEA',
                     'I-VEH', 'B-VEH',
                     'I-TTL', 'B-TTL']
    overlap_df = overlap_df[~overlap_df.NER.isin(excluded_tags)]
    #overlap_df = overlap_df.groupby("NER").filter(lambda x: len(x) > 50)

    label_map, labels = map_labels(overlap_df)
    X, y = parse_data(overlap_df, label_map)
    trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.10)

    n_folds, scores = 20, []
    logging.info("Beginning Cross Validation with " + str(n_folds) + " folds")

    kf = KFold(len(trainX), n_folds=n_folds)
    # One candidate learning rate per fold.
    lrs = list(np.linspace(0.1, 0.4, num=n_folds))
    for fold_index, (train, test) in enumerate(kf):
        lr = float(lrs[fold_index])
        logging.debug("TRAIN:" + str(len(train)) + " TEST:" + str(len(test)))
        trainX_fold, validX_fold = trainX[train], trainX[test]
        trainY_fold, validY_fold = trainY[train], trainY[test]

        google_topology = [trainX_fold.shape[1], 300, 200, 100, len(labels)]
        #freebase_topology = [trainX_fold.shape[1], 750, 500, 250, len(labels)]

        dbn = DBN(
            #freebase_topology,
            google_topology,
            learn_rates=lr,
            learn_rate_decays=0.9,
            epochs=50,
            verbose=0)

        dbn.fit(trainX_fold, trainY_fold)
        score = dbn.score(validX_fold, validY_fold)
        scores.append((score, lr))

        percent_done = float(fold_index + 1) / float(n_folds) * 100
        logging.info(
            "Learning rate: " + str(lr) + " score:" +
            str(score) + " " + str(percent_done) + "% done")

    best_score, best_lr = max(scores, key=lambda x: x[0])
    # The message used to print the learning rate under the "score" label.
    logging.info("Best CV score: " + str(best_score) +
                 " (learning rate: " + str(best_lr) + ")")

    google_topology = [trainX.shape[1], 300, 200, 100, len(labels)]
    #freebase_topology = [trainX.shape[1], 750, 500, 250, len(labels)]

    dbn = DBN(
        #freebase_topology,
        google_topology,
        learn_rates=best_lr,
        learn_rate_decays=0.9,
        epochs=100,
        verbose=1)

    dbn.fit(trainX, trainY)

    preds = dbn.predict(testX)
    print(classification_report(testY, preds))


    #model_and_data = (dbn, label_map)
    #dump_model(model_and_data, './google_model.pkl')

    #'''

    '''
예제 #19
0
import numpy as np
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn import preprocessing
from skimage.feature import hog
from nolearn.dbn import DBN
import timeit

# `pd` is used below but is not brought in by the imports preceding this script.
import pandas as pd

# Column 0 of the CSV is read as 'label'; every other column is a feature.
train = pd.read_csv("../train.csv")
features = train.columns[1:]
X = train[features]
y = train['label']
# Features are divided by 255 before a 90/10 train/test split.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X / 255., y, test_size=0.1, random_state=0)

clf_nn = DBN([X_train.shape[1], 300, 10],
             learn_rates=0.3,
             learn_rate_decays=0.9,
             epochs=15)
# DataFrame/Series.as_matrix() was removed in pandas 1.0; .values is the
# equivalent that works on old and new versions alike.
training_x = X_train.values
training_y = y_train.values
testing_x = X_test.values
testing_y = y_test.values
clf_nn.fit(training_x, training_y)
acc_nn = clf_nn.score(testing_x, testing_y)
print("neural network accuracy:  %s" % (acc_nn,))
# Split the data, then pre-train and fine-tune a four-layer DBN with a
# sigmoid output activation.
x_train, x_test, y_train, y_test, x, y = train_test_prep()

# Earlier configuration, kept for reference:
#   DBN([x_train.shape[1], 1500, 1500, 2], learn_rates=0.01,
#       learn_rate_decays=0.9, epochs=1000, verbose=3)
# Options that were tried and are currently disabled: dropouts=0.01,
# minibatch_size=9, learn_rate_decays=0.9, learn_rate_minimums=0.0001,
# momentum, real_valued_vis=True, use_re_lu=True.
dbn_model = DBN(
    [x_train.shape[1], 5000, 2500, 1250, 500],
    output_act_funct=activationFunctions.Sigmoid(),
    learn_rates=0.01,
    learn_rates_pretrain=0.001,
    epochs_pretrain=500,
    epochs=500,
    verbose=2)

dbn_model.fit(x_train, y_train)

# Predict on the test split and print the report followed by the raw
# targets and predictions.
y_true = y_test
y_pred = dbn_model.predict(x_test)
print(classification_report(y_true, y_pred))
print(y_true)
print(y_pred)
def main(input_file):
    """Train several classifiers on the preprocessed images and print their accuracy.

    Loads ``trainPreproc_32_32.npy`` and ``labelsPreproc.npy`` from
    ``input_file``, holds out 25% of the samples (stratified by class) for
    validation, and reports the validation accuracy of each classifier.

    Args:
        input_file: Directory containing the preprocessed ``.npy`` files.
    """
    nb_classes = 62  # A-Z, a-z and 0-9

    # Input image dimensions
    img_rows, img_cols = 32, 32

    # Path of data files
    path = input_file

    def convert_(Y):
        """Return, row by row, the index of every entry of ``Y`` equal to 1."""
        return np.asarray([i
                           for row in Y
                           for i in range(0, row.shape[0])
                           if row[i] == 1])

    # Load the preprocessed data and labels
    X_train_all = np.load(path + "/trainPreproc_" + str(img_rows) + "_" +
                          str(img_cols) + ".npy")
    Y_train_all = np.load(path + "/labelsPreproc.npy")

    X_train, X_val, Y_train, Y_val = \
        train_test_split(X_train_all, Y_train_all, test_size=0.25,
                         stratify=np.argmax(Y_train_all, axis=1))

    print(X_train.shape)

    labels = convert_(Y_train)
    validation = convert_(Y_val)

    # Flatten each image: axes 2 and 3 are merged into one feature axis.
    X_train = X_train.reshape(
        (X_train.shape[0], X_train.shape[2] * X_train.shape[3]))
    X_val = X_val.reshape((X_val.shape[0], X_val.shape[2] * X_val.shape[3]))

    def report_predicted(title, clf):
        """Fit ``clf`` and print ``accuracy_score`` of its validation predictions."""
        clf.fit(X_train, labels)
        accuracy = accuracy_score(validation, clf.predict(X_val))
        print("%s %s" % (title, accuracy))

    def report_scored(title, clf):
        """Fit ``clf`` and print its own ``score`` on the validation data."""
        clf.fit(X_train, labels)
        print("%s %s" % (title, clf.score(X_val, validation)))

    print('Training and Testing...')
    report_predicted("random forest accuracy: ", RandomForestClassifier())
    report_predicted("stochastic gradient descent accuracy: ", SGDClassifier())
    report_predicted("Linear SVM accuracy: ", LinearSVC())
    report_predicted("nearest neighbors accuracy: ", KNeighborsClassifier())

    report_scored("neural network accuracy: ",
                  DBN([X_train.shape[1], 300, nb_classes],
                      learn_rates=0.0240,
                      learn_rate_decays=0.9,
                      epochs=130))

    report_scored("naive bayes: ",
                  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))

    report_scored("bernulli naive bayes: ",
                  BernoulliNB(alpha=1.0,
                              binarize=0.0,
                              class_prior=None,
                              fit_prior=True))
def rotate_dataset(X):
    """Return a copy of ``X`` in which every row, viewed as a 28x28 image, is rotated.

    Each row gets its own angle drawn with ``np.random.randint(-7, 7)``; the
    rotated image keeps its 28x28 shape and is flattened back into the row.

    Args:
        X: 2-D array whose rows can each be reshaped to 28x28.

    Returns:
        A new array of zeros-initialised storage with the same shape as
        ``X``, filled with the rotated rows.
    """
    rotated = np.zeros(X.shape)
    n_rows = X.shape[0]
    for row_idx in range(n_rows):
        degrees = np.random.randint(-7, 7)
        image = np.reshape(X[row_idx, :], (28, 28))
        # reshape=False keeps the output the same size as the input image.
        turned = nd.rotate(image, degrees, reshape=False)
        rotated[row_idx, :] = turned.ravel()
    return rotated


# Load Data
mnist = pd.read_csv("data_5k.csv")
#mnist = mnist[:50]
y_train = mnist['label'].values
# Every column from 'pixel0' onwards is a feature.
X_train = mnist.loc[:, 'pixel0':].values
X_test = pd.read_csv("test.csv").values
# Only the first 2000 test rows are predicted.
X_test = X_test[:2000]
# Divide by 255.0 and store as float32.
X_train = np.asarray(X_train / 255.0, 'float32')
X_test = np.asarray(X_test / 255.0, 'float32')
#X_train, y_train = nudge_dataset(X_train, y_train)
#X_train = rotate_dataset(X_train)
clf = DBN([X_train.shape[1], 350, 10],
          learn_rates=0.3,
          learn_rate_decays=0.95,
          learn_rates_pretrain=0.005,
          epochs=120,
          verbose=1)
clf.fit(X_train, y_train)
# The benchmark file is used as a template for the submission.
subm = pd.read_csv("rf_benchmark.csv")
# NOTE(review): this needs subm to have as many rows as the truncated X_test.
subm.Label = clf.predict(X_test)
# `col` is not a to_csv keyword (TypeError on current pandas) and
# index_label has no effect with index=False, so both are dropped.
subm.to_csv("result.csv", index=False)
예제 #23
0
def train(args):
    """Train one classifier per entry of ``clfChoices`` and pickle each of them.

    For every choice the labels and embeddings are read from
    ``<workDir>/labels.csv`` and ``<workDir>/reps.csv``, the classifier is
    fitted, and ``(label_encoder, classifier)`` is pickled to
    ``<workDir>/<choice>.pkl``.

    Args:
        args: Namespace providing ``workDir``, ``verbose`` and ``ldaDim``.

    Raises:
        ValueError: If an entry of ``clfChoices`` is not a known classifier.
    """
    start = time.time()
    for clfChoice in clfChoices:
        print("Loading embeddings.")
        fname = "{}/labels.csv".format(args.workDir)
        # .values replaces DataFrame.as_matrix(), removed in pandas 1.0.
        label_paths = pd.read_csv(fname, header=None).values[:, 1]
        # The label of an image is the name of the directory that holds it.
        # A real list is needed because it is consumed twice below.
        labels = [os.path.basename(os.path.dirname(p)) for p in label_paths]
        fname = "{}/reps.csv".format(args.workDir)
        embeddings = pd.read_csv(fname, header=None).values
        le = LabelEncoder().fit(labels)
        labelsNum = le.transform(labels)
        nClasses = len(le.classes_)
        print("Training for {} classes.".format(nClasses))

        if clfChoice == 'LinearSvm':
            clf = SVC(C=1, kernel='linear', probability=True)
        elif clfChoice == 'GMM':  # Doesn't work best
            clf = GMM(n_components=nClasses)

        # ref:
        # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
        elif clfChoice == 'RadialSvm':  # Radial Basis Function kernel
            # works better with C = 1 and gamma = 2
            clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
        elif clfChoice == 'DecisionTree':  # Doesn't work best
            clf = DecisionTreeClassifier(max_depth=20)
        elif clfChoice == 'GaussianNB':
            clf = GaussianNB()

        # ref: https://jessesw.com/Deep-Learning/
        elif clfChoice == 'DBN':
            verbose = 1 if args.verbose else 0
            clf = DBN(
                # input nodes, hidden nodes, output nodes.  The output width
                # is the class count, not "last label + 1", which is only
                # right when the last sample belongs to the highest class.
                [embeddings.shape[1], 500, nClasses],
                learn_rates=0.3,
                # Smaller steps mean a possibly more accurate result, but the
                # training will take longer
                learn_rate_decays=0.9,
                # a factor the initial learning rate will be multiplied by
                # after each iteration of the training
                epochs=300,  # number of iterations
                # dropouts = 0.25, # Express the percentage of nodes that
                # will be randomly dropped as a decimal.
                verbose=verbose)
        else:
            # Without this, an unknown choice silently reused the classifier
            # of the previous iteration (or failed with a NameError).
            raise ValueError(
                "Unknown classifier choice: {!r}".format(clfChoice))

        if args.ldaDim > 0:
            clf_final = clf
            clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                            ('clf', clf_final)])

        clf.fit(embeddings, labelsNum)

        fName = os.path.join(args.workDir, clfChoice + ".pkl")
        print("Saving classifier to '{}'".format(fName))
        # Pickle streams are binary.
        with open(fName, 'wb') as f:
            pickle.dump((le, clf), f)
    if args.verbose:
        print("Training and saving the classifiers took {} seconds.".format(
            time.time() - start))
예제 #24
0
def classifier(args, args_mode, dataset, sess):
    """Train a classifier on image embeddings, or evaluate a saved one.

    Embeddings for every image in ``dataset`` are computed by running the
    graph tensors ``input:0`` / ``embeddings:0`` / ``phase_train:0`` in
    ``sess``. Then, depending on ``args_mode``:

    * ``'TRAIN'``: fit the model selected by ``args.classifier`` and pickle
      ``(model, class_names)`` to ``args.classifier_filename``.
    * ``'CLASSIFY'``: load that pickle, predict class probabilities, print
      per-image results and summary statistics, and write a results CSV
      into ``args.data_dir``.

    Any other ``args_mode`` only computes the embeddings.

    Args:
        args: Namespace read for ``batch_size``, ``image_size``,
            ``classifier``, ``classifier_filename`` and ``data_dir``.
        args_mode: ``'TRAIN'`` or ``'CLASSIFY'``.
        dataset: Sequence of class objects exposing ``name`` and
            ``image_paths``.
        sess: Session used to run the embedding tensor.

    Raises:
        AssertionError: If a class in ``dataset`` has no images.
        ValueError: If ``args.classifier`` is not a supported name.
    """
    # Check that there is at least one training image per class.
    for cls in dataset:
        if len(cls.image_paths) < 1:
            print(cls.image_paths,"@@@@@@@@@@@@@@@@@@@@@@@")
        # The condition and message must NOT be wrapped in one pair of
        # parentheses: that builds a non-empty tuple, which is always true.
        assert len(cls.image_paths) > 0, 'There must be at least one image for each class in the dataset'

    paths, labels, class_labels = get_image_paths_and_labels(dataset)

    print('Number of classes: %d' % len(dataset))
    print('Number of images: %d' % len(paths))

    # Get input and output tensors
    graph = tf.get_default_graph()
    images_placeholder = graph.get_tensor_by_name("input:0")
    embeddings = graph.get_tensor_by_name("embeddings:0")
    phase_train_placeholder = graph.get_tensor_by_name("phase_train:0")
    embedding_size = embeddings.get_shape()[1]

    # Run forward pass to calculate embeddings, one batch at a time.
    print('Calculating features for images')
    nrof_images = len(paths)
    nrof_batches_per_epoch = int(math.ceil(1.0 * nrof_images / args.batch_size))
    emb_array = np.zeros((nrof_images, embedding_size))
    for i in range(nrof_batches_per_epoch):
        start_index = i * args.batch_size
        end_index = min((i + 1) * args.batch_size, nrof_images)
        paths_batch = paths[start_index:end_index]
        images = facenet.load_data(paths_batch, False, False, args.image_size)
        feed_dict = {images_placeholder: images, phase_train_placeholder: False}
        emb_array[start_index:end_index, :] = sess.run(embeddings, feed_dict=feed_dict)

    classifier_filename_exp = os.path.expanduser(args.classifier_filename)

    if args_mode == 'TRAIN':
        # Train classifier
        print('Training classifier+++++++++++++++++++++++++',args.classifier)
        # One class per dataset entry; used by the models that need the
        # class count up front.
        n_classes = len(dataset)
        if args.classifier == 'LinearSvm':
            model = SVC(kernel='linear', probability=True)
        elif args.classifier == 'GridSearchSvm':
            print("""
                            Warning: In our experiences, using a grid search over SVM hyper-parameters only
                            gives marginally better performance than a linear SVM with C=1 and
                            is not worth the extra computations of performing a grid search.
                            """)
            param_grid = [
                {'C': [1, 10, 100, 1000],
                 'kernel': ['linear']},
                {'C': [1, 10, 100, 1000],
                 'gamma': [0.001, 0.0001],
                 'kernel': ['rbf']}
            ]
            model = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
        elif args.classifier == 'GMM':  # Doesn't work best
            model = GMM(n_components=n_classes)

        # ref:
        # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
        elif args.classifier == 'RadialSvm':  # Radial Basis Function kernel
            # works better with C = 1 and gamma = 2
            model = SVC(C=1, kernel='rbf', probability=True, gamma=2)
        elif args.classifier == 'DecisionTree':  # Doesn't work best
            model = DecisionTreeClassifier(max_depth=20)
        elif args.classifier == 'GaussianNB':
            model = GaussianNB()

        # ref: https://jessesw.com/Deep-Learning/
        elif args.classifier == 'DBN':
            from nolearn.dbn import DBN
            # Layer sizes come from the computed embedding matrix and the
            # dataset, not from the graph tensor or an undefined label array.
            # NOTE(review): assumes `labels` are class indices in
            # 0..len(dataset)-1 -- confirm against get_image_paths_and_labels.
            model = DBN([emb_array.shape[1], 500, n_classes],  # i/p nodes, hidden nodes, o/p nodes
                        learn_rates=0.3,
                        # Smaller steps mean a possibly more accurate result, but the
                        # training will take longer
                        learn_rate_decays=0.9,
                        # a factor the initial learning rate will be multiplied by
                        # after each iteration of the training
                        epochs=300,  # number of iterations
                        verbose=1)
        elif args.classifier == 'KNN':
            model = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                                         metric_params=None, n_jobs=1, n_neighbors=5, p=2,
                                         weights='uniform')
        else:
            # Fail with a clear message instead of an unbound `model` later.
            raise ValueError('Unknown classifier: %r' % (args.classifier,))

        model.fit(emb_array, labels)

        # Create a list of class names
        class_names = [cls.name.replace('_', ' ') for cls in dataset]

        # Saving classifier model
        with open(classifier_filename_exp, 'wb') as outfile:
            pickle.dump((model, class_names), outfile)
        print('Saved classifier model to file "%s"' % classifier_filename_exp)

    elif args_mode == 'CLASSIFY':
        # Classify images
        print('Testing classifier~~~~~~~~~~~~~~~~~~~~~~~~')
        # SECURITY: unpickling executes arbitrary code; only load classifier
        # files that come from a trusted source.
        with open(classifier_filename_exp, 'rb') as infile:
            (model, class_names) = pickle.load(infile)
        print('Loaded classifier model from file "%s"' % classifier_filename_exp)
        correctPrediction = 0
        inCorrectPrediction = 0
        sumConfidence = 0.0
        correctConfidence = 0.0
        inCorrectConfidence = 0.0

        predictions = model.predict_proba(emb_array)
        best_class_indices = np.argmax(predictions, axis=1)
        best_class_probabilities = predictions[np.arange(len(best_class_indices)), best_class_indices]
        results = {'name': [], 'bestname': [], 'probabilities': []}
        for i, best_index in enumerate(best_class_indices):
            true_name = class_labels[i]
            best_name = class_names[best_index]
            probability = best_class_probabilities[i]
            print('%4d  %s:%s: %.3f' % (i, true_name, best_name, probability))
            results['name'].append(true_name)
            results['bestname'].append(best_name)
            results['probabilities'].append(probability)
            sumConfidence += probability
            if true_name == best_name:
                correctPrediction += 1
                correctConfidence += probability
            else:
                inCorrectPrediction += 1
                inCorrectConfidence += probability

        # Guard every average: a run with no correct (or no incorrect)
        # predictions would otherwise divide by zero.
        total = correctPrediction + inCorrectPrediction
        accuracy = float(correctPrediction) / total if total else 0.0
        Avg_Confidence = float(sumConfidence) / total if total else 0.0
        Avg_correctConfidence = (
            float(correctConfidence) / correctPrediction if correctPrediction else 0.0)
        Avg_inCorrectConfidence = (
            float(inCorrectConfidence) / inCorrectPrediction if inCorrectPrediction else 0.0)

        # The accuracy is appended as a trailing pseudo-row of the CSV.
        results['name'].append('Accuracy:')
        results['bestname'].append('Accuracy:')
        results['probabilities'].append(accuracy)
        dataname = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
        data_frame = pd.DataFrame(
            data={'name': results['name'], 'bestname': results['bestname'], 'probabilities': results['probabilities']})
        data_frame.to_csv(args.data_dir + '/results_' + dataname + '.csv')

        print("Correct Prediction :" + str(correctPrediction))
        print("In-correct Prediction: " + str(inCorrectPrediction))
        print('Accuracy: %.3f' % accuracy)
        print("Avg Confidence: " + str(Avg_Confidence))
        print("Avg CorrectConfidence: " + str(Avg_correctConfidence))
        print("Avg inCorrectConfidence: " + str(Avg_inCorrectConfidence))
예제 #25
0
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn import datasets
from nolearn.dbn import DBN
import numpy as np

# Script: fit a DBN on the "MNIST Original" dataset and print a
# classification report for the held-out split.
print("[INFO] downloading mnist")

dataset = datasets.fetch_mldata("MNIST Original")

# Scale the features by 1/255, cast targets to int and hold out 33% of the
# samples for testing.
(trainData, testData, trainLabels,
 testLabels) = train_test_split(dataset.data / 255.0,
                                dataset.target.astype("int"),
                                test_size=0.33)

# Layer sizes: one input unit per feature column, 300 hidden units,
# 10 output units.
dbn = DBN([trainData.shape[1], 300, 10],
          learn_rates=0.3,
          learn_rate_decays=0.9,
          epochs=10,
          verbose=1)
dbn.fit(trainData, trainLabels)

# Evaluate on the held-out split.
predictions = dbn.predict(testData)
print(classification_report(testLabels, predictions))
예제 #26
0
    print train_inputs.shape
    print test_inputs.shape
    sys.exit(0)

# Load the preprocessed training features and the training labels.
X = joblib.load('features/X_train_processed.pkl')
Y = np.load('blobs/Y_train.npy')
print("done loading")

# When True, train on all of X and write predictions for the submission set;
# otherwise only create a train/test split.
CREATE_SUBMISSION = False
if CREATE_SUBMISSION:
    X_submit = joblib.load('features/X_submit_processed.pkl')
    # NOTE(review): `select` is not defined in the visible part of this
    # snippet -- presumably a fitted feature selector; confirm.
    X = select.transform(X)
    X_submit = select.transform(X_submit)
    clf = DBN([X.shape[1], 280, 10],
              learn_rates=.1,
              learn_rate_decays=0.9,
              momentum=0.9,
              epochs=100,
              verbose=1)
    clf.fit(X, Y)
    print("done fit")
    pred = clf.predict(X_submit)
    # NOTE(review): the file is opened in binary mode but str values are
    # written; that only works on Python 2. The handle is also not closed if
    # a write raises.
    f = open('preds_nn_2304x280x10_100epochs_processed.csv', 'wb')
    f.write('Id,Prediction\n')
    # Ids in the output are 1-based.
    for i, p in enumerate(pred):
        f.write("%d,%d\n" % (i + 1, p))
    f.close()
else:
    # Hold out 10% of the data with a fixed seed.
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.1,
                                                        random_state=42)
예제 #27
0
from sklearn.metrics import classification_report

# Load MNIST, scale features by 1/255 as float32, cast targets to int32 and
# hold out one seventh of the samples with a fixed seed.
mnist = fetch_mldata('mnist-original')
X_train, X_test, y_train, y_test = train_test_split(
    (mnist.data / 255.0).astype(np.float32),
    mnist.target.astype(np.int32),
    test_size=1.0 / 7.0,
    random_state=1234)

# (name, estimator) pairs selected through command-line words.
classifiers = []

if 'dbn' in sys.argv:
    # Imported lazily so nolearn is only needed when 'dbn' is requested.
    from nolearn.dbn import DBN
    clf = DBN([X_train.shape[1], 300, 10],
              learn_rates=0.3,
              learn_rate_decays=0.9,
              epochs=10,
              verbose=1)
    classifiers.append(('nolearn.dbn', clf))

if 'sknn' in sys.argv:
    from sknn.mlp import Classifier, Layer, Convolution

    clf = Classifier(
        layers=[
            # Convolution("Rectifier", channels=10, pool_shape=(2,2), kernel_shape=(3, 3)),
            Layer('Rectifier', units=200),
            Layer('Softmax')
        ],
        learning_rate=0.01,
        learning_rule='momentum',
예제 #28
0
    for pipe in [pipeline_3]:
        model_generic_1ofK_clas(pipe,
                                model_name="rf_1",
                                model_f=sklearn_prob_model(
                                    RandomForestClassifier(n_estimators=200,
                                                           n_jobs=6)),
                                essays_paths=ALL_ESSAYS,
                                parallel=False)

    for pipe in [pipeline_8]:
        model_generic_DBN(pipeline=pipe,
                          model_name="nn_3",
                          model_f=DBN([-1, 800, -1],
                                      learn_rates=0.1,
                                      learn_rate_decays=0.9,
                                      use_re_lu=True,
                                      epochs=200,
                                      verbose=0,
                                      dropouts=[0.5, 0.1, 0.0],
                                      momentum=0.9),
                          essays_paths=ALL_ESSAYS)

    for pipe in PIPELINES:
        for ntrees in [50, 100, 200, 300]:
            xgbparam = {
                'max_depth': 6,
                'eta': 0.1,
                'silent': 1,
                'objective': 'binary:logistic',
                'nthread': 6
            }
            xgbparamser = serialize_dict(xgbparam) + "_ntrees=%d" % (ntrees)
예제 #29
0
    def train(self, impostor, negative_samples):
        """Train (or load from disk) the DBN for one impostor.

        The training set is ``self.train_X`` (labelled 1, the user) stacked
        on top of ``negative_samples`` (labelled 0, impostors). The fitted
        model is stored in ``self.dbns[impostor]`` and cached as a pickle
        whose name comes from ``self.gen_pickle_name``; if that file already
        exists it is loaded instead of training again.

        Args:
            impostor: String representation of the impostor.
            negative_samples: Impostor examples, labelled 0. This does not
                include examples from ``impostor`` itself.

        Raises:
            AssertionError: If ``self.normalize`` is set and normalization
                left the probed sample value unchanged.
        """
        train_X = np.copy(self.train_X)
        samples = np.vstack((train_X, negative_samples))
        # Labels: 1 - user, 0 - impostor, in the same order as `samples`.
        labels = np.array([1] * len(self.train_X) + [0] * len(negative_samples))

        if self.normalize:
            # Probe one value to check that _normalize changed `samples`
            # in place.
            sample = samples[0][1]
            self._normalize(samples)
            assert samples[0][1] != sample, "normalization not done"

        # FIXME: Adds some randomness into the training process - should we
        # avoid this?
        # samples, labels = unison_shuffled_copies(samples, labels)

        file_name = self.gen_pickle_name(impostor, str(len(negative_samples)))

        # Training is expensive so let us save and load it if possible.

        # FIXME Update this - just want to make sure we are training new guy
        # everytime for now...
        if os.path.isfile(file_name):
            # SECURITY: unpickling executes arbitrary code; the cache file
            # must come from a trusted location.
            with open(file_name, 'rb') as handle:
                self.dbns[impostor] = pickle.load(handle)
                print("loaded file ", file_name, "from disk")

        else:
            # let us train this guy and then save it.
            self.dbns[impostor] = DBN(
                self.shape,
                learn_rates=self.learn_rates,
                epochs=self.epochs,
                learn_rates_pretrain=self.learn_rates_pretrain,
                epochs_pretrain=self.epochs_pretrain,
                learn_rate_decays=self.learn_rate_decays,
                verbose=self.verbose,
                minibatches_per_epoch=self.minibatches_per_epoch,
                loss_funct=self.loss_funct)

            self.dbns[impostor].fit(samples, labels)

            # Pickle data is binary (HIGHEST_PROTOCOL), so the file must be
            # opened in binary mode; text mode fails on Python 3 and can
            # corrupt the stream on platforms that translate newlines.
            with open(file_name, 'wb') as handle:
                pickle.dump(self.dbns[impostor],
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
                print("saved ", file_name, "on disk")
# Script fragment (Python 2 print statements): fit a DBN and pickle it.
# NOTE(review): trainX keeps rows 0:500 while trainY keeps labels 0:20000;
# unless `dataset` is smaller than that, the two lengths will not match --
# confirm the intended slice.
trainX = dataset['data'].T[0:500, 0:] / 255.0
trainY = dataset['label'][0][0:20000]

# The "test" arrays are the full dataset, so they include the training rows.
testX = dataset['data'].T[0:, 0:] / 255.0
testY = dataset['label'][0][0:]
# train the Deep Belief Network with 784 input units (the flattened,
# 28x28 grayscale image), 300 hidden units, 10 output units (one for
# each possible output classification, which are the digits 1-10)
# NOTE(review): the comment above does not match the call below, which uses
# trainX.shape[1] inputs, 500 hidden units and 3 output units.
print trainX.shape, trainY.shape
# trainY = np.array(range(2000))
# print type(trainY), trainY
# exit()
print "shape[0]", trainX.shape[1]
dbn = DBN([trainX.shape[1], 500, 3],
          learn_rates=0.1,
          learn_rate_decays=0.9,
          epochs=2500,
          verbose=1)
dbn.fit(trainX, trainY)
# Persist the fitted network with the highest pickle protocol.
with open('data.pkl', 'wb') as output:
    pickle.dump(dbn, output, pickle.HIGHEST_PROTOCOL)

# # compute the predictions for the test data and show a classification
# # report
# preds = dbn.predict(testX)
# print classification_report(testY, preds)

# randomly select a few of the test instances
# for i in np.random.choice(np.arange(0, len(testY)), size = (500,)):
# 	# classify the digit
# 	print testX[i].shape
예제 #31
0
    def train(self, workDir, classifier='LinearSvm'):
        """Train a classifier on stored embeddings and pickle it.

        Reads ``labels.csv`` and ``reps.csv`` from ``workDir``, derives each
        sample's label from the name of the directory that contains the
        image, fits the selected classifier (behind an LDA step when
        ``self.ldaDim > 0``) and writes ``(label_encoder, classifier)`` to
        ``<workDir>/classifier.pkl``.

        Args:
            workDir: Directory holding ``labels.csv`` and ``reps.csv``; the
                pickle is written there too.
            classifier: One of ``'LinearSvm'``, ``'GMM'``, ``'RadialSvm'``,
                ``'DecisionTree'``, ``'GaussianNB'`` or ``'DBN'``.

        Raises:
            ValueError: If ``classifier`` is not one of the names above.
        """
        print("Loading embeddings.")

        fname = "{}/labels.csv".format(workDir)
        # `.values` replaces `DataFrame.as_matrix()`, which newer pandas
        # releases removed.
        image_paths = pd.read_csv(fname, header=None).values[:, 1]
        # Label = name of the directory containing the image. A real list is
        # built because `map` is lazy on Python 3 and cannot be fitted.
        labels = [os.path.split(os.path.dirname(path))[1]
                  for path in image_paths]

        fname = "{}/reps.csv".format(workDir)
        embeddings = pd.read_csv(fname, header=None).values

        le = LabelEncoder().fit(labels)
        labelsNum = le.transform(labels)
        nClasses = len(le.classes_)

        print("Training for {} classes.".format(nClasses))

        if classifier == 'LinearSvm':
            clf = SVC(C=1, kernel='linear', probability=True)
        elif classifier == 'GMM':
            clf = GMM(n_components=nClasses)
        elif classifier == 'RadialSvm':
            clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
        elif classifier == 'DecisionTree':
            clf = DecisionTreeClassifier(max_depth=20)
        elif classifier == 'GaussianNB':
            clf = GaussianNB()
        elif classifier == 'DBN':
            from nolearn.dbn import DBN

            # The output layer needs one unit per class. The label of the
            # last sample is not necessarily the largest label, so the size
            # comes from the encoder's class count.
            clf = DBN([embeddings.shape[1], 500, nClasses],
                      learn_rates=0.3,
                      learn_rate_decays=0.9,
                      epochs=300,
                      verbose=1)
        else:
            # Fail with a clear message instead of an unbound `clf` later.
            raise ValueError("Unknown classifier: {!r}".format(classifier))

        if self.ldaDim > 0:
            # Reduce dimensionality with LDA before the chosen classifier.
            clf_final = clf
            clf = Pipeline([('lda', LDA(n_components=self.ldaDim)),
                            ('clf', clf_final)])

        clf.fit(embeddings, labelsNum)

        fName = "{}/classifier.pkl".format(workDir)

        print("Saving classifier to '{}'".format(fName))

        # Pickle output is binary; text mode fails on Python 3.
        with open(fName, 'wb') as f:
            pickle.dump((le, clf), f)
예제 #32
0
from sklearn.datasets import fetch_mldata

# Load MNIST, scale features by 1/255 as float32, cast targets to int32 and
# hold out 33% of the samples with a fixed seed.
mnist = fetch_mldata('mnist-original')
X_train, X_test, y_train, y_test = train_test_split(
    (mnist.data / 255.0).astype(np.float32),
    mnist.target.astype(np.int32),
    test_size=0.33,
    random_state=1234)

# (name, estimator) pairs selected through command-line words.
classifiers = []

if 'dbn' in sys.argv:
    # Imported lazily so nolearn is only needed when 'dbn' is requested.
    from nolearn.dbn import DBN
    clf = DBN([X_train.shape[1], 300, 10],
              learn_rates=0.3,
              learn_rate_decays=0.9,
              epochs=10,
              verbose=1)
    classifiers.append(('nolearn.dbn', clf))

if 'sknn' in sys.argv:
    from sknn import mlp

    clf = mlp.Classifier(
        layers=[mlp.Layer("Rectifier", units=300),
                mlp.Layer("Softmax")],
        learning_rate=0.02,
        learning_rule='momentum',
        batch_size=25,
        valid_size=0.0,
        n_stable=10,
예제 #33
0
파일: main.py 프로젝트: AngeloK/cs584-hws
        # duration = end - start
        # print "Executing time is %s" %str(duration)
        # print m.score(X_test, y_test)
        # print "\n"

        # print X_train[:2, :]
        # print y_train[:2]

        # m = MLP()
        # m.fit(X_train, y_train, 0.0001, 500)

        # p = m.predict(X_test)
        # print classification_report(y_test, p)


        # h = [i for i in range(1,1000, 200)]

        dbn = DBN(
            [X_train.shape[1], 200, 3],
            learn_rates = 0.0001,
            learn_rate_decays = 0.9,
            epochs = 10,
            verbose = 1
        )
        dbn.fit(X_train, y_train)
        p = dbn.predict(X_test)
        a = accuracy_score(y_test, p)
        eva = classification_report(y_test, p)
        print "Accuracy = %.2f" %a
        print eva
예제 #34
0
# Expand the features with `poly`: fitted on train, re-applied to test.
train = poly.fit_transform(train)
test = poly.transform(test)
#train = np.hstack((train, poly_train))
#test = np.hstack((test, poly_test))

# encode labels
lbl_enc = LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# set up datasets for cross eval
x_train, x_test, y_train, y_test = train_test_split(train, labels)

# train a DBN classifier
# Layer sizes: one input per expanded feature, 8000 hidden units, 9 outputs.
clf = DBN([train.shape[1], 8000, 9],
          learn_rates=0.3,
          learn_rate_decays=0.9,
          epochs=50,
          verbose=1)  # l2_costs = 0.0001,

clf.fit(x_train, y_train)

# predict on test set
# (class probabilities for the held-out split, not for `test`)
preds = clf.predict_proba(x_test)

# ----------------------  create submission file  -----------------------------
#preds = pd.DataFrame(preds, index=sample.id.values, columns=sample.columns[1:])
#preds.to_csv('Preds/dbn_amax.csv', index_label='id')

# ----------------------  cross eval  -----------------------------------------

#y_test = label_binary.inverse_transform(y_test)
예제 #35
0
# Column 0 of `data` is the label; the remaining columns are the features.
digits_label = data[:, 0]
digits_image = data[:, 1:]

# digits_label = np.array([i[0] for i in data])
# digits_image = np.array([i[1:]/255.0 for i in data])

# Apply `binaryzation` to every row.
# NOTE(review): builtin map() returns a list (Python 2) or an iterator
# (Python 3); neither has the `.shape` attribute used below, so the result
# probably needs wrapping in an array -- confirm.
digits_image = map(binaryzation, digits_image)

print("file loading ok , prepare to train model")

# object of ANN model.
# Layer sizes: inputs, two hidden layers of 500 units, 10 outputs.

clf = DBN(
    [digits_image.shape[1], 500, 500, 10],
    learn_rates=0.01,
    learn_rate_decays=0.9,
    epochs=20,
    verbose=1,
)

# Train on everything except the last `testRest` rows.
# gnb.fit(digits_image[:-testRest],digits_label[:-testRest])
clf.fit(digits_image[:-testRest], digits_label[:-testRest])

# Predict the last `testRest` rows, which were held out above.
#result = gnb.predict(digits_image[-testRest:])
result = clf.predict(digits_image[-testRest:])

# Share of held-out rows whose prediction differs from the label, in percent.
print("the err rate : %.2f %%" %
      (100 * ((result != digits_label[-testRest:]).sum()) / float(testRest)))

# Absolute number of mispredicted held-out rows.
print("the result num %.2f " % (result != digits_label[-testRest:]).sum())
예제 #36
0
# normalization (recommendable)
# Shifts by the global minimum, then divides by the resulting global maximum.
trainFeats -= trainFeats.min()
trainFeats /= trainFeats.max()

# Loading test data
print "loading test features and labels ..."
testFeats = pd.read_csv(testFile, delim_whitespace=True, header=None)
testFeats = testFeats.as_matrix()
testLabels = pd.read_csv(testLabelsFile, header=None).values.ravel()

# normalization (recommendable)
# NOTE(review): the test set is scaled with its own min/max rather than the
# training statistics, so the two sets may end up on different scales.
testFeats -= testFeats.min()
testFeats /= testFeats.max()

# Layer sizes: one input per feature column, 2000 hidden units, 7 outputs.
dbn = DBN(
    [trainFeats.shape[1], 2000, 7],
    learn_rates=0.1,
    learn_rate_decays=0.9,
    epochs=10,
    verbose=1)
dbn.fit(trainFeats, trainLabels)
predictedTest = dbn.predict(testFeats)

# Print classification report
print(classification_report(testLabels, predictedTest))

# Print confusion matrix
print(confusion_matrix(testLabels, predictedTest))

# generate polynomial features
# (fitted on train, then re-applied unchanged to test)
poly = PolynomialFeatures()
train = poly.fit_transform(train)
test = poly.transform(test)
#train = np.hstack((train, poly_train))
#test = np.hstack((test, poly_test))

# encode labels
lbl_enc = LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# set up datasets for cross eval
x_train, x_test, y_train, y_test = train_test_split(train, labels)

# train a DBN classifier
# Layer sizes: one input per expanded feature, 8000 hidden units, 9 outputs.
clf = DBN([train.shape[1], 8000, 9], learn_rates = 0.3,
            learn_rate_decays = 0.9, epochs = 50, verbose = 1) # l2_costs = 0.0001,

clf.fit(x_train, y_train)

# predict on test set
# (class probabilities for the held-out split, not for `test`)
preds = clf.predict_proba(x_test)

# ----------------------  cross eval  -----------------------------------------

#y_test = label_binary.inverse_transform(y_test)
#y_test = LabelEncoder().fit_transform(y_test)

# Score the held-out probabilities with the MultiLogLoss helper.
print("Multiclass Log Loss: ", MultiLogLoss(y_test, preds))
예제 #38
0
import math
import numpy as np
from nolearn.dbn import DBN
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import TruncatedSVD, PCA, RandomizedPCA
from sklearn import metrics

# Load the stored feature matrix and labels.
X = np.load('../blobs/X_train.npy')
Y = np.load('../blobs/Y_train.npy')
print 'done loading'

# Hold out 20% of the samples with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# 2304, 4608, 1152, 576, 10
# 15-20 epochs
# Hidden layers are sized at 2x, 1/2 and 1/4 of the input width; 10 outputs.
clf = DBN([X_train.shape[1], X_train.shape[1]*2, X_train.shape[1]//2, X_train.shape[1]//4, 10], learn_rates=0.01, learn_rate_decays=0.9, momentum=0.9, epochs=10, verbose=1)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)

# NOTE(review): f1_score is called without `average`; with 10 output classes
# this may need an explicit averaging mode on newer scikit-learn -- confirm.
score = metrics.f1_score(Y_test, pred)
print("f1-score:   %0.3f" % score)

print("classification report:")
print(metrics.classification_report(Y_test, pred))

print("confusion matrix:")
print(metrics.confusion_matrix(Y_test, pred))
예제 #39
0

# Load MNIST, scale features by 1/255 as float32, cast targets to int32 and
# hold out one seventh of the samples with a fixed seed.
mnist = fetch_mldata('mnist-original')
X_train, X_test, y_train, y_test = train_test_split(
        (mnist.data / 255.0).astype(np.float32),
        mnist.target.astype(np.int32),
        test_size=1.0/7.0, random_state=1234)


# (name, estimator) pairs selected through command-line words.
classifiers = []

if 'dbn' in sys.argv:
    # Imported lazily so nolearn is only needed when 'dbn' is requested.
    from nolearn.dbn import DBN
    clf = DBN(
        [X_train.shape[1], 300, 10],
        learn_rates=0.3,
        learn_rate_decays=0.9,
        epochs=10,
        verbose=1)
    classifiers.append(('nolearn.dbn', clf))

if 'sknn' in sys.argv:
    from sknn.mlp import Classifier, Layer, Convolution

    clf = Classifier(
        layers=[
            # Convolution("Rectifier", channels=10, pool_shape=(2,2), kernel_shape=(3, 3)),
            Layer('Rectifier', units=200),
            Layer('Softmax')],
        learning_rate=0.02,
        learning_rule='momentum',
        learning_momentum=0.9,
예제 #40
0
################
#Neural network#
################
from LoadData import *
from nolearn.dbn import DBN
# Load the train and test matrices through the LoadData helpers.
train_data = load_train_data()
test_data = load_test_data()

print(train_data)
# NOTE(review): this exit(0) stops the script here, so nothing below runs;
# it looks like a leftover debugging stop -- confirm before removing.
exit(0)
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier

# Column 0 of train_data is the label; the remaining columns are features.
clf = DBN([train_data[:, 1:].shape[1], 300, 10],
          learn_rates=0.3,
          learn_rate_decays=0.9,
          epochs=15)
# `scores` is computed but not used in the visible part of the script.
scores = cross_val_score(clf, train_data[:, 1:], train_data[:, 0])
clf.fit(train_data[:, 1:], train_data[:, 0])
y_pred = clf.predict(test_data)

#from sklearn.externals import joblib
#joblib.dump(clf, 'NeuralNetork.pkl')

import csv
# Write one (ImageId, Label) row per prediction; ImageId is 1-based.
# NOTE(review): binary mode 'wb' with csv.writer only works on Python 2.
with open('submision.csv', 'wb') as csvfile:
    spamwriter = csv.writer(csvfile, quotechar=',', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['ImageId'] + ['Label'])

    for i in range(len(y_pred)):
        spamwriter.writerow([i + 1] + [y_pred[i]])
예제 #41
0
from sklearn.cross_validation import train_test_split
from sklearn.datasets import fetch_mldata

# Load MNIST and split it with train_test_split's default proportions,
# scaling the features by 1/255.
mnist = fetch_mldata("MNIST original")
X_train, X_test, y_train, y_test = train_test_split(mnist.data / 255.0, mnist.target)

from nolearn.dbn import DBN

# Layer sizes: one input per feature column, 300 hidden units, 10 outputs.
clf = DBN([X_train.shape[1], 300, 10], learn_rates=0.3, learn_rate_decays=0.9, epochs=10, verbose=1)

clf.fit(X_train, y_train)

from sklearn.metrics import classification_report
from sklearn.metrics import zero_one_loss

y_pred = clf.predict(X_test)
# Accuracy is reported as one minus the zero-one loss.
print "Accuracy:", 1 - zero_one_loss(y_test, y_pred)
print "Classification report:"
print classification_report(y_test, y_pred)

####################################

# For i in 2..7: load a cached DBN for the (tup(i), i) configuration, or
# train and cache one, then predict on the matching test set.
if __name__=='__main__':
    dbn_list=[]
    for i in range(2,8):
        dat,lab=db_load(tup(i),i)
        try:
            # Reuse a previously dumped model when it can be loaded.
            dbn = joblib.load("pickles/dbn_"+str(tup(i))+"x"+str(i)+".pkl") 
            dbn_list.append(dbn)
        # NOTE(review): the bare except treats ANY failure (including
        # KeyboardInterrupt) as "no cached model" and starts training.
        except:
            # Layer sizes: i*tup(i) inputs, 400 hidden units, 10 outputs.
            dbn = DBN(
                [i*tup(i), 400, 10],
                learn_rates = 0.3,
                learn_rate_decays = 0.9,
                epochs = 50,
                verbose = 1
                )
            dbn.fit(dat,lab)
            dbn_list.append(dbn)
            joblib.dump(dbn,"pickles/dbn_"+str(tup(i))+"x"+str(i)+".pkl")
        # NOTE(review): `finally` also runs when training itself failed, in
        # which case `dbn` may be unbound or stale from the previous i.
        finally:
            #print dat.shape
            #print lab.shape
            print dbn_list.__len__()
            print ("trained ! ready to predict!")
            #print "training report for {}x{}:".format(tup(i),i)
            tes,labt=test_load(tup(i),i)
            preds=dbn.predict(tes)
            sampleClassificationReport=classification_report(labt,preds)
예제 #43
0
파일: dbn_min.py 프로젝트: ranyu/LMOptima
# Load the stored train/test feature matrices and the training labels.
train_x = numpy.load('data/X_train.npy')
test_x = numpy.load('data/X_test.npy')
train_y = numpy.load('data/Y_train.npy')

print train_x.shape
print train_y.shape
print test_x.shape

# Hold out 20% of the training data for validation.
(train_x,vali_x,train_y,vali_y) = train_test_split(train_x,train_y,test_size = 0.2)

# Layer sizes are hard-coded: 300 inputs, 1024 hidden units, 120000 outputs.
dbn = DBN(
        [300,1024,120000],
        learn_rates = 0.025,
        learn_rate_decays = 0.98,
        l2_costs = 0.0001,
        minibatch_size=256,
        epochs=5,
        momentum = 0.9,
        #dropouts=0.22,
        verbose = 2)

dbn.fit(train_x, train_y)
print 'validation score is:' ,dbn.score(vali_x,vali_y)

result = dbn.predict(test_x)
# Write one prediction per line.
# NOTE(review): `el+'\n'` only works if the predictions are strings; numeric
# labels would raise a TypeError here -- confirm the label dtype.
with open('data/result','w') as f:
    for el in result:
        f.write(el+'\n')

#predicted_y_proba = dbn.predict_proba(test_x)
예제 #44
0
    <START TRAINING>
    - Referred from : http://goo.gl/GBYZvR
"""
# Collect Data
# dataset = unpickle('data_batch_1')
train['data'] = np.asarray(train['data'])
data_train, labels_train = cvt_tastable_set(train)
# Cast to float and scale by 1/255.
data_train = data_train.astype('float') / 255.
labels_train = labels_train

# Training Data
n_feat = data_train.shape[1]
# One output unit per label value from 0 up to the largest label.
n_targets = labels_train.max() + 1
# NOTE(review): `n_feat / 3` is integer division on Python 2 but yields a
# float on Python 3, which may not be accepted as a layer size -- confirm.
net = DBN(
    [n_feat, n_feat / 3, n_targets],
    epochs=20,
    learn_rates=0.02,
    verbose=1
)
net.fit(data_train, labels_train)
# f = file('Deeplearned.save', 'wb')
# cPickle.dump(net, f, protocol=cPickle.HIGHEST_PROTOCOL)
# f.close()

# Test set generation
# image_list = []
# label_index = []
# data_list2 = {'labels': label_index, 'data': image_list}
# test = collect_images(
#     dict=data_list2,
#     dir_path='./yeragoData/testSet/right_eyes',
#     label_addition=True,
def train(args):
    """Train the classifier selected by ``args.classifier`` and pickle it.

    Reads ``labels.csv`` and ``reps.csv`` from ``args.workDir``, derives each
    sample's label from the name of the directory that contains the image,
    fits the chosen classifier (behind an LDA step when ``args.ldaDim > 0``)
    and writes ``(label_encoder, classifier)`` to
    ``<workDir>/classifier.pkl``.

    Args:
        args: Namespace read for ``workDir``, ``classifier`` and ``ldaDim``.

    Raises:
        ValueError: If ``args.classifier`` is not a supported name.
    """
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    # `.values` replaces `DataFrame.as_matrix()`, which newer pandas releases
    # removed.
    image_paths = pd.read_csv(fname, header=None).values[:, 1]
    # Label = name of the directory containing the image. A real list is
    # built because `map` is lazy on Python 3 and cannot be fitted.
    labels = [os.path.split(os.path.dirname(path))[1]
              for path in image_paths]
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).values
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))

    if args.classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif args.classifier == 'GridSearchSvm':
        print("""
        Warning: In our experiences, using a grid search over SVM hyper-parameters only
        gives marginally better performance than a linear SVM with C=1 and
        is not worth the extra computations of performing a grid search.
        """)
        param_grid = [
            {'C': [1, 10, 100, 1000],
             'kernel': ['linear']},
            {'C': [1, 10, 100, 1000],
             'gamma': [0.001, 0.0001],
             'kernel': ['rbf']}
        ]
        clf = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
    elif args.classifier == 'GMM':  # Doesn't work best
        clf = GMM(n_components=nClasses)

    # ref:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
    elif args.classifier == 'RadialSvm':  # Radial Basis Function kernel
        # works better with C = 1 and gamma = 2
        clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
    elif args.classifier == 'DecisionTree':  # Doesn't work best
        clf = DecisionTreeClassifier(max_depth=20)
    elif args.classifier == 'GaussianNB':
        clf = GaussianNB()

    # ref: https://jessesw.com/Deep-Learning/
    elif args.classifier == 'DBN':
        from nolearn.dbn import DBN
        # The output layer needs one unit per class. The label of the last
        # sample is not necessarily the largest label, so the size comes
        # from the encoder's class count.
        clf = DBN([embeddings.shape[1], 500, nClasses],  # i/p nodes, hidden nodes, o/p nodes
                  learn_rates=0.3,
                  # Smaller steps mean a possibly more accurate result, but the
                  # training will take longer
                  learn_rate_decays=0.9,
                  # a factor the initial learning rate will be multiplied by
                  # after each iteration of the training
                  epochs=300,  # number of iterations
                  verbose=1)
    else:
        # Fail with a clear message instead of an unbound `clf` later.
        raise ValueError("Unknown classifier: {!r}".format(args.classifier))

    if args.ldaDim > 0:
        # Reduce dimensionality with LDA before the chosen classifier.
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                        ('clf', clf_final)])

    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(args.workDir)
    print("Saving classifier to '{}'".format(fName))
    # Pickle output is binary; text mode fails on Python 3.
    with open(fName, 'wb') as f:
        pickle.dump((le, clf), f)
예제 #46
0
        "Label": predictions
    }
    predictions_table = pd.DataFrame(predictions_dict)
    predictions_table.to_csv(csv_path, index=False)


# Load the training set and augment it with rotate_dataset.
X_train, Y_train = load_training_data()

X_train, Y_train = rotate_dataset(X_train, Y_train)
#X_train, Y_train = nudge_dataset(X_train, Y_train)

n_features = X_train.shape[1]
n_classes = 10
# Layer sizes: n_features inputs, 10 hidden units, n_classes outputs;
# trained for a single epoch.
classifier = DBN([n_features, 10, n_classes],
                 learn_rates=0.01,
                 learn_rate_decays=0.9,
                 epochs=1,
                 verbose=1)

classifier.fit(X_train, Y_train)

# NOTE(review): `Z` is not defined in the visible part of this snippet --
# presumably the test feature matrix; confirm.
test_data = Z
predictions = classifier.predict(test_data)
# NOTE(review): csv_path is assigned here but not passed to the writer, which
# presumably reads it as a module-level name -- confirm.
csv_path = make_predictions_path()
write_predictions_to_csv(predictions)


def __main__(args):
    """Train a DBN on the training file, report its error and optionally dump it.

    Calls ``run()`` first, then loads ``../dataset/<TRAIN_DATA>``, splits off
    ``TEST_SIZE`` of it as a hold-out set, fits a DBN with two hidden layers
    of 300 units and prints the wall-clock training time. When
    ``TEST_SIZE > 0`` the hold-out error from ``error_track_0`` is printed;
    when ``DUMP`` is truthy the fitted model is saved with joblib under
    ``../model/deep/<MODEL_NAME>.pkl``.

    All messages are printed through single-argument ``print(...)`` calls so
    the function parses on Python 3 and prints the same text on Python 2.

    Args:
        args: Not used by this function; kept for the existing signature.
    """
    run()

    print("Loading data...")
    X_train, y_train = load_data("../dataset/%s" % TRAIN_DATA)

    # Split data to train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train, test_size=TEST_SIZE, random_state=0)
    # Convert both splits to dense matrices before they reach the DBN.
    X_train = X_train.todense()
    X_test = X_test.todense()

    # Train --------------------------------------------------------------
    print("Training...")
    t1 = datetime.now()

    dbn = DBN(
        # -1 asks nolearn to size the input and output layers from the data.
        [-1, 300, 300, -1],
        learn_rates=0.1,
        learn_rate_decays=0.9,
        epochs=10,
        verbose=1)
    dbn.fit(X_train, y_train)

    print("Training %f secs" % (datetime.now() - t1).total_seconds())

    if TEST_SIZE > 0:
        tlabel = dbn.predict(X_test)
        print('Error: %f' % error_track_0(tlabel, y_test))

    if DUMP:
        # Dump model ------------------------------------------------------
        print("Dumping model...")
        joblib.dump(dbn, '../model/deep/%s.pkl' % MODEL_NAME)
예제 #48
0
class DBNRegressor(BaseEstimator, RegressorMixin):
    """scikit-learn style regressor that wraps ``nolearn.dbn.DBN``.

    ``n_hidden_layers`` and ``n_units`` describe the hidden layers; every
    other constructor argument is stored on the instance and forwarded
    unchanged as a keyword argument to ``DBN`` when ``fit`` is called. The
    output layer uses a linear activation (``gdbn`` ``Linear``).

    Args:
        n_hidden_layers: Number of hidden layers used when ``n_units`` is an
            int.
        n_units: Either an int (units per hidden layer) or a sequence giving
            the size of each hidden layer explicitly.
        dropouts: When ``None``, defaults to
            ``[0.2] + [0.5] * n_hidden_layers``.
        verbose: Forwarded to ``DBN``; a falsy value also silences the shape
            print in ``fit``.

    The remaining arguments are passed straight through to ``DBN``.
    """

    def __init__(
        self,
        n_hidden_layers=2,
        n_units=1000,
        epochs=100,
        epochs_pretrain=0,
        scales=0.05,
        real_valued_vis=True,
        use_re_lu=False,
        uniforms=False,
        learn_rates_pretrain=0.1,
        learn_rates=0.1,
        learn_rate_decays=1.0,
        learn_rate_minimums=0.0,
        momentum=0.9,
        momentum_pretrain=0.9,
        l2_costs=0.0001,
        l2_costs_pretrain=0.0001,
        dropouts=None,
        minibatch_size=64,
        verbose=2,
        fine_tune_callback=None,
        nest_compare=True,
        nest_compare_pretrain=None,
        fan_outs=None,
        nesterov=False,
    ):
        self.n_hidden_layers = n_hidden_layers
        self.n_units = n_units
        self.epochs = epochs
        self.epochs_pretrain = epochs_pretrain
        self.learn_rates_pretrain = learn_rates_pretrain
        self.learn_rates = learn_rates
        self.learn_rate_decays = learn_rate_decays
        self.learn_rate_minimums = learn_rate_minimums
        self.l2_costs_pretrain = l2_costs_pretrain
        self.l2_costs = l2_costs
        self.momentum = momentum
        self.momentum_pretrain = momentum_pretrain
        self.verbose = verbose
        self.real_valued_vis = real_valued_vis
        self.use_re_lu = use_re_lu
        self.scales = scales
        self.minibatch_size = minibatch_size
        if dropouts is None:
            dropouts = [0.2] + [0.5] * n_hidden_layers
        self.dropouts = dropouts
        self.fine_tune_callback = fine_tune_callback
        self.nest_compare = nest_compare
        self.nest_compare_pretrain = nest_compare_pretrain
        self.fan_outs = fan_outs
        self.nesterov = nesterov

    def fit(self, X, y, X_pretrain=None):
        """Build a DBN sized for ``X`` and ``y`` and train it.

        Args:
            X: Training inputs; ``X.shape[1]`` sets the input layer size.
            y: Targets. A 2-D array gives one output unit per column; any
                other array is given a trailing axis and treated as a single
                output.
            X_pretrain: Passed through to ``DBN.fit`` unchanged.

        Returns:
            ``self``, so calls can be chained as with other scikit-learn
            estimators.
        """
        from nolearn.dbn import DBN
        from gdbn.activationFunctions import Linear

        if y.ndim == 2:
            n_outputs = y.shape[1]
        else:
            y = y[:, np.newaxis]
            n_outputs = 1

        params = dict(self.__dict__)
        # A previous fit() stored the trained network on the instance; it is
        # not a DBN constructor argument and must not be forwarded on refit.
        params.pop('dbn', None)
        params['output_act_funct'] = Linear()

        n_units = params.pop('n_units')
        n_hidden_layers = params.pop('n_hidden_layers')
        if isinstance(n_units, int):
            hidden = [n_units] * n_hidden_layers
        else:
            # list() lets tuples and other sequences be concatenated below.
            hidden = list(n_units)
        units = [X.shape[1]] + hidden + [n_outputs]
        self.dbn = DBN(units, **params)
        if self.verbose:
            print(X.shape)
        self.dbn.fit(X, y, X_pretrain=X_pretrain)
        return self

    def predict(self, X):
        """Return ``chunked_decision_function(X)`` of the fitted DBN."""
        return self.dbn.chunked_decision_function(X)
예제 #49
0
"""
Train the network with 3200 inputs (64x50 values in file)
6 output units (for the different defects)
Lets start with 2 hidden units
"""

# Numbers from example used here: inputs are divided by 255 and a third of
# the samples is held out for testing.
# NOTE(review): `dataset` is divided directly while `.target` is read from
# the same object -- confirm this should not be `dataset.data / 255.0`.
(trainX, testX, trainY, testY) = train_test_split(
    dataset / 255.0, dataset.target.astype("int0"), test_size=0.33)

dbn = DBN(
    # [input units, hidden units, output units]
    [trainX.shape[1], 2, 6],
    # Learning rate of algorithm
    learn_rates=0.3,
    # Decay of learn rate
    learn_rate_decays=0.9,
    # Iterations of training data (epochs)
    epochs=10,
    # Verbosity level
    verbose=1)
dbn.fit(trainX, trainY)

# Single-argument print(...) calls parse on Python 3 and print the same
# text on Python 2.
print("trained yo!")

# Evaluate network
#-----------------
preds = dbn.predict(testX)
print(classification_report(testY, preds))  # Table of accuracies

예제 #50
0

# Hold out 20% of the data for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train, train_labels, test_size=0.2, random_state=0)

# Single-argument print(...) calls parse on Python 3 and print the same
# text on Python 2.
print("Applying a learning algorithm...")


from nolearn.dbn import DBN

# Convert every split to a numpy array before it is passed to the DBN.
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)
clf = DBN(
    # [input units, hidden units, output units]
    [X_train.shape[1], 300, 10],
    learn_rates=0.3,
    learn_rate_decays=0.9,
    epochs=15,
    verbose=1,
    )

clf.fit(X_train, y_train)
acc_nn = clf.score(X_test, y_test)
# The two spaces reproduce what the original two-argument Python 2 print
# statement emitted (trailing space in the label plus the separator).
print("neural network accuracy:  {}".format(acc_nn))


y_pred = clf.predict(X_test)
print("Classification report:")
print(classification_report(y_test, y_pred))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, y_pred))

예제 #51
0
# load the data, scale the data to the range [0, 1] and then construct
# the training and testing splits
dataset = io.loadmat("mnist-original.mat")
# transpose "data" so that trainX.shape[1] below is the per-sample feature
# count
data = dataset["data"].T
labels = dataset["label"].flatten().astype("int")
(trainX, testX, trainY, testY) = train_test_split(data / 255.0,
                                                  labels,
                                                  test_size=0.33)

# train the Deep Belief Network with 784 input units (the flattened,
# 28x28 grayscale image), 300 hidden units, 10 output units (one for
# each possible output classification, which are the digits 0-9)
dbn = DBN([trainX.shape[1], 300, 10],
          learn_rates=0.3,
          learn_rate_decays=0.9,
          epochs=10,
          verbose=1)
dbn.fit(trainX, trainY)

# compute the predictions for the test data and show a classification
# report (print(...) with one argument works on Python 2 and Python 3)
preds = dbn.predict(testX)
print(classification_report(testY, preds))

# randomly select a few of the test instances
for i in np.random.choice(np.arange(0, len(testY)), size=(10, )):
    # classify the digit
    pred = dbn.predict(np.atleast_2d(testX[i]))

    # reshape the feature vector to be a 28x28 pixel image, then change
예제 #52
0
파일: mnist.py 프로젝트: syamms/PhDCode
import timeit

import numpy as np
import pandas as pd
from nolearn.dbn import DBN
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# Load the MNIST CSV: the 'label' column is the target and every column
# after the first is used as a feature.
train = pd.read_csv("mnist_train.csv")
features = train.columns[1:]
X = train[features]
y = train['label']
# Divide the features by 255 and hold out 10% for testing (fixed seed).
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X / 255., y, test_size=0.1, random_state=0)

# Random forest with default settings.
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train, y_train)

y_pred_rf = clf_rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

# nn: DBN laid out as [input units, hidden units, output units]
clf_nn = DBN([X_train.shape[1], 300, 10],
             learn_rates=0.3,
             learn_rate_decays=0.9,
             epochs=50)

clf_nn.fit(X_train, y_train)
acc_nn = clf_nn.score(X_test, y_test)


예제 #53
0
# this script, this may take a minute -- the 55 MB MNIST digit dataset
# will be downloaded)
print("[X] downloading data...")
# NOTE(review): fetch_mldata has been removed from recent scikit-learn
# releases (fetch_openml is its replacement) -- confirm which version this
# script targets before running it.
dataset = datasets.fetch_mldata("MNIST Original")

# scale the data to the range [0, 1] and then construct the training
# and testing splits
(trainX, testX, trainY, testY) = train_test_split(
    dataset.data / 255.0, dataset.target.astype("int0"), test_size=0.33)

# train the Deep Belief Network with 784 input units (the flattened,
# 28x28 grayscale image), 300 hidden units, 10 output units (one for
# each possible output classification, which are the digits 0-9)
dbn = DBN(
    [trainX.shape[1], 300, 10],
    learn_rates=0.3,
    learn_rate_decays=0.9,
    epochs=10,
    verbose=1)
dbn.fit(trainX, trainY)

# compute the predictions for the test data and show a classification
# report (print(...) with one argument works on Python 2 and Python 3)
preds = dbn.predict(testX)
print(classification_report(testY, preds))

# randomly select a few of the test instances
for i in np.random.choice(np.arange(0, len(testY)), size = (10,)):
	# classify the digit
	pred = dbn.predict(np.atleast_2d(testX[i]))

	# reshape the feature vector to be a 28x28 pixel image, then change
예제 #54
0
            orientations=9,
            pixels_per_cell=(4, 4),
            cells_per_block=(2, 2),
            visualise=False)
    hogFeatures_test.append(f)
# Stack the collected test-set HOG descriptors into one array and report
# its shape.
hogFeatures_test_np = np.array(hogFeatures_test)
print "hog features test dimensions", hogFeatures_test_np.shape

print "init len ", len(train_x_main)
# Ten passes over growing prefixes of the training data: pass i uses the
# first (i + 1) / 10 of train_x_main / train_y_main.
for i in range(10):
    m = int(len(train_x_main) * (i + 1.0) / 10)
    train_x = train_x_main[0:m]
    print "len at ", i, " iteration : ", len(train_x)
    train_y = train_y_main[0:m]
    # Fit a DBN on this prefix and record its score on test_x / test_y.
    clf_nn = DBN([train_x.shape[1], 300, 10],
                 learn_rates=0.3,
                 learn_rate_decays=0.9,
                 epochs=15)
    clf_nn.fit(train_x, train_y)
    acc_nn = clf_nn.score(test_x, test_y)
    arr_acc.append(acc_nn)
    # Compute a HOG descriptor for every training sample of this prefix,
    # reshaping each sample to 28x28 first.
    # NOTE(review): within the visible lines these descriptors are only
    # printed, not used for training -- check the continuation of this loop.
    hogFeatures_train = []
    for feature in train_x:
        f = hog(feature.reshape((28, 28)),
                orientations=9,
                pixels_per_cell=(4, 4),
                cells_per_block=(2, 2),
                visualise=False)
        hogFeatures_train.append(f)
    hogFeatures_train_np = np.array(hogFeatures_train)
    print "hog features train dimensions", hogFeatures_train_np.shape
    print "length of hog train np ", len(hogFeatures_train)