Example #1
# Assumed context (not shown in the snippet): import numpy as np
# and an SVM class aliased as svm, e.g. from sklearn.svm import SVC as svm.
def train_svm(experimental_c, experimental_gamma):
    train = np.load('train_vars.npy')
    val = np.load('val_vars.npy')
    train_labels = np.load('train_labels.npy').ravel()
    val_labels = np.load('val_labels.npy').ravel()
    val_size = len(val_labels)
    svm_model = svm(C=experimental_c, gamma=experimental_gamma)
    print('Train y shape = %s' % (train_labels.shape,))
    print('Train X shape = %s' % (train.shape,))
    svm_model.fit(train, train_labels)
    predictions = svm_model.predict(val)

    print('Val y shape = %s' % (val_labels.shape,))
    print('Predictions shape = %s' % (predictions.shape,))
    correct = np.sum(np.equal(predictions, val_labels))
    accuracy = correct / float(val_size)
    result = 1 - accuracy  # validation error rate, the value to minimize
    print('Number of correct predictions: %d' % correct)
    print('Fraction of correct predictions: %f' % accuracy)
    print('Error rate: %f' % result)
    print('Number of labels: %d' % len(val_labels))

    print('Result = %f' % result)
    #time.sleep(np.random.randint(60))
    return result
Example #2
def main(job_id, params):
    cSVM = svm(C=10.**params['C'], gamma=10.**params['gamma'])
    train = np.load("train.npy")
    val = np.load("val.npy")
    trainLabel = np.load("trainLabel.npy")
    valLabel = np.load("valLabel.npy")
    cSVM.fit(train, trainLabel)
    preds = cSVM.predict(val)
    return -np.sum(np.equal(preds, valLabel)) * 1. / len(valLabel)
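The (job_id, params) signature and the negated accuracy follow the convention of Bayesian-optimization drivers such as Spearmint, which call main() once per trial and minimize the returned value. A minimal hand-rolled driver, shown only to make the calling contract concrete (the parameter grid is an illustrative assumption):

import itertools

if __name__ == '__main__':
    # sweep exponents of 10 for C and gamma, one trial per job_id
    for job_id, (c, g) in enumerate(itertools.product([-1., 0., 1.], repeat=2)):
        err = main(job_id, {'C': c, 'gamma': g})
        print(job_id, c, g, err)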
Example #3
def test(relativePath, params):
    cSVM = svm(C=10.**params['C'], gamma=10.**params['gamma'])
    train = np.load(relativePath + "train.npy")
    test = np.load(relativePath + "test.npy")
    trainLabel = np.load(relativePath + "trainLabel.npy")
    testLabel = np.load(relativePath + "testLabel.npy")
    cSVM.fit(train, trainLabel)
    preds = cSVM.predict(test)
    return -np.sum(np.equal(preds, testLabel)) * 1. / len(testLabel)
Example #6
    def create_model(self, model_type, parameters):

        if model_type == 'lr':
            model = lr()
        elif model_type == 'svm':
            model = svm()
        elif model_type == 'mlp':
            model = mlp()
        elif model_type == 'rf':
            model = rf()
        elif model_type == 'xgb':
            model = xgb()
        else:
            raise ValueError('unknown model_type: %r' % model_type)
        return model.set_params(**parameters)
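The aliases lr, svm, mlp, rf and xgb are bound at import time elsewhere in the source project; judging by the names, the bindings are presumably along these lines (an assumption, not confirmed by the snippet):

from sklearn.linear_model import LogisticRegression as lr
from sklearn.svm import SVC as svm
from sklearn.neural_network import MLPClassifier as mlp
from sklearn.ensemble import RandomForestClassifier as rf
from xgboost import XGBClassifier as xgb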
Example #7
 def add_model(self, model_type):
     if model_type == 'lr':
         self.models.append((model_type, lr(normalize=True)))
     elif model_type == 'ridge':
         self.models.append((model_type, rc(normalize=True, cv=None)))
     elif model_type == 'lasso':
         self.models.append((model_type, la(normalize=True)))
     elif model_type == 'svm':
         self.models.append((model_type, svm()))
         self.param_grid['svm'] = {
             'kernel': ['rbf'],
             'C': range(10, 100, 10),
             'epsilon': [0.01]
         }
     elif model_type == 'mlp':
         self.models.append((model_type, mlp()))
         self.param_grid['mlp'] = {
             'hidden_layer_sizes': [(16, 16, 16, 16, 16), (16, 16, 16, 16)],
             'activation': ['identity', 'logistic', 'tanh', 'relu'],
             'solver': ['lbfgs', 'adam'],
             'alpha': [0.001, 0.01],
             'learning_rate': ['constant', 'invscaling', 'adaptive'],
             'learning_rate_init': [0.001, 0.01, 0.1],
             #'early_stopping':[True,False],
             #'validation_fraction':[0.1,0.05,0.2],
             #'max_iter':[200,1000,2000]
         }
     elif model_type == 'xgb':
         self.models.append((model_type, xgb()))
         self.param_grid[model_type] = {
             'max_depth': range(5, 15, 2),
             'min_child_weight': range(1, 6, 2),
             'n_estimators': range(10, 50, 10),
             'learning_rate': [0.01, 0.05, 0.1],
             'n_jobs': [4],
             'reg_alpha': [0, 0.005, 0.01],
             'subsample': [0.8, 1],
             'colsample_bytree': [0.8, 1]
         }
     elif model_type == 'rf':
         self.models.append((model_type, rf()))
         self.param_grid[model_type] = {
             'n_estimators': [10, 100, 500],
             #'max_depth':range(3,10,2),
             #'min_child_weight':range(1,6,2),
             #'learning_rate':[0.01,0.05,0.1]
         }
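self.models and self.param_grid pair each estimator with a search space, which suggests the class runs a grid search elsewhere. A hypothetical companion method, assuming scikit-learn's GridSearchCV (fit_all is an illustrative name, not the project's):

 def fit_all(self, X_train, y_train):
     from sklearn.model_selection import GridSearchCV
     for name, estimator in self.models:
         grid = self.param_grid.get(name)
         if grid:
             # exhaustive search over the grid registered in add_model
             search = GridSearchCV(estimator, grid, cv=5)
             search.fit(X_train, y_train)
             print(name, search.best_params_, search.best_score_)
         else:
             estimator.fit(X_train, y_train)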
Example #8
def init_model(modeltype):
    if modeltype == 'mlp':
        ### Feedforward Neural Network Regression Model
        regression_model = mlp(hidden_layer_sizes=(100, 50),
                               activation='relu',
                               solver='adam',
                               alpha=0.5,
                               batch_size='auto',
                               learning_rate='adaptive',
                               learning_rate_init=0.001,
                               power_t=0.5,
                               max_iter=1000,
                               shuffle=True,
                               random_state=None,
                               tol=0.0001,
                               verbose=False,
                               warm_start=False,
                               momentum=0.9,
                               nesterovs_momentum=True,
                               early_stopping=False,
                               validation_fraction=0.1,
                               beta_1=0.9,
                               beta_2=0.999,
                               epsilon=1e-08,
                               n_iter_no_change=10)
    elif modeltype == 'svm':
        ### Support Vector Machine Regression Model
        regression_model = svm(kernel='rbf',
                               C=1e6,
                               epsilon=0.1,
                               gamma='auto',
                               tol=0.001,
                               cache_size=2000,
                               shrinking=True,
                               verbose=False,
                               max_iter=-1)
    else:
        raise ValueError('unknown modeltype: %r' % modeltype)
    return regression_model
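Usage sketch, assuming mlp and svm are the scikit-learn regressors aliased as elsewhere on this page (MLPRegressor as mlp, SVR as svm) and hypothetical X_train/y_train/X_test arrays:

model = init_model('svm')
model.fit(X_train, y_train)       # X_train/y_train: training arrays (assumed)
y_pred = model.predict(X_test)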
Example #9
def get_esti():
    graphs = getgraphs()[:100]
    vectors = vectorize(graphs)
    # NOTE: SVC.fit requires labels, so fit(vectors) alone raises a TypeError;
    # either a label array is missing here or a one-class estimator
    # (e.g. sklearn.svm.OneClassSVM) was intended.
    return svm(kernel='linear').fit(vectors)
Example #10
from sklearn import datasets as ds
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB as naive_bayes  # assumed alias for the name used below
from sklearn.tree import DecisionTreeClassifier as tree    # assumed alias for the name used below
from sklearn.svm import SVC as svm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

data_set = ds.load_digits()

x = data_set.data
y = data_set.target

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

neighbors_model = KNeighborsClassifier(n_neighbors=3)
bayes_model = naive_bayes()
tree_model = tree()
svm_model = svm()
forest_model = RandomForestClassifier()

neighbors_model.fit(X_train, y_train)
bayes_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
forest_model.fit(X_train, y_train)

y_actual_neighbors = neighbors_model.predict(X_test)
y_actual_bayes = bayes_model.predict(X_test)
y_actual_tree = tree_model.predict(X_test)
y_actual_svm = svm_model.predict(X_test)
y_actual_forest = forest_model.predict(X_test)

neighbors_metrics = metrics.classification_report(y_test, y_actual_neighbors)
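The remaining models can be scored the same way; a straightforward extension (not part of the original snippet):

for name, y_pred in [('bayes', y_actual_bayes), ('tree', y_actual_tree),
                     ('svm', y_actual_svm), ('forest', y_actual_forest)]:
    print(name)
    print(metrics.classification_report(y_test, y_pred))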
Example #11
from sklearn.svm import SVR as svm

svr = svm(kernel="linear", C=1.0)
Example #12
def __init__(self, feat, classes):
    self.model = svm(kernel='rbf', gamma=1e-4, C=1e+5)
    self.model.fit(feat, classes)
Example #13
x = data_set.data
y = data_set.target

cv_kfold = KFold(n_splits=30)

neighbors_classifiers = []
bayes_classifiers = []
tree_classifiers = []
svm_classifiers = []
forest_classifiers = []

for train_index, test_index in cv_kfold.split(x):
    neighbors_model = KNeighborsClassifier(n_neighbors=3)
    bayes_model = naive_bayes()
    tree_model = tree()
    svm_model = svm()
    forest_model = RandomForestClassifier()
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    neighbors_model.fit(X_train, y_train)
    bayes_model.fit(X_train, y_train)
    tree_model.fit(X_train, y_train)
    svm_model.fit(X_train, y_train)
    forest_model.fit(X_train, y_train)

    neighbors_classifiers.append(neighbors_model)
    bayes_classifiers.append(bayes_model)
    tree_classifiers.append(tree_model)
    svm_classifiers.append(svm_model)
    forest_classifiers.append(forest_model)

cross_neighbors = cross_val_score(KNeighborsClassifier(n_neighbors=3),
                                  x, y, cv=cv_kfold)  # truncated call completed; x, y, cv assumed from context
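The snippet breaks off mid-comparison; presumably the other four classifiers are scored the same way. A sketch of the likely continuation, reusing the same x, y and cv_kfold:

cross_bayes = cross_val_score(naive_bayes(), x, y, cv=cv_kfold)
cross_tree = cross_val_score(tree(), x, y, cv=cv_kfold)
cross_svm = cross_val_score(svm(), x, y, cv=cv_kfold)
cross_forest = cross_val_score(RandomForestClassifier(), x, y, cv=cv_kfold)
print(cross_neighbors.mean(), cross_bayes.mean(), cross_tree.mean(),
      cross_svm.mean(), cross_forest.mean())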
Example #14
def wordGraph_train():
        from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
        from sklearn.svm import SVC as svm

        from sklearn.metrics import f1_score, precision_score, recall_score
        from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
        import pickle

        imdir = '../../icdar2013/task21_22/train/image/'
        worddir = '../../icdar2013/task21_22/train/word_label/'
        chardir = '../../icdar2013/task21_22/train/char_label/'

        mywordparser = parseWord2013()
        mycharparser = parseChar2013()
        wordDataList = mywordparser.parseData(imdir, worddir)
        charDataList = mycharparser.parseData(imdir, chardir)
        DataList = cwCombine(wordDataList, charDataList)

        train_feature, train_label = plotbb_train(DataList)
        train_feature = numpy.asarray(train_feature)
        train_label = numpy.asarray(train_label)
        numpy.save('train_feature_seg', train_feature)
        numpy.save('train_label_seg', train_label)

        train_feature_width = train_feature[:, :, 2]
        train_feature_height = train_feature[:, :, 3]
        train_feature_area = train_feature[:, :, 2] * train_feature[:, :, 3]
        train_feature_aspectRatio = numpy.float32(train_feature_width) / numpy.float32(train_feature_height)
        train_feature_x = train_feature[:, :, 0]
        train_feature_y = train_feature[:, :, 1]
        train_feature_cx = train_feature_x + train_feature_width / 2
        train_feature_cy = train_feature_y + train_feature_height / 2
        edge_width_dif = abs(train_feature_width[:, 0] - train_feature_width[:, 1])
        edge_height_dif = abs(train_feature_height[:, 0] - train_feature_height[:, 1])
        edge_area_dif = abs(train_feature_area[:, 0] - train_feature_area[:, 1])
        edge_aspectRatio_dif = abs(train_feature_aspectRatio[:, 0] - train_feature_aspectRatio[:, 1])
        edge_cx_dis = abs(train_feature_cx[:, 0] - train_feature_cx[:, 1])
        edge_cy_dis = abs(train_feature_cy[:, 0] - train_feature_cy[:, 1])
        edge_width_mean = abs(train_feature_width[:, 0] + train_feature_width[:, 1]) / 2
        edge_height_mean = abs(train_feature_height[:, 0] + train_feature_height[:, 1]) / 2
        edge_area_mean = abs(train_feature_area[:, 0] + train_feature_area[:, 1]) / 2
        edge_aspectRatio_mean = abs(train_feature_aspectRatio[:, 0] + train_feature_aspectRatio[:, 1]) / 2
        edge_cu_dis = numpy.power(numpy.power(edge_cx_dis, 2) + numpy.power(edge_cy_dis, 2), 0.5)  # center Euclidean distance
        edge_bd_dis = numpy.maximum(edge_cx_dis - edge_width_mean, edge_cy_dis - edge_height_mean)  # closest boundary distance
        edge_cu_dis_norm = numpy.float32(edge_cu_dis) / numpy.float32(edge_area_mean)
        edge_bd_dis_norm = numpy.float32(edge_bd_dis) / numpy.float32(edge_area_mean)
        edge_width_dif_norm = numpy.float32(edge_width_dif) / numpy.float32(edge_width_mean)
        edge_height_dif_norm = numpy.float32(edge_height_dif) / numpy.float32(edge_height_mean)
        edge_cx_dis_norm = numpy.float32(edge_cx_dis) / numpy.float32(edge_width_mean)
        edge_cy_dis_norm = numpy.float32(edge_cy_dis) / numpy.float32(edge_height_mean)
        edge_angle_dis = numpy.arctan(numpy.float32(edge_cx_dis) / numpy.float32(edge_cy_dis))

        feature = numpy.asarray([edge_cu_dis, edge_cu_dis_norm, edge_bd_dis, edge_bd_dis_norm, edge_cx_dis, edge_cx_dis_norm,
                edge_cy_dis, edge_cy_dis_norm, edge_width_dif, edge_width_dif_norm, 
                edge_height_dif, edge_height_dif_norm, edge_angle_dis]).T
        # replace NaN or Inf with average values ###########
        feature_mean = numpy.mean(feature, axis=0)
        for i in range(feature.shape[0]):
                for j in range(feature.shape[1]):
                        if numpy.isnan(feature[i, j]) or numpy.isinf(feature[i, j]):
                                print('NaN(Inf) found!')
                                feature[i, j] = feature_mean[j]

        print('w/o sample equalization')
        f_train, f_test, l_train, l_test = train_test_split(feature, train_label, test_size=0.2)

        classifier = GradientBoostingClassifier(max_depth=1)
        classifier = classifier.fit(f_train, l_train)
        pickle.dump(classifier, open('adaboost_unequal_seg.pkl', 'wb'))

        pred_train = classifier.predict(f_train)
        pred_test = classifier.predict(f_test)

        f1_train = f1_score(l_train, pred_train)
        precision_train = precision_score(l_train, pred_train)
        recall_train = recall_score(l_train, pred_train)

        f1_test = f1_score(l_test, pred_test)
        precision_test = precision_score(l_test, pred_test)
        recall_test = recall_score(l_test, pred_test)

        print('Gradient boosting classifier training(testing):')
        print('precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')')
        print('recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')')
        print('f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')')
        print('\r\n')


        classifier = svm(kernel='linear')
        classifier = classifier.fit(f_train, l_train)
        pickle.dump(classifier, open('svm_unequal_seg.pkl', 'wb'))

        pred_train = classifier.predict(f_train)
        pred_test = classifier.predict(f_test)

        f1_train = f1_score(l_train, pred_train)
        precision_train = precision_score(l_train, pred_train)
        recall_train = recall_score(l_train, pred_train)

        f1_test = f1_score(l_test, pred_test)
        precision_test = precision_score(l_test, pred_test)
        recall_test = recall_score(l_test, pred_test)

        print('SVM classifier training(testing):')
        print('precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')')
        print('recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')')
        print('f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')')
        print('\r\n')

        classifier = RandomForestClassifier()
        classifier = classifier.fit(f_train, l_train)
        pickle.dump(classifier, open('randomForest_unequal_seg.pkl', 'wb'))

        pred_train = classifier.predict(f_train)
        pred_test = classifier.predict(f_test)

        f1_train = f1_score(l_train, pred_train)
        precision_train = precision_score(l_train, pred_train)
        recall_train = recall_score(l_train, pred_train)

        f1_test = f1_score(l_test, pred_test)
        precision_test = precision_score(l_test, pred_test)
        recall_test = recall_score(l_test, pred_test)

        print('Random Forest classifier training(testing):')
        print('precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')')
        print('recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')')
        print('f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')')
        print('\r\n')

        print('Equalized Samples: SMOTE algorithm')
        feature0 = feature[train_label == 0, ...]
        label0 = train_label[train_label == 0]

        feature1 = feature[train_label == 1, ...]
        label1 = train_label[train_label == 1]

        feature0 = SMOTE(feature0, len(feature1) // len(feature0) * 100, 3)  # integer division: SMOTE expects a whole percentage
        label0 = numpy.zeros(len(feature0))

        f_train = numpy.concatenate([feature0, feature1])
        l_train = numpy.concatenate([label0, label1])

        classifier = GradientBoostingClassifier(max_depth=1)
        classifier = classifier.fit(f_train, l_train)
        pickle.dump(classifier, open('adaboost_smote_seg.pkl', 'wb'))

        pred_train = classifier.predict(f_train)
        pred_test = classifier.predict(f_test)

        f1_train = f1_score(l_train, pred_train)
        precision_train = precision_score(l_train, pred_train)
        recall_train = recall_score(l_train, pred_train)

        f1_test = f1_score(l_test, pred_test)
        precision_test = precision_score(l_test, pred_test)
        recall_test = recall_score(l_test, pred_test)

        print('Gradient boosting classifier training(testing):')
        print('precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')')
        print('recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')')
        print('f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')')
        print('\r\n')

        classifier = svm(kernel='linear')
        classifier = classifier.fit(f_train, l_train)
        pickle.dump(classifier, open('svm_smote_seg.pkl', 'wb'))

        pred_train = classifier.predict(f_train)
        pred_test = classifier.predict(f_test)

        f1_train = f1_score(l_train, pred_train)
        precision_train = precision_score(l_train, pred_train)
        recall_train = recall_score(l_train, pred_train)

        f1_test = f1_score(l_test, pred_test)
        precision_test = precision_score(l_test, pred_test)
        recall_test = recall_score(l_test, pred_test)

        print('SVM classifier training(testing):')
        print('precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')')
        print('recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')')
        print('f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')')
        print('\r\n')

        classifier = RandomForestClassifier()
        classifier = classifier.fit(f_train, l_train)
        pickle.dump(classifier, open('randomForest_smote_seg.pkl', 'wb'))

        pred_train = classifier.predict(f_train)
        pred_test = classifier.predict(f_test)

        f1_train = f1_score(l_train, pred_train)
        precision_train = precision_score(l_train, pred_train)
        recall_train = recall_score(l_train, pred_train)

        f1_test = f1_score(l_test, pred_test)
        precision_test = precision_score(l_test, pred_test)
        recall_test = recall_score(l_test, pred_test)

        print('Random Forest classifier training(testing):')
        print('precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')')
        print('recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')')
        print('f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')')
        print('\r\n')

        print('Sample equalization with random selection')
        feature0 = feature[train_label == 0, ...]
        label0 = train_label[train_label == 0]

        feature1 = feature[train_label == 1, ...]
        label1 = train_label[train_label == 1]

        idx = numpy.random.choice(feature1.shape[0], feature0.shape[0], replace=False)
        feature1 = feature1[idx, ...]
        label1 = label1[idx, ...]

        f_train = numpy.concatenate([feature0, feature1])
        l_train = numpy.concatenate([label0, label1])

        classifier = GradientBoostingClassifier(max_depth=1)
        classifier = classifier.fit(f_train, l_train)
        pickle.dump(classifier, open('adaboost_equal_seg.pkl', 'wb'))

        pred_train = classifier.predict(f_train)
        pred_test = classifier.predict(f_test)

        f1_train = f1_score(l_train, pred_train)
        precision_train = precision_score(l_train, pred_train)
        recall_train = recall_score(l_train, pred_train)

        f1_test = f1_score(l_test, pred_test)
        precision_test = precision_score(l_test, pred_test)
        recall_test = recall_score(l_test, pred_test)

        print('Gradient boosting classifier training(testing):')
        print('precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')')
        print('recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')')
        print('f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')')
        print('\r\n')

        classifier = svm(kernel='linear')
        classifier = classifier.fit(f_train, l_train)
        pickle.dump(classifier, open('svm_equal_seg.pkl', 'wb'))

        pred_train = classifier.predict(f_train)
        pred_test = classifier.predict(f_test)

        f1_train = f1_score(l_train, pred_train)
        precision_train = precision_score(l_train, pred_train)
        recall_train = recall_score(l_train, pred_train)

        f1_test = f1_score(l_test, pred_test)
        precision_test = precision_score(l_test, pred_test)
        recall_test = recall_score(l_test, pred_test)

        print('SVM classifier training(testing):')
        print('precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')')
        print('recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')')
        print('f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')')
        print('\r\n')

        classifier = RandomForestClassifier()
        classifier = classifier.fit(f_train, l_train)
        pickle.dump(classifier, open('randomForest_equal_seg.pkl', 'wb'))

        pred_train = classifier.predict(f_train)
        pred_test = classifier.predict(f_test)

        f1_train = f1_score(l_train, pred_train)
        precision_train = precision_score(l_train, pred_train)
        recall_train = recall_score(l_train, pred_train)

        f1_test = f1_score(l_test, pred_test)
        precision_test = precision_score(l_test, pred_test)
        recall_test = recall_score(l_test, pred_test)

        print('Random Forest classifier training(testing):')
        print('precision', '%.4f' % precision_train, '(', '%.4f' % precision_test, ')')
        print('recall', '%.4f' % recall_train, '(', '%.4f' % recall_test, ')')
        print('f1 score', '%.4f' % f1_train, '(', '%.4f' % f1_test, ')')
        print('\r\n')

        return
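The SMOTE(...) call above refers to a helper that is not defined in this snippet. A minimal sketch matching the apparent signature SMOTE(T, N, k), where T holds the minority-class samples, N is the oversampling percentage (a multiple of 100) and k is the number of nearest neighbours, in the spirit of the original SMOTE algorithm (Chawla et al., 2002); this is an assumption about the project's own helper, not its actual code:

import numpy
from sklearn.neighbors import NearestNeighbors

def SMOTE(T, N, k):
    n_minority, n_features = T.shape
    n_per_sample = N // 100                        # synthetic samples per original
    nn = NearestNeighbors(n_neighbors=k + 1).fit(T)
    _, neighbours = nn.kneighbors(T)               # column 0 is the point itself
    synthetic = []
    for i in range(n_minority):
        for _ in range(n_per_sample):
            j = neighbours[i][numpy.random.randint(1, k + 1)]
            gap = numpy.random.rand(n_features)    # per-feature interpolation factors
            synthetic.append(T[i] + gap * (T[j] - T[i]))
    return numpy.asarray(synthetic)                # only the synthetic rows, as consumed above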
Example #15
def main():
    train, test = load("cancer-data-train.csv"), load("cancer-data-test.csv")
    X_train, y_train = train
    X_test, y_test = test
    X_train, X_test, print_pred = arguments(sys.argv, X_train, X_test)
    fig = plot.figure()

    # Passing training data and classes to find best C and number of leaf nodes to use. Also creating graphs to display this info
    classifier_plotter(X_train, y_train)

    # Setting up graphs for each plot
    ax1 = fig.add_subplot(234)
    ax1.set_title('Average Precision Scores')
    ax1.set_ylabel('Precision Score')
    ax1.set_xlabel('Classifier')
    ax2 = fig.add_subplot(235)
    ax2.set_title('Average Recall Scores')
    ax2.set_ylabel('Recall Score')
    ax2.set_xlabel('Classifier')
    ax3 = fig.add_subplot(236)
    ax3.set_title('Average F-measures')
    ax3.set_ylabel('F-measure')
    ax3.set_xlabel('Classifier')

    # Create and train the classifiers
    classifier_svm = svm(kernel='linear', C=0.1)
    classifier_gini = dt(criterion='gini', max_leaf_nodes=10)
    classifier_ig = dt(criterion='entropy', max_leaf_nodes=5)
    classifier_lda = lda()
    classifier_svm.fit(X_train, y_train)
    classifier_gini.fit(X_train, y_train)
    classifier_ig.fit(X_train, y_train)
    classifier_lda.fit(X_train, y_train)

    # Make the predictions
    pred_svm = classifier_svm.predict(X_test)
    pred_gini = classifier_gini.predict(X_test)
    pred_ig = classifier_ig.predict(X_test)
    pred_lda = classifier_lda.predict(X_test)

    # Calculate the precision, recall, f-measure
    avg_precision_svm = average_precision_score(y_test, pred_svm)
    avg_precision_gini = average_precision_score(y_test, pred_gini)
    avg_precision_ig = average_precision_score(y_test, pred_ig)
    avg_precision_lda = average_precision_score(y_test, pred_lda)
    recall_svm = recall_score(y_test, pred_svm, average='weighted')
    recall_gini = recall_score(y_test, pred_gini, average='weighted')
    recall_ig = recall_score(y_test, pred_ig, average='weighted')
    recall_lda = recall_score(y_test, pred_lda, average='weighted')
    f_svm = f1_score(y_test, pred_svm, average='weighted')
    f_gini = f1_score(y_test, pred_gini, average='weighted')
    f_ig = f1_score(y_test, pred_ig, average='weighted')
    f_lda = f1_score(y_test, pred_lda, average='weighted')

    ################## Extra Credit #########################
    # Train classifier and make predictions on test set
    classifier_rfc = rfc(n_estimators=100, max_depth=2)
    classifier_rfc.fit(X_train, y_train)
    pred_rfc = classifier_rfc.predict(X_test)

    #Calculate precision, recall and f-measure for Random Forest Classifier
    avg_precision_rfc = average_precision_score(y_test, pred_rfc)
    recall_rfc = recall_score(y_test, pred_rfc, average='weighted')
    f_rfc = f1_score(y_test, pred_rfc, average='weighted')
    #########################################################

    # Printing scores and predictions
    print_scores([[
        avg_precision_svm, avg_precision_gini, avg_precision_ig,
        avg_precision_lda, avg_precision_rfc
    ], [recall_svm, recall_gini, recall_ig, recall_lda, recall_rfc],
                  [f_svm, f_gini, f_ig, f_lda, f_rfc]])
    print_predictions([pred_svm, pred_gini, pred_ig, pred_lda, pred_rfc],
                      print_pred)

    # Create the graphs for the scores
    score_plotter(ax1, [
        avg_precision_svm, avg_precision_gini, avg_precision_ig,
        avg_precision_lda, avg_precision_rfc
    ])
    score_plotter(ax2,
                  [recall_svm, recall_gini, recall_ig, recall_lda, recall_rfc])
    score_plotter(ax3, [f_svm, f_gini, f_ig, f_lda, f_rfc])

    plot.tight_layout(w_pad=1.5, h_pad=2.0)
    plot.show()
Example #16
def classifier_plotter(X_train, y_train):
    '''
    Runs the training data through SVM, DT-Gini and DT-IG with multiple C values and max_leaf_nodes settings.
    The method then creates a graph from the average of the cross-validation scores for each C value or max_leaf_nodes setting.

    Params:
    X_train:
        Feature rows, already standardized, from the initial dataset
    y_train:
        Class labels for X_train taken from the original dataset

    Return:
    Outputs a graph of the average cross-validation scores.
    '''
    i, d = 1, 0

    # Values to test
    c_values = [0.01, 0.1, 1, 10, 100]
    k_values = [2, 5, 10, 20]
    classifiers = ["SVM", "DT-Gini & DT-IG"]

    for clf in classifiers:
        count = 1
        if clf == "SVM":
            if d == 0:
                ax = plot.subplot(231)
                ax.set_title(clf)
                plot.ylabel('F-measure')
                plot.xlabel('C values')
                d += 1
            print('SVM')
            for c in c_values:
                classi = svm(kernel='linear', C=c).fit(X_train, y_train)
                scores = cross_val_score(classi, X_train, y_train, cv=10)
                ax.plot(str(c), scores.mean(), 'bs')
                print('%d.) %.4f%%' % (count, scores.mean() * 100))
                count += 1
            plot.axis([None, None, 0.90, 1])
            print('\n')
            i += 1
            d = 0

        elif clf == "DT-Gini & DT-IG":
            count = 1
            if d == 0:
                ax = plot.subplot(232)
                plot.ylabel('F-measure')
                plot.xlabel('Max Leaf Nodes')
            print('    Gini\tIG')
            for k in k_values:
                gini_class = dt(criterion='gini', max_leaf_nodes=k)
                ig_class = dt(criterion='entropy', max_leaf_nodes=k)
                score_gini = cross_val_score(gini_class, X_train, y_train, cv=10)
                score_ig = cross_val_score(ig_class, X_train, y_train, cv=10)
                ax.plot(str(k), score_gini.mean(), 'r.', str(k),
                        score_ig.mean(), 'g.')
                print('%d.) %.4f%%\t%.4f%%' %
                      (count, score_gini.mean() * 100, score_ig.mean() * 100))
                count += 1
            plot.axis([None, None, 0.889, 0.96])
            ax.legend(('Gini', 'IG'), loc=2)
            print('\n')
            i += 1
            d = 0

        else:
            return "Should not get here."
Example #17
import glob
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.svm import LinearSVC as svm
import pickle

li = []
for infile in glob.glob('*/*'):
    a = cv2.imread(infile)
    gray = cv2.cvtColor(a, cv2.COLOR_BGR2GRAY)
    li.append(cv2.resize(gray, (350, 350)).flatten())
li = np.asarray(li)

x = li[0]
plt.imshow(x.reshape(350, 350))

y = []
for infile in glob.glob('*/*'):
    path = infile.split('/')
    y.append(path[0])
y = np.asarray(y)


clf = knn()   # instantiated but never fit in this example
clf2 = svm()
clf2.fit(li, y)

pickle.dump(clf2, open("model", 'wb'))
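Loading the pickled model back for prediction, a sketch reusing the "model" file written above:

with open("model", 'rb') as f:
    clf_loaded = pickle.load(f)
print(clf_loaded.predict(li[:1]))   # class of the first image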
