def _select(feature_file,
            feature_file_path,
            train_samples_range,
            num_replicates,
            num_test_samples,
            models,
            output_dir,
            config_file=None):
    """helper function to do model selection, used with either config_file or
    another set of arguments passed to hvc.select"""
    labels = np.asarray(feature_file['labels'])
    # call grab_n_samples_by_song this first time to get indices for the
    # test/validation set, and a list of song IDs from which we will draw
    # the training set indices below
    test_IDs, train_song_ID_list = grab_n_samples_by_song(
        feature_file['songfile_IDs'],
        feature_file['labels'],
        num_test_samples,
        return_popped_songlist=True)
    test_labels = labels[test_IDs]

    score_arr = np.zeros(
        (len(train_samples_range), num_replicates, len(models)))
    avg_acc_arr = np.zeros(
        (len(train_samples_range), num_replicates, len(models)))
    pred_labels_arr = np.empty(
        (len(train_samples_range), num_replicates, len(models)),
        dtype='O')
    train_IDs_arr = np.empty(
        (len(train_samples_range), num_replicates), dtype='O')
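    # dtype='O' (object) lets each cell hold a variable-length array,
    # e.g. a vector of predicted labels whose length depends on the test set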

    for num_samples_ind, num_train_samples in enumerate(train_samples_range):
        for replicate in range(num_replicates):
            print('Training models with {0} samples, replicate #{1}'.format(
                num_train_samples, replicate))
            # here we call grab_n_samples again with the train_song_ID_list
            # from above. Currently each fold is a random grab, without
            # anything like k-folds. For testing on large datasets this is
            # okay, but in data-limited situations it's less than ideal;
            # after all, the whole point is to avoid hand-labeling a large
            # data set.
            train_IDs = grab_n_samples_by_song(feature_file['songfile_IDs'],
                                               feature_file['labels'],
                                               num_train_samples,
                                               song_ID_list=train_song_ID_list)
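            # a sketch of one possible k-fold alternative (not what this code
            # does), assuming scikit-learn is available; StratifiedKFold would
            # keep label proportions consistent across folds:
            #   from sklearn.model_selection import StratifiedKFold
            #   skf = StratifiedKFold(n_splits=num_replicates)
            #   for fold_IDs, _ in skf.split(feature_file['features'], labels):
            #       ...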
            train_IDs_arr[num_samples_ind, replicate] = train_IDs
            train_labels = labels[train_IDs]
            for model_ind, model_dict in enumerate(models):

                # lazy-imports to avoid loading all of
                # scikit-learn and tensorflow if possible
                if model_dict['model_name'] == 'svm':
                    if 'SVC' not in locals():
                        from sklearn.svm import SVC

                elif model_dict['model_name'] == 'knn':
                    if 'neighbors' not in locals():
                        from sklearn import neighbors

                elif model_dict['model_name'] == 'flatwindow':
                    if 'flatwindow' not in locals():
                        from hvc.neuralnet.models.flatwindow import flatwindow
                        from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping

                # save info associated with model, such as indices of
                # training samples. Note this is done outside the if/elif
                # that switches between models.
                model_output_dir = os.path.join(
                    output_dir, determine_model_output_folder_name(model_dict))
                if not os.path.isdir(model_output_dir):
                    os.makedirs(model_output_dir)

                model_fname_str = \
                    '{0}_{1}samples_replicate{2}.model'.format(model_dict['model_name'],
                                                               num_train_samples,
                                                               replicate)
                model_filename = os.path.join(model_output_dir,
                                              model_fname_str)

                # if-elif that switches based on model type,
                # start with sklearn models
                if model_dict['model_name'] in model_types['sklearn']:

                    # if model_dict specifies using a certain feature group
                    if 'feature_group' in model_dict:
                        # determine if we already figured out which features belong to that feature group.
                        # Can only do that if model_dict defined for todo_list, not if model_dict defined
                        # at top level of select config file
                        if 'feature_list_indices' in model_dict:
                            feature_inds = np.in1d(
                                feature_file['features_arr_column_IDs'],
                                model_dict['feature_list_indices'])
                        else:  # have to figure out feature list indices
                            ftr_grp_ID_dict = feature_file[
                                'feature_group_ID_dict']
                            ftr_list_grp_ID = feature_file[
                                'feature_list_group_ID']
                            # figure out what they are by finding ID # corresponding to feature
                            # group or groups in ID_dict, and then finding all the indices in the
                            # feature list that have that group ID #, using ftr_list_grp_ID, a list
                            # the same length as feature list where each element indicates whether
                            # the element with the same index in the feature list belongs to a
                            # feature group and if so which one, by ID #
                            if type(model_dict['feature_group']) == str:
                                ftr_grp_ID = ftr_grp_ID_dict[
                                    model_dict['feature_group']]
                                # now find all the indices of features associated with the
                                # feature group for that model
                                ftr_list_inds = [
                                    ind
                                    for ind, val in enumerate(ftr_list_grp_ID)
                                    if val == ftr_grp_ID
                                ]

                            # if user specified more than one feature group
                            elif type(model_dict['feature_group']) == list:
                                ftr_list_inds = []
                                for ftr_grp in model_dict['feature_group']:
                                    ftr_grp_ID = ftr_grp_ID_dict[ftr_grp]
                                    # now find all the indices of features associated with the
                                    # feature group for that model
                                    ftr_list_inds.extend([
                                        ind for ind, val in enumerate(
                                            ftr_list_grp_ID)
                                        if val == ftr_grp_ID
                                    ])
                            # finally use ftr_list_inds to get the actual columns we need from the
                            # features array. Need to do this because multiple columns might
                            # belong to the same feature, e.g. if the feature is a spectrum
                            feature_inds = np.in1d(
                                feature_file['features_arr_column_IDs'],
                                ftr_list_inds)
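                            # e.g. np.in1d([0, 0, 1, 2, 2], [0, 2]) returns
                            # array([ True,  True, False,  True,  True]),
                            # a boolean mask over the feature-array columns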
                            # put feature list indices in model dict so we have it later when
                            # saving summary file
                            model_dict['feature_list_indices'] = ftr_list_inds

                    elif 'feature_list_indices' in model_dict and\
                            'feature_group' not in model_dict:
                        # if no feature group in model dict, use feature list indices
                        # Note that for neuralnet models, there will be neither
                        if type(model_dict['feature_list_indices']) is str:
                            if model_dict['feature_list_indices'] == 'all':
                                feature_inds = np.ones(
                                    (feature_file['features_arr_column_IDs'].
                                     shape[-1], )).astype(bool)
                            else:
                                raise ValueError(
                                    'received invalid string for feature_list_indices: {}'
                                    .format(
                                        model_dict['feature_list_indices']))
                        else:
                            # use 'feature_list_indices' from model_dict to get the actual
                            # columns we need from the features array. Again, need to do this
                            # because multiple columns might belong to the same feature,
                            # e.g. if the feature is a spectrum
                            feature_inds = np.in1d(
                                feature_file['features_arr_column_IDs'],
                                model_dict['feature_list_indices'])

                    if model_dict['model_name'] == 'svm':
                        print('training svm. ', end='')
                        clf = SVC(C=model_dict['hyperparameters']['C'],
                                  gamma=model_dict['hyperparameters']['gamma'],
                                  decision_function_shape='ovr',
                                  probability=model_dict['predict_proba'])

                    elif model_dict['model_name'] == 'knn':
                        print('training knn. ', end='')
                        clf = neighbors.KNeighborsClassifier(
                            model_dict['hyperparameters']['k'],
                            weights='distance')

                    # use 'advanced indexing' to get only sample rows and
                    # only the feature columns we need
                    features_train = feature_file['features'][
                        train_IDs[:, np.newaxis], feature_inds]
                    scaler = StandardScaler()
                    features_train = scaler.fit_transform(features_train)

                    features_test = feature_file['features'][
                        test_IDs[:, np.newaxis], feature_inds]
                    features_test = scaler.transform(features_test)

                    print('fitting model. ', end='')
                    clf.fit(features_train, train_labels)
                    score = clf.score(features_test, test_labels)
                    print('score on test set: {:05.4f} '.format(score), end='')
                    score_arr[num_samples_ind, replicate, model_ind] = score
                    pred_labels = clf.predict(features_test)
                    pred_labels_arr[num_samples_ind, replicate,
                                    model_ind] = pred_labels
                    acc_by_label, avg_acc = get_acc_by_label(
                        test_labels, pred_labels,
                        feature_file['labels_to_use'])
                    print(', average accuracy on test set: {:05.4f}'.format(
                        avg_acc))
                    avg_acc_arr[num_samples_ind, replicate,
                                model_ind] = avg_acc
                    joblib.dump(clf, model_filename)

                # this is the middle of the if-elif that switches based on model type
                # end sklearn, start keras models
                elif model_dict['model_name'] in model_types['keras']:
                    if 'neuralnet_input' in model_dict:
                        neuralnet_input = model_dict['neuralnet_input']
                        spects = feature_file['neuralnet_inputs'][
                            neuralnet_input]
                    else:
                        # if not specified, assume that input should be the one that
                        # corresponds to the neural net model being trained
                        neuralnet_input = model_dict['model_name']
                        try:
                            spects = feature_file['neuralnet_inputs'][
                                neuralnet_input]
                        except KeyError:
                            raise KeyError(
                                'no input specified for model {}, and '
                                'input type for that model was not found in '
                                'feature file'.format(
                                    model_dict['model_name']))

                    if 'SpectScaler' not in locals():
                        from hvc.neuralnet.utils import SpectScaler

                    if 'test_labels_onehot' not in locals():
                        from sklearn.preprocessing import LabelBinarizer
                        label_binarizer = LabelBinarizer()
                        test_labels_onehot = label_binarizer.fit_transform(
                            test_labels)

                    if 'test_spects' not in locals():
                        # get spects for test set
                        # (the channel axis for keras conv2d gets added below)
                        test_spects = spects[test_IDs, :, :]

                    train_labels_onehot = label_binarizer.transform(
                        train_labels)

                    # get spects for train set
                    # (the channel axis for keras conv2d gets added below)
                    train_spects = spects[train_IDs, :, :]

                    # scale all spects by mean and std of training set
                    spect_scaler = SpectScaler()
                    # concatenate all spects then rotate so
                    # Hz bins are columns, i.e., 'features'
                    spect_scaler.fit(train_spects)
                    train_spects_scaled = spect_scaler.transform(train_spects)
                    test_spects_scaled = spect_scaler.transform(test_spects)

                    # have to add a 'channels' axis for keras 2-d convolution,
                    # even though these are spectrograms and don't have
                    # channels like an image would.
                    # Left explicit here instead of hiding it in a function
                    train_spects_scaled = train_spects_scaled[:, :, :,
                                                              np.newaxis]
                    test_spects_scaled = test_spects_scaled[:, :, :,
                                                            np.newaxis]
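                    # e.g. shape (num_samples, num_freqbins, num_timebins)
                    # becomes (num_samples, num_freqbins, num_timebins, 1)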

                    num_samples, num_freqbins, num_timebins, num_channels = \
                        train_spects_scaled.shape
                    num_label_classes = len(feature_file['labels_to_use'])
                    input_shape = (num_freqbins, num_timebins, num_channels)
                    flatwin = flatwindow(input_shape=input_shape,
                                         num_label_classes=num_label_classes)

                    csv_str = ''.join([
                        'flatwindow_training_',
                        '{}_samples_'.format(num_train_samples),
                        'replicate_{}'.format(replicate), '.log'
                    ])
                    csv_filename = os.path.join(model_output_dir, csv_str)
                    csv_logger = CSVLogger(csv_filename,
                                           separator=',',
                                           append=True)

                    checkpoint = ModelCheckpoint(model_filename,
                                                 monitor='val_acc',
                                                 verbose=1,
                                                 save_best_only=True,
                                                 save_weights_only=False,
                                                 mode='max')
                    earlystop = EarlyStopping(monitor='val_acc',
                                              min_delta=0,
                                              patience=20,
                                              verbose=1,
                                              mode='auto')
                    callbacks_list = [csv_logger, checkpoint, earlystop]

                    flatwin.fit(
                        train_spects_scaled,
                        train_labels_onehot,
                        validation_data=(test_spects_scaled,
                                         test_labels_onehot),
                        batch_size=model_dict['hyperparameters']['batch_size'],
                        epochs=model_dict['hyperparameters']['epochs'],
                        callbacks=callbacks_list,
                        verbose=1)

                    pred_labels = flatwin.predict(test_spects_scaled,
                                                  batch_size=32,
                                                  verbose=1)
                    pred_labels = label_binarizer.inverse_transform(
                        pred_labels)

                    score = accuracy_score(test_labels, pred_labels)
                    print('score on test set: {:05.4f} '.format(score), end='')
                    score_arr[num_samples_ind, replicate, model_ind] = score

                    pred_labels_arr[num_samples_ind, replicate,
                                    model_ind] = pred_labels

                    acc_by_label, avg_acc = get_acc_by_label(
                        test_labels, pred_labels,
                        feature_file['labels_to_use'])
                    print(', average accuracy on test set: {:05.4f}'.format(
                        avg_acc))
                    avg_acc_arr[num_samples_ind, replicate,
                                model_ind] = avg_acc

                model_meta_fname_str = \
                    '{0}_{1}samples_replicate{2}.meta'.format(model_dict['model_name'],
                                                              num_train_samples,
                                                              replicate)
                model_meta_filename = os.path.join(model_output_dir,
                                                   model_meta_fname_str)
                model_meta_output_dict = {
                    'model_filename': model_filename,
                    'config_file': config_file,
                    'feature_file': feature_file_path,
                    'test_IDs': test_IDs,
                    'train_IDs': train_IDs,
                    'model_name': model_dict['model_name'],
                    'pred_labels': pred_labels,
                    'test_labels': test_labels
                }

                if 'scaler' in locals():
                    model_meta_output_dict['scaler'] = scaler
                    # have to delete scaler so it's not still in memory on the
                    # next pass through the loop (e.g. because a different
                    # model that doesn't use scaler is tested next)
                    del scaler
                elif 'spect_scaler' in locals():
                    # neural net models use a scaler on the spectrogram
                    # instead of the vanilla sklearn scaler
                    model_meta_output_dict['spect_scaler'] = spect_scaler
                    del spect_scaler

                if 'label_binarizer' in locals():
                    model_meta_output_dict['label_binarizer'] = label_binarizer

                if model_dict['model_name'] in model_types['sklearn']:
                    # to be able to extract features for predictions
                    # on an unlabeled data set, need the list of features
                    if ((type(model_dict['feature_list_indices']) is str)
                            and (model_dict['feature_list_indices'] == 'all')):
                        model_feature_list = feature_file['feature_list']
                    else:
                        model_feature_list = [
                            feature_file['feature_list'][ind]
                            for ind in model_dict['feature_list_indices']
                        ]
                    model_meta_output_dict['feature_list'] = model_feature_list
                elif model_dict['model_name'] in model_types['keras']:
                    model_meta_output_dict['feature_list'] = [neuralnet_input]
                # save meta file for each trained model, so the model file
                # can be used later for prediction
                joblib.dump(model_meta_output_dict, model_meta_filename)

    # after looping through all samples + replicates
    output_filename = os.path.join(
        output_dir, 'summary_model_select_file_created_' + timestamp())
    select_summary_dict = {
        'config_file': config_file,
        'feature_file': feature_file_path,
        'train_samples_range': train_samples_range,
        'num_replicates': num_replicates,
        'model_dict': model_dict,
        'test_IDs': test_IDs,
        'train_IDs_arr': train_IDs_arr,
        'score_arr': score_arr,
        'avg_acc_arr': avg_acc_arr,
        'pred_labels_arr': pred_labels_arr,
    }
    joblib.dump(select_summary_dict, output_filename)
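
# A minimal usage sketch of _select (hypothetical values; assumes a feature
# file created by hvc.extract and loaded with joblib, and model dicts that
# match the hvc select config schema):
#   feature_file = joblib.load('features.ftr')
#   _select(feature_file,
#           feature_file_path='features.ftr',
#           train_samples_range=range(100, 400, 100),
#           num_replicates=5,
#           num_test_samples=400,
#           models=[{'model_name': 'knn',
#                    'feature_list_indices': 'all',
#                    'hyperparameters': {'k': 4}}],
#           output_dir='./select_output')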
Example #2
import math
import pickle
import sys

import cv2
import face_recognition
import numpy as np
from sklearn import neighbors


def trainAlgo(imageArr, labelArr, DIR_NAME):

    X_train = []
    y_labels = []
    model_save_path = str(DIR_NAME) + "_knn.clf"
    n_neighbors = 3
    #model_save_path = None
    #n_neighbors = None
    knn_algo = 'ball_tree'
    verbose = False

    proto = "ML/deploy.prototxt.txt"
    caffmodel = "ML/res10_300x300_ssd_iter_140000.caffemodel"
    confid = 0.99

    net = cv2.dnn.readNetFromCaffe(proto, caffmodel)

    for x in range(len(imageArr)):
        #print("Training Identity " + labelArr[x] + " " + str(x))
        sys.stdout.write("\r" + str(x + 1) + " of " + str(len(imageArr)) +
                         " has been processed")
        sys.stdout.flush()
        try:
            count = 0
            imageA = np.array(imageArr[x])
            #imageRGB = cv2.cvtColor(imageA, cv2.COLOR_BGR2RGB)
            (h, w) = imageA.shape[:2]
            blob = cv2.dnn.blobFromImage(cv2.resize(imageA, (300, 300)), 1.0,
                                         (300, 300), (104.0, 177.0, 123.0))

            net.setInput(blob)
            detections = net.forward()

            for i in range(0, detections.shape[2]):

                # print(detections.shape)
                count += 1
                confidence = detections[0, 0, i, 2]
                if confidence > confid and count == 1:
                    box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
                    (startX, startY, endX, endY) = box.astype("int")
                    #face_bounding_boxes = "("+startX+","+endX+","+startY+","+endY+")"
                    roi = imageA[startY:endY, startX:endX]
                    #print(face_recognition.face_encodings(roi))
                    X_train.append(face_recognition.face_encodings(roi)[0])
                    y_labels.append(labelArr[x])

        except Exception as e:
            print("")
            print(e)

    if n_neighbors is None:
        n_neighbors = int(round(math.sqrt(len(X_train))))
        if verbose:
            print("Chose n_neighbors automatically:", n_neighbors)

    # Create and train the KNN classifier
    knn_clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors,
                                             algorithm=knn_algo,
                                             weights='distance')
    knn_clf.fit(X_train, y_labels)

    # Save the trained KNN classifier
    if model_save_path is not None:
        with open(model_save_path, 'wb') as f:
            pickle.dump(knn_clf, f)
        print("**Training Completed**")

    return knn_clf
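
# Hypothetical usage sketch (assumes lists of loaded images and matching
# name labels, plus the Caffe model files under ML/):
#   knn_clf = trainAlgo(images, names, "office_dataset")
#   encodings = face_recognition.face_encodings(some_face_roi)
#   print(knn_clf.predict(encodings))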
Example #3
import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, train_test_split

train, test = train_test_split(df, test_size=0.4, random_state=999)

print(type(train))
train.reset_index(inplace=True)
test.reset_index(inplace=True)
# print(test.to_string())

# shuffle = False if there is a temporal dimension
cv = KFold(n_splits=27, shuffle=False)

for i, weights in enumerate(['uniform', 'distance']):
    total_scores = []
    for n_neighbors in range(1, 30):
        fold_accuracy = []
        knn = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        for train_fold, test_fold in cv.split(train):
            # train/test split for this fold
            f_train = train.loc[train_fold]
            f_test = train.loc[test_fold]
            # train and run the model
            knn.fit(X=f_train.drop(['attack'], axis=1), y=f_train['attack'])
            y_pred = knn.predict(X=f_test.drop(['attack'], axis=1))
            # model evaluation
            acc = accuracy_score(f_test['attack'], y_pred)
            fold_accuracy.append(acc)
        total_scores.append(sum(fold_accuracy) / len(fold_accuracy))

    # the tail of this call was truncated; the label kwarg and the
    # legend/show calls below are reconstructed assumptions
    plt.plot(range(1, len(total_scores) + 1),
             total_scores,
             label=weights)

plt.legend()
plt.show()
Example #4
from sklearn import neighbors
from utilities import load_magic04, load_wine, scale_data, train_model, tune_hyperparameters, model_complexity, learning_curve

df, factors, response = load_wine()
# df, factors, response = load_magic04()
df_train, df_test = scale_data(df, response)

classifier = neighbors.KNeighborsClassifier(weights="distance")
train_model(classifier, df_train, None, factors, response)

best_params = tune_hyperparameters(classifier, df_train, factors, response, {
    "n_neighbors": range(1, 20),
    "p": range(1, 4)
})
# "n_neighbors": range(1, 20) "p": range(1, 4) "metric": ["minkowski","euclidean","manhattan","chebyshev"] "weights": ["uniform", "distance"]

model_complexity(
    neighbors.KNeighborsClassifier(weights="distance", p=best_params["p"]),
    df_train, factors, response, {"n_neighbors": range(1, 30)}, "n_neighbors")

classifier = neighbors.KNeighborsClassifier(
    weights="distance",
    n_neighbors=best_params["n_neighbors"],
    p=best_params["p"])
train_model(classifier, df_train, df_test, factors, response, "Final ")

learning_curve(classifier, df_train, factors, response)
Example #5
import joblib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn import datasets, neighbors

iris = datasets.load_iris()
n_neighbors = 15  # assumed value; not defined in the original snippet

# slicing by using a two-dim dataset
X = iris.data[:, :2]
print(X)
print(X.shape)
y = iris.target
print(y)
print(y.shape)
h = .02  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

for weights in ['uniform', 'distance']:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)
    joblib.dump(clf, 'model.pkl')
    clf = joblib.load('model.pkl')
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
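    # np.c_[xx.ravel(), yy.ravel()] flattens the mesh into an
    # (n_points, 2) array so every grid point can be classified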
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
h = .02

#optimal number of neighbors
n_neighbors = 31

#color maps
from matplotlib.colors import ListedColormap

cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

from sklearn import neighbors

for weights in ['uniform', 'distance']:
    #create instance of KNNClassifier and fit
    clf = neighbors.KNeighborsClassifier(n_neighbors, algorithm='ball_tree', weights=weights)
    clf.fit(x_pca, y)

    #plot decision boundary; assign color to each
    #point in the mesh [x_min, x_max] x [y_min, y_max]
    x_min, x_max = x_pca[:, 0].min() - 1, x_pca[:, 0].max() + 1
    y_min, y_max = x_pca[:, 1].min() - 1, x_pca[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    #put result in color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
def draw_k_nearest(polarity=None,
                   subjectivity=None,
                   names_dem=None,
                   names_rep=None,
                   k_neighbors=3):
    '''
    This function generates an image showing the results of a KNN on an input scatterplot
    :param polarity: dictionary containing the average polarity values of each person
    :param subjectivity: dictionary containing average subjectivity of each person
    :param names_dem: list of names of all democrats in dataset
    :param names_rep: list of names of all republicans in dataset
    :param k_neighbors: K value for KNN calculation
    :return: None, image shown on screen
    '''
    assert isinstance(polarity, dict)
    assert isinstance(subjectivity, dict)
    assert isinstance(names_dem, list)
    assert isinstance(names_rep, list)
    assert all(isinstance(i, str) for i in names_dem)
    assert all(isinstance(i, str) for i in names_rep)
    assert isinstance(k_neighbors, int)
    assert k_neighbors > 0

    cmap_back = ListedColormap(['#00AAFF', '#FFAAAA'])

    X = []
    y = []
    x_dem = []
    x_rep = []
    for name in names_dem:
        X.append([polarity[name], subjectivity[name]])
        x_dem.append([polarity[name], subjectivity[name]])
        y.append(0)
    for name in names_rep:
        X.append([polarity[name], subjectivity[name]])
        x_rep.append([polarity[name], subjectivity[name]])
        y.append(1)
    X = np.array(X)
    x_dem = np.array(x_dem)
    x_rep = np.array(x_rep)
    y = np.array(y)
    h = .001

    clf = neighbors.KNeighborsClassifier(k_neighbors, weights='distance')
    clf.fit(X, y)

    x_min, x_max = X[:, 0].min() - 0.05, X[:, 0].max() + 0.05
    y_min, y_max = X[:, 1].min() - 0.05, X[:, 1].max() + 0.05
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_back)

    plt.scatter(x_dem[:, 0],
                x_dem[:, 1],
                color='b',
                marker='o',
                edgecolors='black',
                linewidths=1,
                label='Democrats')
    plt.scatter(x_rep[:, 0],
                x_rep[:, 1],
                color='#FF0000',
                marker='^',
                edgecolors='black',
                linewidths=1,
                label='Republicans')

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title('Democrats vs Republicans')
    plt.xlabel('Polarity')
    plt.ylabel('Subjectivity')
    plt.legend()
    plt.show()
df['nbh'] = le.transform(df.neighbourhood_cleansed)

tfid = TfidfVectorizer()
ttext = tfid.fit_transform(df['comments'])

#print top_feats_in_doc(ttext, tfid.get_feature_names(), 1, 10)
#sys.exit(0)

print "%d %d" % (ttext.shape[0], len(df['nbh']))
X_train, X_test, y_train, y_test = \
    train_test_split(ttext, df['nbh'],
    test_size=0.2, random_state=1)

rs = 1
ests = [
    neighbors.KNeighborsClassifier(3),
    RandomForestClassifier(random_state=rs)
]

ests_labels = np.array(['KNeighbors', 'RandomForest'])

for i, e in enumerate(ests):
    e.fit(X_train, y_train)
    this_score = metrics.accuracy_score(y_test, e.predict(X_test))
    scorestr = "%s: Accuracy Score %0.2f" % (ests_labels[i], this_score)
    print()
    print(scorestr)
    print("-" * len(scorestr))
    print(metrics.classification_report(y_test,
                                        e.predict(X_test),
                                        target_names=le.classes_))
import pandas as pd
import os
from sklearn import neighbors, model_selection

data_dir = 'E:/'
titanic_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
print(titanic_train.info())
print(titanic_train.columns)

X_train = titanic_train[['SibSp', 'Parch']]
y_train = titanic_train['Survived']
knn_estimator = neighbors.KNeighborsClassifier()
knn_estimator.fit(X_train, y_train)
model_selection.cross_val_score(knn_estimator,
                                X_train,
                                y_train,
                                scoring="accuracy",
                                cv=5).mean()

titanic_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
print(titanic_test.info())
X_test = titanic_test[['SibSp', 'Parch']]
titanic_test['Survived'] = knn_estimator.predict(X_test)
titanic_test.to_csv(os.path.join(data_dir, 'submission.csv'),
                    columns=['PassengerId', 'Survived'],
                    index=False)
Example #10
    return (yp)


# In[53]:

accuracyModel(test_separate, simple_logistic_separate)
test['predicted'] = accuracyModel(test, simple_logistic_separate)

# In[54]:

test.plot.scatter(x='predicted', y='target')

# In[59]:

classifiers = [
    neighbors.KNeighborsClassifier(20, weights='distance', n_jobs=-1),
    linear_model.LogisticRegression(C=10e10, penalty='l1', n_jobs=-1),
    RandomForestClassifier(n_estimators=1000,
                           min_samples_split=5,
                           max_depth=None,
                           n_jobs=-1),
    DecisionTreeClassifier('gini')
]

label_columns = ["Classifier", "Accuracy"]
label = pd.DataFrame(columns=label_columns)

nFolds = 8
shuffleSplit = StratifiedShuffleSplit(n_splits=nFolds,
                                      test_size=0.25,
                                      random_state=3)
Example #11
        x = a[xx]
        indices.append(xx)
        cdr = dataprovider.get_CDR(x)
        ll = dataprovider.retrieve_full_data(x)
        #CrossSectionalData.show_slices([ll[:,:,50]])
        if cdr is None or cdr > 1:
            continue
        feat = AlzheimerFeatures.surrounding_points_discrete_with_pos(
            ll, step, step_2, [dataprovider.get_gender(x)])
        print(len(feat))
        allfeatures1.append(regressor.predict(feat)[0:cut])
        #plt.plot(regressor.predict(feat))
        #plt.show()
        ally1.append(cdr)

    rbf_svc = neighbors.KNeighborsClassifier(n_neighbors=7)
    rbf_svc.fit(allfeatures1, ally1)

    ll = dataprovider.retrieve_full_data(56)
    error = 0
    index = 0
    for xx in range(len(a)):
        x = a[xx]
        cdr = dataprovider.get_CDR(x)
        if cdr is None or cdr > 1 or xx in indices:
            continue
        ll = dataprovider.retrieve_full_data(x)
        feat = AlzheimerFeatures.surrounding_points_discrete_with_pos(
            ll, step, step_2, [dataprovider.get_gender(x)])
        predictq = regressor.predict(feat)[:cut]
        #plt.plot(predict)
Example #12
def woolyTrain():
    '''Trains two models:
        1.  SVC with PCA
        2.  k nearest neighbors

    Then does a prediction afterwards'''

    import pickle
    import sklearn.model_selection as model_selection
    import sklearn.decomposition as decomposition
    import sklearn.preprocessing as preprocessing
    import sklearn.neighbors as neighbors
    import sklearn.svm as svm
    import PIL
    import os
    import numpy
    import scipy.stats as stats

    d = 'Photos\\Processed'
    with open('y.dat', 'rb') as f:
        dc = pickle.load(f)

    y = []
    c = []
    X = []

    normi = stats.norm.ppf(0.5 + (1. / 6.))

    for k in dc:
        #        yi = (k['air'] - k['air_mean'])/k['air_std']
        yi = (k['precip'] - k['mean']) / k['std']

        if yi <= -normi:
            ci = 'below'
        elif yi <= normi:
            ci = 'normal'
        else:
            ci = 'above'

        im = PIL.Image.open(os.path.join(d, k['photo']))

        xi = numpy.array(im)
        h, w, nrgb = numpy.shape(xi)

        X.append(xi.flatten())
        y.append(yi)
        c.append(ci)

    #split!
    Xt, Xs, ct, cs = model_selection.train_test_split(X, c, test_size=0.10)

    #PCA.  100 components
    n = min(100, len(Xt))
    pca = decomposition.KernelPCA(n_components=n, kernel='rbf')
    pca.fit(Xt)

    Xtp = pca.transform(Xt)
    Xsp = pca.transform(Xs)

    #sklearn!  SVC
    trans = preprocessing.QuantileTransformer(output_distribution='normal')
    Xtpn = trans.fit_transform(Xtp)
    Xspn = trans.transform(Xsp)

    svc = svm.SVC(kernel='rbf', gamma='auto', C=1., probability=True)
    svc.fit(Xtpn, ct)
    sc = [svc.score(Xtpn, ct), svc.score(Xspn, cs)]  #perfect fit, duh
    print('SVC with PCA:  {:.0%} training, {:.0%} test'.format(*sc))

    #huge file for some reason, return the best model for processing
    #dump trans, svc, ct, Xtpn, and then fit in function.
    df = {'svc': svc, 'trans': trans, 'pca': pca}
    with open('precip model.dat', 'wb') as f:
        pickle.dump(df, f)

    #sklearn! Nearest neighbor
    neigh = neighbors.KNeighborsClassifier()
    neigh.fit(Xt, ct)
    sn = [neigh.score(Xt, ct), neigh.score(Xs, cs)]  #perfect fit, duh
    print('Nearest neighbor:  {:.0%} training, {:.0%} test'.format(*sn))

    print('Returning models')
    #huge file for some reason, return the best model for processing
    return svc, neigh, pca, trans
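
# Hypothetical usage sketch (assumes y.dat and the Photos\Processed folder
# exist):
#   svc, neigh, pca, trans = woolyTrain()
#   label = neigh.predict([new_image_array.flatten()])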
Example #13
print('Distance of c1 in training set:')
print('{:18s} = {:.4f}'.format('Mean', mu_c1))
print('{:18s} = {:.4f}'.format('Standard deviation', sd_c1))
print('{:18s} = {:.4f}\n'.format('Threshold', threshold_c1))

pass_rate = utils.get_rate(x_passed_s2, x_passed_s1)
print(f'Pass rate = {pass_rate * 100:.4f}%')

if pass_rate == 0:
    raise Exception('All samples are blocked by Reliability check')

# Stage 3 - Decidability
print('\n---------- Decidability ----------------')
model_knn = knn.KNeighborsClassifier(n_neighbors=k,
                                     n_jobs=-1,
                                     weights='distance')
model_knn.fit(x_train, y_train)

x_passed_s3, ind_passed_s3 = ad.check_decidability(x_passed_s2, pred_passed_s2,
                                                   model_knn)

# Print
pass_rate = utils.get_rate(x_passed_s3, x_passed_s2)
print(f'Pass rate = {pass_rate * 100:.4f}%')

if pass_rate == 0:
    raise Exception('All samples are blocked by Decidability check')

x_passed_ad = x_passed_s3
Example #14
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn import neighbors
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


def plot_mult_decision_boundary(ax,
                                X,
                                y,
                                k,
                                scaled=True,
                                title='Title',
                                xlabel='xlabel',
                                ylabel='ylabel',
                                hard_class=True):
    """Plot the decision boundary of a kNN classifier.
    
    Builds and fits a sklearn kNN classifier internally.

    X must contain only 2 continuous features.

    Function modeled on sci-kit learn example.

    Parameters
    ----------
    ax: Matplotlib axes object
        The plot to draw the data and boundary on
        
    X: numpy array
        Training data
    
    y: numpy array
        Target labels
    
    k: int
        The number of neighbors that get a vote.
        
    scaled: boolean, optional (default=True)
        If true scales the features, else uses features in original units
    
    title: string, optional (default = 'Title')
        A string for the title of the plot
    
    xlabel: string, optional (default = 'xlabel')
        A string for the label on the x-axis of the plot
    
    ylabel: string, optional (default = 'ylabel')
        A string for the label on the y-axis of the plot
    
    hard_class: boolean, optional (default = True)
        Use hard (deterministic) boundaries vs. soft (probabilistic) boundaries
    

    Returns
    -------
    None
    """
    x_mesh_step_size = 0.1
    y_mesh_step_size = 0.01

    #Hard code in colors for classes, one class in red, one in blue
    bg_colors = np.array(
        [np.array([255, 150, 150]) / 255,
         np.array([150, 150, 255]) / 255])
    cmap_light = ListedColormap(bg_colors)
    cmap_bold = ListedColormap(['#FF0000', '#0000FF'])

    #Build a kNN classifier
    clf = neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')

    if scaled:
        #Build pipeline to scale features
        clf = make_pipeline(StandardScaler(), clf)
        clf.fit(X, y)
    else:
        clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, m_max]x[y_min, y_max].
    x_min, x_max = 45, 85
    y_min, y_max = 2, 4

    xx, yy = np.meshgrid(np.arange(x_min, x_max, x_mesh_step_size),
                         np.arange(y_min, y_max, y_mesh_step_size))
    if hard_class:
        dec_boundary = clf.predict(np.c_[xx.ravel(),
                                         yy.ravel()]).reshape(xx.shape)
        ax.pcolormesh(xx, yy, dec_boundary, cmap=cmap_light)
        ax.scatter(X[:, 0], X[:, 1], c='black')
    else:
        dec_boundary = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
        colors = dec_boundary.dot(bg_colors)
        ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        ax.imshow(colors.reshape(200, 400, 3),
                  origin="lower",
                  aspect="auto",
                  extent=(x_min, x_max, y_min, y_max))

    ax.set_title(title + ", k={0}, scaled={1}".format(k, scaled))
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_xlim((x_min, x_max))
    ax.set_ylim((y_min, y_max))
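
# Hypothetical usage sketch (assumes a 2-column feature array whose ranges
# roughly match the hard-coded axis limits, 45-85 on x and 2-4 on y):
#   fig, ax = plt.subplots()
#   plot_mult_decision_boundary(ax, X, y, k=5, title='kNN boundary',
#                               xlabel='feature 1', ylabel='feature 2')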
#df = df.drop(df[df.label=='4u-Amantha'].index)

df.describe()

X = df[list(df.columns)[1:-1]]
y = df['label']

#apply preprocessing to X
#X = preprocessing.scale(X)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
# fit on the training split only, so the test-set score is not inflated
clf.fit(X_train, y_train)

y_predictions = clf.predict(X_test)

#for i in range(0,len(y_predictions)):
#print y_predictions[i], y_test.as_matrix()[i]

print('Accuracy:', clf.score(X_test, y_test))

#printing the training data size for each element
print(collections.Counter(y_train.factorize()[0]))

#draw confusion matrix
#get all labels
#le = preprocessing.LabelEncoder()
                      lr__C = [1]
                     )

lr_grid_search = GridSearchCV(lr_pipe,
                              lr_parameters,
                              cv = cv, 
                              scoring = 'accuracy')
lr_grid_search.fit(trainData[Predictors], trainData['Survived'])


# In[76]:

kn_pipe = Pipeline(steps = [('feature_union', FeatureUnion([('pca', PCA()),
                                                            ('select_KBest', SelectKBest())
                                                           ])),
                            ('kn', neighbors.KNeighborsClassifier())
                            ])

kn_parameters = dict(feature_union__pca__n_components = [30],
                      feature_union__pca__whiten = [True],
                      feature_union__select_KBest__k = [45],
                      kn__n_neighbors = [4],
                      kn__algorithm = ['auto'],
                      kn__leaf_size = [10],
                      kn__weights = ['uniform'],
                      kn__p = [1]
                     )

kn_grid_search = GridSearchCV(kn_pipe,
                              kn_parameters,
                              cv = cv, 
Example #17
            max = scores.mean()
    print(x, "   ", max)
    if max > Gmax:
        Gmax = max
        kernel = x
print("the best kernel is ", kernel)

# ## KNN

# In[17]:

from sklearn import neighbors
best_k = 0
best_score = 0
for j in range(1, 60):
    classifier = neighbors.KNeighborsClassifier(n_neighbors=j)
    max_score = -1
    for i in range(3, 20):
        scores = kcv(classifier, features_scaled, ans, cv=i)
        if scores.mean() > max_score:
            max_score = scores.mean()
    if best_score < max_score:
        best_k = j
        best_score = max_score
print("best value of k ", best_k, " with mean score ", best_score)

# ## Naive Bayes

# In[40]:

from sklearn import naive_bayes as nb
Example #18
                    continue
                if i == '?':
                    cur.append(0)
                else:
                    cur.append(float(i))
        except ValueError as e:
            print("error", e, "on line", sz)

        print("Processing sample " + str(sz) + " = ", cur)
        if sz < TRAINING_TUPLES:
            Xnow.append(cur[:-1])  # learn more about slice on SO
            Ynow.append(cur[-1:][0])
        else:
            testTuple.append(cur)

aknn = neighbors.KNeighborsClassifier(2, weights='distance')

Xtrain = np.array(Xnow)  # training samples as a numpy array
Ytrain = np.array(Ynow)  # result (label) of each sample

print(Xtrain, Ytrain)
aknn.fit(Xtrain, Ytrain)

for tup in testTuple:
    cur = []
    for i in range(12):
        cur.append(tup[i])

    y = aknn.predict(np.array(cur).reshape(1, -1))  # sklearn expects 2-D input
    real = tup[-1:][0]  # take the last value
    if (y <= 1):
Example #19
@author: lucas
"""
import numpy as np
from sklearn import preprocessing, model_selection, neighbors
import pandas as pd

df = pd.read_csv('house-votes-84.data')
df.replace('?', -9999, inplace=True)
df.replace('republican', 0, inplace=True)
df.replace('democrat', 1, inplace=True)
df.replace('y', 1, inplace=True)
df.replace('n', 0, inplace=True)
#df.drop(['id'],1,inplace=True)
print(df)
X = np.array(df.drop(['party'], axis=1))
y = np.array(df['party'])

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2)

clf = neighbors.KNeighborsClassifier()
clf.fit(X_train, y_train)

accuracy = clf.score(X_test, y_test)
print(accuracy)

example_measure = np.array([0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1])
example_measure = example_measure.reshape(1, -1)

prediction = clf.predict(example_measure)
print(prediction)
Example #20
    def __init__(self, data, labels, training_fraction, arguments):
        super().__init__(data, labels, training_fraction)
        # n_neighbors = int(input("Choose a number of neighbors:"))
        # self.model = n.KNeighborsClassifier(n_neighbors)
        self.model = n.KNeighborsClassifier()
Example #21
	testImageReconConcat = np.concatenate((testImageReconConcat, testImage), axis=1)
	for K in valuesOfK:
		output = averageFace
		for i in range(0, K):
			output = np.add(output, eigenFaces[i] * lowDimTestImage[i])
		cv2.putText(output,'K='+str(K),(0,20), font, 0.5,(255,255,255))
		testImageReconConcat = np.concatenate((testImageReconConcat, output), axis=1)
	# Display result at 2x size
	cv2.imshow("Train image reconstruction",cv2.resize(trainImageReconConcat, (0,0), fx=2, fy=2) )
	cv2.imshow("Test image reconstruction", cv2.resize(testImageReconConcat, (0,0), fx=2, fy=2))



	# classification now
	# lowDimImages contains array of image vectors of train images (320 such vectors)
	faceClassifier = neighbors.KNeighborsClassifier(n_neighbors = 3)
	y=[]
	# filling y with target class. every 8 images is 1 class, total 40 classes
	for i in range(0,40):
		faceClass = i
		for j in range(0,8):
			y.append(faceClass)
	faceClassifier.fit(lowDimImages, y)

	#classifier is now trained. we now load test images and their base truth classes

	groundTruth = []
	# filling groundTruth with target class. every 2 images is 1 class, total 40 classes
	for i in range(0,40):
		faceClass = i
		for j in range(0,2):
Example #22
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=random.seed())

# set up cross-validation ...
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True)

scores = []

# to select the optimal parameter k
from sklearn import neighbors
for k in range(1, 30):
    score = 0
    clf = neighbors.KNeighborsClassifier(k)
    for learn, test in kf.split(X_train):
        X_train_val = [X_train[i] for i in learn]
        Y_train_val = [Y_train[i] for i in learn]
        clf.fit(X_train_val, Y_train_val)
        X_test_val = [X_train[i] for i in test]
        Y_test_val = [Y_train[i] for i in test]
        score = score + clf.score(X_test_val, Y_test_val)
    scores.append(score)

# optimal value of k (scores[0] corresponds to k=1, hence the +1):
k_opt = scores.index(max(scores)) + 1

# display all the scores. We observe:
# - scores for small values of k (<= 5 or 10) are close to each other
# - scores drop noticeably for larger values of k
df = pd.read_csv(
    "C:/Users/Sangameswaran/WebstormProjects/WonderWoman/PythonScripts/crime.csv"
)
df = df.drop(['crimetime'], axis=1)
X = np.array(df.drop(['type'], axis=1))
y = np.array(df['type'])

elliptic = EllipticEnvelope(contamination=0.15)
elliptic.fit(X)
prediction = elliptic.predict([[latitude, longitude]])

if prediction == -1:
    possibility = "Safe zone"
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = neighbors.KNeighborsClassifier(n_neighbors=5)
    clf.fit(X_train, y_train)
    clf.score(X_test, y_test)
    val = np.array([[latitude, longitude]])
    p = clf.predict(val)
    if p == 0:
        possibility = "Sexual abuse"
    elif p == 1:
        possibility = "Robbery"
    elif p == 2:
        possibility = "Rape"
    elif p == 3:
        possibility = "Homicide"

print(possibility)
Example #24
import numpy as np
from sklearn import preprocessing, model_selection, neighbors
import pandas as pd

accuracies = []
for i in range(25):
    df = pd.read_csv('breast-cancer-wisconsin.data')
    df.replace('?', -99999, inplace=True)
    df.drop(['id'], axis=1, inplace=True)

    X = np.array(df.drop(['class'], axis=1))
    y = np.array(df['class'])

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2)

    clf = neighbors.KNeighborsClassifier(n_jobs=-1)
    clf.fit(X_train, y_train)

    accuracy = clf.score(X_test, y_test)
    accuracies.append(accuracy)

print(sum(accuracies) / len(accuracies))
# print(accuracy)

# example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1], [4, 2, 1, 2, 2, 2, 3, 2, 1]])
# example_measures = example_measures.reshape(len(example_measures), -1)

# prediction = clf.predict(example_measures)
# print(prediction)
Example #25
def compute_accuracy_nn(data_train, labels_train, data_test, labels_test, k=5):
    clf = neighbors.KNeighborsClassifier(k, weights="distance")
    return compute_accuracy_classifier(clf, data_train, labels_train,
                                       data_test, labels_test)
Example #26
#Classifying oranges, apples, and pears
from sklearn import neighbors
#Features: weight (g), orange, red, green, texture (0=smooth, 1=wrinkled)
features = [
    [120, 0, 1, 0, 0],
    [110, 0, 1, 0, 0],
    [125, 0, 1, 0, 0],  # apples
    [150, 1, 0, 0, 1],
    [170, 1, 0, 0, 1],
    [145, 1, 0, 0, 1],  # oranges
    [80, 0, 0, 1, 0],
    [70, 0, 0, 1, 0],
    [90, 0, 0, 1, 0]
]  # pears
labels = [
    'maca', 'maca', 'maca', 'laranja', 'laranja', 'laranja', 'pera', 'pera',
    'pera'
]
clf = neighbors.KNeighborsClassifier(3)  # number of neighbors
clf = clf.fit(features, labels)
print(clf.predict([[90, 0, 0, 1, 0]]))
print("Producing KFold indexes")
kfold = cv.KFold(amount, n_folds=10, shuffle=True)

print("Evaluating model with KFold")
counter = 0
errors = numpy.zeros(len(kfold))
wrongs = []
for train_index, test_index in kfold:
    print(counter)
    trainFeatures = [features[i] for i in train_index]
    trainClasses = [classes[i] for i in train_index]
    testFeatures = [features[i] for i in test_index]
    testClasses = [classes[i] for i in test_index]

    model = neighbors.KNeighborsClassifier(n_neighbors=1)
    model.fit(trainFeatures, trainClasses)

    predictedClasses = model.predict(testFeatures)
    errors[counter] = errorRate(testClasses, predictedClasses)
    for i in range(len(testClasses)):
        if testClasses[i] != predictedClasses[i]:
            wrongs.insert(0, (predictedClasses[i], testClasses[i]))
    print(errors[counter])
    counter = counter + 1

wrongDict = dict()
for pred, actual in wrongs:
    if actual in wrongDict:
        wrongDict[actual].insert(0, pred)
    else:
Example #28
for i in range(len(x_mean)):
    for j in range(len(x_mean[i])):
        sentence.append(str(x_mean[i][j]))
    sentence.append(str(y[i]))
    ch_dfa.write(' '.join(sentence))
    ch_dfa.write('\n')
    sentence = []
    ch_dfa.flush()
TP, FP, FN, TN = 0, 0, 0, 0
x_array = np.array(x)
y_array = np.array(y)
usx = x_array
usy = y_array
x_train, x_test, y_train, y_test = train_test_split(
    usx, usy, test_size=0.2)  #test_size: proportion of train/test data
clf = neighbors.KNeighborsClassifier(algorithm='kd_tree')
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)
for i in range(len(y_predict)):
    if y_test[i] == 1 and y_predict[i] == 1:
        TP += 1
    if y_test[i] == 0 and y_predict[i] == 1:
        FP += 1
    if y_test[i] == 1 and y_predict[i] == 0:
        FN += 1
    if y_test[i] == 0 and y_predict[i] == 0:
        TN += 1
print('TP: ' + str(TP))
print('FP: ' + str(FP))
print('FN: ' + str(FN))
print('TN: ' + str(TN))
Example #29
    def algorithm_compare():
        # read in the data
        data = []
        labels = []
        factors = Factor.objects.all()
        for factor in factors:
            temp = []
            temp.append(factor.organic_matter)
            temp.append(factor.total_nitrogen)
            temp.append(factor.available_P)
            temp.append(factor.available_K)
            data.append(temp)
            labels.append(factor.land_capability)
        x = np.array(data)
        y = np.array(labels)
        # print x
        # print y

        # split into training and test data
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

        # Decision Tree Fit
        dt_clf = tree.DecisionTreeClassifier(criterion='entropy')
        # print(dt_clf)
        dt_clf.fit(x_train, y_train)
        # fname = "decisiontree.dot"
        # joblib.dump(dt_clf, sys.path[0] + '\\static\\model_file\\' + fname,compress=3)

        # KNN Fit
        knn_clf = neighbors.KNeighborsClassifier(algorithm='kd_tree')
        # print(knn_clf)
        knn_clf.fit(x_train, y_train)
        # fname = "knntree.dot"
        # joblib.dump(knn_clf, sys.path[0] + '\\static\\model_file\\' + fname,compress=3)

        # SVM Fit
        # C sets the penalty on misclassification; too large a value may cause overfitting
        svm_clf_linear = svm.LinearSVC(C=3.5)
        # print(svm_clf_rbf)
        svm_clf_linear.fit(x_train, y_train)
        # fname = "svmtree.dot"
        # joblib.dump(svm_clf_linear, sys.path[0] + '\\static\\model_file\\' + fname,compress=3)

        # LR Fit
        lr_clf = linear_model.LogisticRegression()
        # print(lr_clf)
        lr_clf.fit(x_train, y_train)
        # fname = "lrtree.dot"
        # joblib.dump(lr_clf, sys.path[0] + '\\static\\model_file\\' + fname,compress=3)

        # print test results
        dt_answer = dt_clf.predict(x_train)
        # print(dt_answer)
        # print(y_train)

        # precision and recall
        precision, recall, fbeta_score, support = precision_recall_fscore_support(
            y_test, dt_clf.predict(x_test))
        dt_x_answer = dt_clf.predict(x)
        # print dt_x_answer
        # print y
        dt_report = classification_report(
            y, dt_x_answer,
            target_names=['class 1', 'class 2', 'class 3',
                          'class 4', 'class 5', 'class 6'])

        # print test results
        knn_answer = knn_clf.predict(x_train)
        # print(x_train)
        # print(knn_answer)
        # print(y_train)
        # print(np.mean(answer == y_train))

        # precision and recall
        precision, recall, fbeta_score, support = precision_recall_fscore_support(
            y_test, knn_clf.predict(x_test))
        knn_x_answer = knn_clf.predict(x)
        # print knn_x_answer
        # print y
        knn_report = classification_report(
            y, knn_x_answer,
            target_names=['class 1', 'class 2', 'class 3',
                          'class 4', 'class 5', 'class 6'])

        # print test results
        svm_answer = svm_clf_linear.predict(x_train)
        # print(x_train)
        # print(svm_answer)
        # print(y_train)
        # print(np.mean(answer == y_train))

        # precision and recall
        precision, recall, fbeta_score, support = precision_recall_fscore_support(
            y_test, svm_clf_linear.predict(x_test))
        svm_x_answer = svm_clf_linear.predict(x)
        # print svm_x_answer
        # print y
        svm_report = classification_report(
            y, svm_x_answer,
            target_names=['class 1', 'class 2', 'class 3',
                          'class 4', 'class 5', 'class 6'])

        # print test results
        lr_answer = lr_clf.predict(x_train)
        # print(x_train)
        # print(lr_answer)
        # print(y_train)
        # print(np.mean(answer == y_train))

        # precision and recall
        precision, recall, fbeta_score, support = precision_recall_fscore_support(
            y_test, lr_clf.predict(x_test))
        lr_x_answer = lr_clf.predict(x)
        # print lr_x_answer
        # print y
        lr_report = classification_report(
            y, lr_x_answer,
            target_names=['class 1', 'class 2', 'class 3',
                          'class 4', 'class 5', 'class 6'])

        data = [x, y, x_train, y_train, x_test, y_test]

        clf = [dt_clf, knn_clf, svm_clf_linear, lr_clf]

        clf_data = [
            dt_answer, dt_x_answer, dt_report,
            knn_answer, knn_x_answer, knn_report,
            svm_answer, svm_x_answer, svm_report,
            lr_answer, lr_x_answer, lr_report,
        ]

        return data, clf, clf_data
print(f"target_names: {iris.target_names}")
X = iris.data[:, :2]
print(f'X: {len(X)}, {type(X)}')
print(f'X head: {(X[:5])}, {type(X)}')
y = iris.target
print(f"y: {len(y)}, {type(y)}")
d = dict()
for i in y:
    if i in d:
        d[i] += 1
    else:
        d[i] = 1
print(f"y counts: {d}, {type(d)}")

n_neighbors = 15
clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors,
                                     algorithm='kd_tree')
clf.fit(X, y)
print(clf.get_params())
results = clf.predict([[4.8, 3.3]])
print(results)
nearest = clf.kneighbors([[4.8, 3.3]])
nearest_neighbors = []
for each in nearest[1][0]:
    nearest_neighbors.append((X[each], y[each]))
print(nearest_neighbors)
# [0]
# [(array([4.8, 3.4]), 0), (array([4.8, 3.4]), 0), (array([4.7, 3.2]), 0),
# (array([4.7, 3.2]), 0), (array([4.8, 3.1]), 0), (array([5. , 3.3]), 0),
# (array([4.9, 3.1]), 0), (array([4.6, 3.2]), 0), (array([4.9, 3.1]), 0),
# (array([5. , 3.2]), 0), (array([4.6, 3.4]), 0), (array([5. , 3.4]), 0),
# (array([5. , 3.4]), 0), (array([4.6, 3.1]), 0), (array([5. , 3.5]), 0)]