Example #1
def knnSimulate(param):
    trainSet = SimData.simulate2Group(
        n = int(param['n']),
        p = int(param['p']),
        effect = [param['effect']] * int(param['p'])
    )
    knnFit = KNeighborsClassifier(n_neighbors=int(param['k']))
    knnFit.fit(np.array(trainSet['x']), np.array(trainSet['y']))
    testSet = SimData.simulate2Group(
        n = int(param['n']),
        p = int(param['p']),
        effect = [param['effect']] * int(param['p'])
    )
    out = OrderedDict()
    out['p'] = int(param['p'])
    out['k'] = int(param['k'])
    out['train'] = trainSet
    out['test'] = testSet
    out['resubPreds'] = knnFit.predict(trainSet['x'])
    out['resubProbs'] = knnFit.predict_proba(trainSet['x'])
    out['testPreds'] = knnFit.predict(testSet['x'])
    out['testProbs'] = knnFit.predict_proba(testSet['x'])
    out['resubTable'] = pd.crosstab(
        Series(out['resubPreds'], index=trainSet['y'].index),
        trainSet['y']
    )
    out['resubAccuracy'] = (np.sum(np.diag(out['resubTable'])) /
                            (1.0 * np.sum(np.sum(out['resubTable']))))
    out['testTable'] = pd.crosstab(
        Series(out['testPreds'], index=testSet['y'].index),
        testSet['y']
    )
    out['testAccuracy'] = (np.sum(np.diag(out['testTable'])) /
                           (1.0 * np.sum(np.sum(out['testTable']))))
    return out
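A note on dependencies: SimData.simulate2Group is a project helper that is not shown here. A minimal stand-in consistent with how it is called above (returning a dict with a DataFrame 'x' and a Series 'y'; the two-group design and all names below are assumptions, not the original implementation):

import numpy as np
import pandas as pd

def simulate2Group(n, p, effect):
    # Hypothetical stand-in: n samples in two roughly equal groups, p features,
    # with the second group's feature means shifted by `effect`.
    half = n // 2
    x0 = np.random.randn(half, p)
    x1 = np.random.randn(n - half, p) + np.asarray(effect)
    x = pd.DataFrame(np.vstack([x0, x1]))
    y = pd.Series(['A'] * half + ['B'] * (n - half), index=x.index)
    return {'x': x, 'y': y}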
Example #2
def get_knn(input_list, y_train, n_neighbors=301):
    X_train, X_test = input_list
    # pca = PCA(n_components=5)
    # X_train = pca.fit_transform(X_train)
    # X_test = pca.transform(X_test)

    if sparse.issparse(X_train):
        X_train = X_train.toarray()
        X_test = X_test.toarray()

    n_samples = X_train.shape[0]
    n_categs = len(np.unique(y_train))
    kfolds = StratifiedKFold(y_train, 2)
    X_train_features = np.zeros([n_samples, n_categs])

    knn = KNN(n_neighbors=n_neighbors)
    for train, test in kfolds:
        X1 = X_train[train, :]
        y1 = y_train[train]
        X2 = X_train[test, :]
        knn.fit(X1, y1)
        X_train_features[test, :] = knn.predict_proba(X2)

    knn.fit(X_train, y_train)
    X_test_features = knn.predict_proba(X_test)

    features_manh = [X_train_features, X_test_features]
    return features_manh
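A hedged usage sketch for get_knn, assuming KNN aliases sklearn's KNeighborsClassifier and StratifiedKFold comes from the pre-0.18 sklearn.cross_validation API (as the two-argument call above implies):

import numpy as np
from scipy import sparse
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.cross_validation import StratifiedKFold  # old sklearn API

X_train = np.random.randn(200, 10)
X_test = np.random.randn(50, 10)
y_train = np.random.randint(0, 2, size=200)

# n_neighbors must not exceed the per-fold training size, hence 5 here
train_feats, test_feats = get_knn([X_train, X_test], y_train, n_neighbors=5)
print(train_feats.shape, test_feats.shape)  # (200, 2) (50, 2)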
Example #3
def knn(
    series,
    n_folds,
    clfparams,
    featureparams,
    aggregateparams,
    include,
    exclude,
    save_test_predictions,
    save_oob_predictions,
    skip_cross_validation,
    _run,
):
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    data.features_to_scale.append("location")
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ["predict_{}".format(i) for i in range(3)]
    best_eps = 1e-15
    if skip_cross_validation:
        loss = 999.0
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0.0, index=y.index, columns=pred_cols)
        i = 1
        _run.info["loss"] = []
        _run.info["trainloss"] = []
        feature_importances_ = 0
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(itrain, itest, **aggregateparams)

            clf = KNeighborsClassifier(**clfparams)
            clf.fit(Xtr, ytr)  # , weights)
            pred.iloc[itest, :] = clf.predict_proba(Xte)
            i += 1

        def obj(x):
            return multiclass_log_loss(y.values, pred.values, eps=10.0 ** x)

        res = minimize(obj, -2.0)
        best_eps = 10 ** (res.x[0])
        loss = multiclass_log_loss(y, pred.values, eps=best_eps)
        _run.info["best_eps"] = best_eps
        _run.info["features"] = list(Xtr.columns)
        # Optionally save oob predictions
        if save_oob_predictions:
            filename = "{}_{}.csv".format(series, time)
            pred.to_csv(filename, index_label="id")
    # Optionally generate test predictions
    if save_test_predictions:
        filename = "{}_test_{}.csv".format(series, time)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)

        clf = KNeighborsClassifier(**clfparams)
        clf.fit(Xtr, ytr)  # ,weights)
        predtest = pd.DataFrame(clf.predict_proba(Xte), index=yte.index, columns=pred_cols)
        predtest = predtest.clip(best_eps, 1 - best_eps)
        predtest /= predtest.values.sum(axis=1)[:, np.newaxis]
        predtest.to_csv(filename, index_label="id")
    return loss
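multiclass_log_loss is not defined in the snippet; a minimal numpy sketch consistent with how it is called above (integer labels, an (n_samples, n_classes) probability array, and an eps used for clipping):

import numpy as np

def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    # Clip and renormalize rows, then average the negative log-probability
    # assigned to each sample's true class.
    p = np.clip(y_pred, eps, 1 - eps)
    p = p / p.sum(axis=1)[:, np.newaxis]
    n = p.shape[0]
    return -np.mean(np.log(p[np.arange(n), np.asarray(y_true, dtype=int)]))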
Example #4
class KNeighborsClassifierStep(Step):
    def __init__(self,
                 n_neighbors=5,
                 weights='uniform',
                 algorithm='auto',
                 leaf_size=30):
        super(KNeighborsClassifierStep, self).__init__()
        self._model = None
        self._n_neighbors=n_neighbors
        self._weights=weights
        self._algorithm = algorithm
        self._leaf_size = leaf_size

    def fit_transform(self):
        self._model = KNeighborsClassifier(n_neighbors=self._n_neighbors,
                                           weights=self._weights,
                                           algorithm=self._algorithm,
                                           leaf_size=self._leaf_size)
        x, y = load_svmlight(self._input_path)
        self._model.fit(x, y)
        scores = self._model.predict_proba(x)
        save_numpy_txt(scores, self._output_path)

    def transform(self, x=None):
        if x is None:
            x, _ = load_svmlight(self._test_input_path)
            transformed_x = self._model.predict_proba(x)
            save_numpy_txt(transformed_x, self._output_path)
        else:
            transformed_x = self._model.predict_proba(x)
            return transformed_x

    def predict(self, features):
        return self._model.predict_proba(features)
Example #5
def main():
    """
    Fit models and make predictions.
    We'll use one-hot encoding to transform our categorical features
    into binary features.
    y and X will be numpy array objects.
    """
    model = KNeighborsClassifier(n_neighbors=20)  # the classifier we'll use

    # === load data in memory === #
    print "loading data"
    y, X = load_data('train.csv')
    y_test, X_test = load_data('test.csv', use_labels=False)

    # === one-hot encoding === #
    # we want to encode the category IDs encountered both in
    # the training and the test set, so we fit the encoder on both
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # Returns a sparse matrix (see numpy.sparse)
    X_test = encoder.transform(X_test)

    # if you want to create new features, you'll need to compute them
    # before the encoding, and append them to your dataset after

    # === training & metrics === #
    mean_auc = 0.0
    n = 10  # repeat the CV procedure 10 times to get more precise results
    for i in range(n):
        # for each iteration, randomly hold out 20% of the data as CV set
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i*SEED)

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it

        # train model and make predictions
        model.fit(X_train, y_train) 
        preds = model.predict_proba(X_cv)[:, 1]

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc)
        mean_auc += roc_auc

    print "Mean AUC: %f" % (mean_auc/n)

    # === Predictions === #
    # When making predictions, retrain the model on the whole training set
    model.fit(X, y)
    preds = model.predict_proba(X_test)[:, 1]
    filename = raw_input("Enter name for submission file: ")
    save_results(preds, filename + ".csv")
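load_data and save_results are project helpers that are not shown. A plausible minimal sketch under the assumptions visible above (label in the first CSV column, integer-coded categorical features, and an id,prediction submission format; all of this is hypothetical):

import numpy as np

def load_data(filename, use_labels=True):
    # Hypothetical helper: column 0 is the label, the rest are
    # integer-coded categorical features.
    data = np.loadtxt(filename, delimiter=',', skiprows=1)
    y = data[:, 0].astype(int) if use_labels else np.zeros(len(data), dtype=int)
    X = data[:, 1:].astype(int)
    return y, X

def save_results(predictions, filename):
    # Hypothetical helper: write an id,prediction submission CSV.
    with open(filename, 'w') as f:
        f.write('id,prediction\n')
        for i, p in enumerate(predictions, start=1):
            f.write('%d,%f\n' % (i, p))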
Example #6
File: knn.py  Project: 9627872/OpenDL
def knn(train_data,train_label,val_data,val_label,test_data,name = "knn_submission.csv"):
	print "Start training KNN Classifier..."
	knnClf = KNeighborsClassifier(n_neighbors=20)
	knnClf.fit(train_data,train_label)
	#evaluate on validation set
	val_pred_label = knnClf.predict_proba(val_data)
	logloss = preprocess.evaluation(val_label,val_pred_label)
	print "logloss of validation set:",logloss

	print "Start classify test set..."
	test_label = knnClf.predict_proba(test_data)
	preprocess.saveResult(test_label,filename = name)
Example #7
File: module4_knn.py  Project: mircean/ML
def process_one_cell(df_cell_train, df_cell_test):
    
    #Working on df_train
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 5).values
    df_cell_train = df_cell_train.loc[mask]
    
    #Working on df_test
    row_ids = df_cell_test.index
    
    #Feature engineering on x and y
    df_cell_train.loc[:,'x'] *= 462.0
    df_cell_train.loc[:,'y'] *= 975.0
    df_cell_test.loc[:,'x'] *= 462.0
    df_cell_test.loc[:,'y'] *= 975.0

    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    
    #Applying the classifier, ct = 5.3 #5.1282
    clf = KNeighborsClassifier(n_neighbors=np.floor(np.sqrt(y.size)/5.2).astype(int),
                               weights=calculate_distance, metric='manhattan', n_jobs=2)
    clf.fit(X, y)
    y_pred = clf.predict_proba(df_cell_test.values)
    ##1
    #pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]) 
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:n_topx]) 
    
    return pred_labels, row_ids
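calculate_distance is a custom weights callable used by several of these grid-cell examples but never defined. scikit-learn passes the raw neighbor-distance array to such a callable and expects same-shape weights; a common choice in kernels of this style is inverse-distance weighting, sketched here as an assumption rather than the original code:

import numpy as np

def calculate_distance(distances):
    # Assumed inverse-square weighting; the small constant guards
    # against division by zero for exact duplicates.
    return 1.0 / (distances ** 2 + 1e-10)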
Example #8
def process_one_cell(df_cell_train, df_cell_test):
    
    #Working on df_train
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[mask]
    
    #Working on df_test
    row_ids = df_cell_test.index
    
    #Feature engineering on x and y
    df_cell_train.loc[:,'x'] *= 500.0
    df_cell_train.loc[:,'y'] *= 1000.0
    df_cell_test.loc[:,'x'] *= 500.0
    df_cell_test.loc[:,'y'] *= 1000.0
    
    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance, 
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]) 
    
    return pred_labels, row_ids
Example #9
def main_process():
    data_dict = parse_txt()
    x_data, y_data, places_cnt, path_int_dict = build_x_y_data(data_dict)
    print 'data counts', len(x_data), len(y_data)
    print 'zone names counts', places_cnt
    print 'path counts', len(path_int_dict)

    # start to train, change list type to numpy.array
    x_data = np.array(x_data)
    y_data = np.array(y_data)

    knn = KNeighborsClassifier()

    indices = np.random.permutation(len(x_data))
    x_train = x_data
    y_train = y_data
    x_test = x_data[indices[-TEST_DATA_ROWS:]]
    y_test = y_data[indices[-TEST_DATA_ROWS:]]
    knn.fit(x_train, y_train)  # work

    test_result = knn.predict(x_test)  # test
    proba_test_result = knn.predict_proba(x_test)

    # no duplicate value, so reverse this dictionary
    int_path_dict = dict(zip(path_int_dict.values(), path_int_dict.keys()))

    print 'predict result:', test_result
    print [int_path_dict[x] for x in test_result]  # test result
Example #10
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=conf['neighbours'], weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
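conf is read from the enclosing module and is not shown; a minimal stand-in covering the single key used here (the value is a hypothetical placeholder):

conf = {'neighbours': 25}  # only conf['neighbours'] is read above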
Example #11
def process_1_grid(df_train, df_test, grid, threshold):

	# Creating data with the particular grid id.
	df_train_1_grid = df_train.loc[df_train.grid_num == grid]
	df_test_1_grid = df_test.loc[df_test.grid_num == grid]
	place_counts = df_train_1_grid.place_id.value_counts()
	mask = (place_counts[df_train_1_grid.place_id.values] >= threshold).values
	df_train_1_grid = df_train_1_grid.loc[mask]
	# Label Encoding
	le = LabelEncoder()
	labels = le.fit_transform(df_train_1_grid.place_id.values)
	
	# Computing train and test feature data for grid grid.
	X = df_train_1_grid.drop(['place_id','grid_num'], axis=1).values.astype(int)
	X_test = df_test_1_grid.drop(['grid_num'], axis=1).values.astype(int)
	row_id = df_test_1_grid.index
	
	# KNN Classifier 
	clf = KNeighborsClassifier(n_neighbors=20, weights= 'distance', metric='manhattan')
	#clf = GaussianNB()
	# Training of the classifier
	#clf = XGBClassifier(max_depth=10, learning_rate=0.5, n_estimators=25,objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
	clf.fit(X,labels)

	
	# Predicting probabilities for each of the label for test data.
	prob_y = clf.predict_proba(X_test)
	
	# Transforming the sorted probability columns back to the original place-id labels
	pred_labels = le.inverse_transform(np.argsort(prob_y, axis=1)[:,::-1][:,:3])
	return pred_labels, row_id
Example #12
File: knn.py  Project: mkdmkk/infaas
class PatternBasedDiagnosis:
    """
    Pattern Based Diagnosis with k-Nearest Neighbors
    """

    __slots__ = [
        "model"
    ]

    def __init__(self):
        pass

    def train(self, data, labels):
        """
        Train the k-NN model with the training data
        :param data:
        :param labels:
        :return:
        """
        print('Training Data: %s' % (data))
        print('Training Labels: %s' % (labels))
        self.model = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree')
        self.model = self.model.fit(data, labels)

    def eval(self, obs):
        print('Testing Result: %s; %s' % (self.model.predict(obs), self.model.predict_proba(obs)))
        # print('Testing Result: %s' % self.model.predict(obs))
Example #13
def test(self):
    X, y = self.dataMat, self.labelMat
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.6, random_state=12)
    #clf = RandomForestClassifier(max_depth=6, min_samples_split=9, min_samples_leaf=15, n_estimators=5)
    #clf = DBN([X.shape[1], 24, 2], scales=0.5, learn_rates=0.02, learn_rate_decays=0.95, learn_rate_minimums=0.001, epochs=500, l2_costs=0.02*0.031, dropouts=0.2, verbose=0)
    #cvnum = ShuffleSplit(2013, n_iter=10, test_size=0.6, train_size=0.4, random_state=0)
    print "****************************************************************"
    #clf = GaussianNB()
    #clf = LDA()
    rbm = BernoulliRBM(batch_size=0, learning_rate=0.1, n_components=12, n_iter=100, random_state=None, verbose=0)
    clf = KNeighborsClassifier(n_neighbors=20, algorithm='auto', leaf_size=30)
    #clf = linear_model.LogisticRegression(C=1e2)
    logistic = linear_model.LogisticRegression(C=100)
    lasso = linear_model.Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
    #classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    clf.fit(X_train, y_train)
    scores = cross_val_score(clf, X, y, cv=3, scoring='roc_auc')
    y_pred = clf.predict(X_test)
    y_predprob = clf.predict_proba(X_test)
    prf = precision_recall_fscore_support(y_test, y_pred, average='binary')
    print ("Accuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))
    print classification_report(y_test, y_pred)
    print 'The accuracy is: ', accuracy_score(y_test, y_pred)
    print 'The log loss is:', log_loss(y_test, y_predprob)
    print 'The ROC score is:', roc_auc_score(y_test, y_predprob[:, 1])
Example #14
class Predictor(object):
    def __init__(self, n_neighbors=8, slackbot=None):
        self.knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights="distance")

        skill_data = SkillData()
        data_X, data_y = SkillDataLoader().make_data_set(skill_data.q)
        self.knn.fit(data_X, data_y)

        if slackbot is None:
            self.slackbot = SlackerAdapter()
        else:
            self.slackbot = slackbot

    def predict_skill(self):
        data_loader = SkillDataLoader()
        test_x = data_loader.make_X()

        predict = self.knn.predict(test_x)[0]
        confidence = max(self.knn.predict_proba(test_x)[0])
        description = " ".join(Skill.classes[predict][0])
        func_name = Skill.classes[predict][1]

        if confidence >= 0.85:
            runner = FunctionRunner()
            params = runner.filter_f_params(description, func_name)

            self.slackbot.send_message(
                text=MsgResource.PREDICT_RESULT(description=description)
            )
            runner.load_function(func_name=func_name, params=params, day_of_week=[0])
        else:
            functions = Functions(self.slackbot)
            functions.remind_idea()
Example #15
def knn_predict(X_train, y_train, X_test, k=20):
    from sklearn.neighbors import KNeighborsClassifier

    neigh = KNeighborsClassifier(n_neighbors=k, algorithm='brute', weights='distance', metric='cosine')
    neigh.fit(X_train, y_train)

    return neigh.predict_proba(X_test)[:, 1]
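A short usage sketch for knn_predict (binary labels assumed, since only column 1 of predict_proba is returned). The brute-force algorithm is set explicitly because scikit-learn's ball tree and k-d tree do not support the cosine metric:

import numpy as np

X_train = np.random.randn(100, 20)
y_train = np.random.randint(0, 2, size=100)
X_test = np.random.randn(10, 20)

probs = knn_predict(X_train, y_train, X_test, k=15)  # P(class 1) per test row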
Example #16
def train_data():
    x_data, y_data, zone_cnt, zone_int_dict = get_x_y_data()

    knn = KNeighborsClassifier()

    indices = np.random.permutation(len(x_data))
    x_train = x_data
    y_train = y_data
    x_test = x_data[indices[-TEST_DATA_ROWS:]]
    y_test = y_data[indices[-TEST_DATA_ROWS:]]
    knn.fit(x_train, y_train)  # start training
    print 'training data count:', len(indices), ' number of zones:', zone_cnt
    test_result = knn.predict(x_test)  # test
    prob_test_result = knn.predict_proba(x_test)
    print prob_test_result

    # no duplicate value, so reverse this dictionary
    int_zone_dict = dict(zip(zone_int_dict.values(), zone_int_dict.keys()))

    print 'predict result:', test_result, [int_zone_dict[x] for x in test_result]  # test result
    print 'ground truth:', y_test, [int_zone_dict[x] for x in y_test]  # ground truth
    cnt = 0
    for i in range(TEST_DATA_ROWS):
        if test_result[i] == y_test[i]:
            cnt += 1
    print 'accurate rate', cnt * 1.0 / TEST_DATA_ROWS

    from sklearn.cross_validation import cross_val_score
    print cross_val_score(knn, x_train, y_train)
Example #17
def process_one_cell(cell_train, cell_test, fw, th, n_neighbors):
    
    # Remove infrequent places
    cell_train = remove_infrequent_places(cell_train, th)
    
    # Store row_ids for test
    row_ids = cell_test[:, -1].flatten().astype(np.int32)
    cell_test = cell_test[:, :-1]
    
    # Preparing data
    y = cell_train[:, -1].flatten().astype(np.int64)
    X = cell_train[:, :-1]
    
    #Applying the classifier
    cte = 5.8
    n_neighbors = int((y.size ** 0.5) / cte)
    clf = KNeighborsClassifier(n_neighbors=n_neighbors,
                            weights=calculate_distance, p=1, 
                            n_jobs=2, leaf_size=15)
    clf.fit(X, y)
    y_pred = clf.predict_proba(cell_test)
    y_pred_labels = np.argsort(y_pred, axis=1)[:,:-4:-1]
    pred_labels = clf.classes_[y_pred_labels]
    cell_pred = np.column_stack((row_ids, pred_labels)).astype(np.int64) 
    
    return cell_pred
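remove_infrequent_places is referenced here and in Example #25 but not defined. For the numpy-array variant above (place id in the last column), a sketch under that assumption (the DataFrame variant in Example #25 would filter on a place_id column instead):

import numpy as np

def remove_infrequent_places(cell_train, th):
    # Keep only rows whose place id (last column) occurs at least th times.
    places, counts = np.unique(cell_train[:, -1], return_counts=True)
    frequent = places[counts >= th]
    return cell_train[np.isin(cell_train[:, -1], frequent)]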
Example #18
def process_one_cell(df_cell_train, df_cell_test):
    # Remove infrequent places
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 5).values
    df_cell_train = df_cell_train.loc[mask].copy()

    df_cell_train['x']=df_cell_train['x']*22
    df_cell_train['y']=df_cell_train['y']*52
    df_cell_test['x']=df_cell_test['x']*22
    df_cell_test['y']=df_cell_test['y']*52
      
    # Store row_ids for test
    row_ids = df_cell_test.index
    
    # Preparing data
    y = df_cell_train.place_id.values
    X = df_cell_train.drop(['place_id'], axis=1).values
    
    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=np.floor(np.sqrt(y.size)/5.83).astype(int),
                            weights=calculate_distance, p=1, 
                            n_jobs=2, leaf_size=20)
    clf.fit(X, y)
    y_pred = clf.predict_proba(df_cell_test.values)
    y_pred_labels = np.argsort(y_pred, axis=1)[:,:-4:-1]
    pred_labels = clf.classes_[y_pred_labels]
    cell_pred = np.column_stack((row_ids, pred_labels)).astype(np.int64) 
    
    return cell_pred
Example #19
def knnSimulate(param, nFold=5):
    trainSet = SimData.simulate2Group(
        n = int(param['n']),
        p = int(param['p']),
        effect = [param['effect']] * int(param['p'])
    )
    knnClass = KNeighborsClassifier(n_neighbors=int(param['k']))
    cvAccs = cross_val_score(estimator = knnClass,
                             X = np.array(trainSet['x']),
                             y = np.array(trainSet['y']),
                             cv = nFold)
    knnClass.fit(np.array(trainSet['x']), np.array(trainSet['y']))
    testSet = SimData.simulate2Group(
        n = int(param['n']),
        p = int(param['p']),
        effect = [param['effect']] * int(param['p'])
    )
    out = OrderedDict()
    out['p'] = param['p']
    out['k'] = param['k']
    out['train'] = trainSet
    out['test'] = testSet
    out['testPreds'] = knnClass.predict(testSet['x'])
    out['testProbs'] = knnClass.predict_proba(testSet['x'])
    out['cvAccuracy'] = np.mean(cvAccs)
    out['testTable'] = pandas.crosstab(
        Series(out['testPreds'], index=testSet['y'].index),
        testSet['y']
    )
    out['testAccuracy'] = (np.sum(np.diag(out['testTable'])) /
                           (1.0 * np.sum(np.sum(out['testTable']))))
    return out
Example #20
    def onstartButton(self):

        cap = cv2.VideoCapture(str(self.file_name))

        if self.isfileWorking == False and self.ishasFile == True:
            self.ishasFile = False
            self.startButton.setText("Close")

            # cap = cv2.VideoCapture(str(self.file_name))

            self.isfileWorking = True
            data=spio.loadmat("openface_fea.mat")
            X=data['feature']
            id=data['id'].astype(int)-1
            Y=id[0,:]
            name=list(set(data['names']))
            name.sort()
            print("***Train knn classifier***")
            knn=KNeighborsClassifier(n_neighbors=20,weights='distance',p=2)
            knn.fit(X,Y)

            success,frame = cap.read()

            while success and self.isfileWorking:
                start = time.time()
                success, frame = cap.read()
                
                if success:
                    img=frame.copy()
                   
                    bb,rep=getRep(img)
                    if bb is None:
                        print "Can't find any face in this picture"
                    else:
                        if rep == 0:
                            print "Get rep failed..."
                        else:
                            rep = np.reshape(rep, (1, 128))
                            idx = knn.predict(rep)
                            # print("label is {} ".format(idx))
                            proba = knn.predict_proba(rep)
                            actor = name[int(idx[0])]
                            self.namelineEdit.setText(actor)
                            self.timelineEdit.setText(str(round(time.time()-start,3)))
                            self.confidencelineEdit.setText(str(round(max(proba[0]),2)))
                            # print("Proba is {} ".format(proba))
                            
                            

                            draw_dlib_rects(frame,bb,actor,(0,255,0))
                    image = QtGui.QImage(frame.data, frame.shape[1], frame.shape[0], QtGui.QImage.Format_RGB888).rgbSwapped()
                    pixmap = QtGui.QPixmap.fromImage(image)
                    self.showlabel.setPixmap(pixmap)
                    k = cv2.waitKey(5)
        else:
            self.ishasFile = False
            self.startButton.setText("Start")
            self.isfileWorking = False
            cap.release()
            self.showlabel.clear()
Example #21
class NearestNeighborsPredictor(PredictorBase):
    '''
    Uses k-nearest neighbors.
    '''

    def __init__(self, animal_type):
        self.animal_type = animal_type
        if self.animal_type == "Cat":
            args = {'n_neighbors': 20}
        elif self.animal_type == "Dog":
            args = {'n_neighbors': 40}
        else:
            raise RuntimeError("Incorrect animal type")
        self.clf = KNeighborsClassifier(**args)

    def fit(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        predictions = self.clf.predict_proba(X_test)
        predictions_df = self.bundle_predictions(predictions)

        return predictions_df

    def find_best_params(self):
        parameters = {'n_neighbors': [5, 10, 20, 40, 60]}
        knn = KNeighborsClassifier()
        clf = grid_search.GridSearchCV(knn, parameters)
        train_data = get_data('../data/train.csv')
        train_data = select_features(train_data, self.animal_type)
        X = train_data.drop(['OutcomeType'], axis=1)
        y = train_data['OutcomeType']
        clf.fit(X, y)
        print clf.best_params_
Example #22
    def performance(x_train, y_train, x_test, y_test,
                    algorithm, n_neighbors=None, n_estimators=None, max_features=None,
                    kernel=None, C=None, gamma=None, degree=None, coef0=None):
        # fit the model
        if algorithm == 'k-nn':
            model = KNeighborsClassifier(n_neighbors=int(n_neighbors))
            model.fit(x_train, y_train)
        elif algorithm == 'SVM':
            model = train_svm(x_train, y_train, kernel, C, gamma, degree, coef0)
        elif algorithm == 'naive-bayes':
            model = GaussianNB()
            model.fit(x_train, y_train)
        elif algorithm == 'random-forest':
            model = RandomForestClassifier(n_estimators=int(n_estimators),
                                           max_features=int(max_features))
            model.fit(x_train, y_train)
        else:
            raise ValueError('Unknown algorithm: %s' % algorithm)

        # predict the test set
        if algorithm == 'SVM':
            predictions = model.decision_function(x_test)
        else:
            predictions = model.predict_proba(x_test)[:, 1]

        return optunity.metrics.roc_auc(y_test, predictions, positive=True)
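A hedged usage sketch for the k-NN branch of performance(), assuming the surrounding script provides train_svm and imports optunity (optunity.metrics.roc_auc takes the true labels and the predicted scores):

import numpy as np
from sklearn.cross_validation import train_test_split  # old sklearn API, matching the snippets above

X = np.random.randn(300, 5)
y = np.random.randint(0, 2, size=300)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

auc = performance(x_tr, y_tr, x_te, y_te, algorithm='k-nn', n_neighbors=15)
print(auc)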
Example #23
def main():


  # trainFeature = genfromtxt('trainF2.csv', delimiter=',')
  # trainLabel = genfromtxt('trainLabel100.csv', delimiter='\n')
  # testFeature = genfromtxt('test2Feature.csv', delimiter=',')
  trainFeature = genfromtxt('trainFeature.csv', delimiter=',')[0::5]
  trainLabel = genfromtxt('trainLabel.csv', delimiter='\n')[0::5]
  testFeature = genfromtxt('testFeature.csv', delimiter=',')

  time_start = time.clock()
  clf = KNeighborsClassifier(n_jobs=2)
  clf.fit(trainFeature, trainLabel)

  time_elapsed = (time.clock() - time_start)
  print "build model time = "+str(time_elapsed)
  time_start = time.clock()

  dec = clf.predict_proba(testFeature)

  time_elapsed = (time.clock() - time_start)
  print "predict time = "+str(time_elapsed)

  header = "Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS"

  print dec.shape[1]
  fmt=['%d'] + ['%1.4f'] * dec.shape[1]

  ind = [i for i in xrange(0,len(dec))]
  dec = insert(dec, 0, ind, axis=1)

  savetxt("predict_KNN.csv", dec, delimiter=",", header=header, fmt=fmt, comments="")
Example #24
def PredictKNNRecent(day, neighbors):
	conn = sqlite3.connect('SQL/NBA.db')
	c = conn.cursor()

	skipDays = ['2015-10-27', '2015-10-28', '2015-10-29',
				'2015-10-30', '2015-10-31', '2015-11-01',
				'2015-11-02', '2015-11-03', '2015-11-04',
				'2015-11-05']

	if day in skipDays:
		probs = [0.5] * 15
	else:
		trainX, trainY, testX = BuildKNNRecent(day, c)
		trainX, testX = Standardize(trainX, testX)

		weights = 'uniform'
		metric = 'minkowski'; p = 2

		model = KNeighborsClassifier(n_neighbors=neighbors, weights=weights,
									 metric=metric, p=p)
		model.fit(trainX, trainY)
		probs = model.predict_proba(testX)
		probs = probs[:, 1]

		updatePrediction = 'UPDATE Game_Preds SET KNN = ? WHERE Game_ID = ?'
		updateSchedule = 'UPDATE Game_Schedule_2015 SET KNN = ? WHERE Game_ID = ?'
		SaveProbabilities(day, probs, updatePrediction, updateSchedule, c)

	conn.commit()
	conn.close()
Example #25
File: kaggler3.py  Project: keguoh/Kaggle
def process_one_cell(df_cell_train, df_cell_test, fw, th, n_neighbors):
    
    # Remove infrequent places
    df_cell_train = remove_infrequent_places(df_cell_train, th).copy()
    
    # Store row_ids for test
    row_ids = df_cell_test.index
    
    # Preparing data
    y = df_cell_train.place_id.values
    X = df_cell_train.drop(['place_id'], axis=1).values
    
    #Applying the classifier
    cte = 5.8
    lsize = 12
    clf = KNeighborsClassifier(n_neighbors=np.floor(np.sqrt(y.size)/cte).astype(int),
                            weights=calculate_distance, p=1, 
                            n_jobs=2, leaf_size=lsize)
    clf.fit(X, y)
    y_pred = clf.predict_proba(df_cell_test.values)
    y_pred_labels = np.argsort(y_pred, axis=1)[:,:-4:-1]
    pred_labels = clf.classes_[y_pred_labels]
    cell_pred = np.column_stack((row_ids, pred_labels)).astype(np.int64) 
    
    return cell_pred
Example #26
def DecisionTreeClassifier(TrainData):  # note: despite its name, this function trains a k-NN classifier
    features=['Month','Date','Year']
    season=['Fall','Spring','Summer','Winter']
    district=['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION','NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
    days=['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday','Wednesday']
    time=['first','second','third']
    features2 = [x for x in range(0,24)]
    Minute=[x for x in range(100,160)]
    latitude=[x for x in range(948,964)]
    longitude=[x for x in range(2070,2083)]
    features=district+Minute+features2+season+time

    train,validation= train_test_split(TrainData, test_size=0.4)

    knn = KNeighborsClassifier()
    knn.fit(train[features],train['Category'])
    # KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',metric_params=None, n_jobs=1, n_neighbors=5, p=2,weights='uniform',multilabel=True)  # disabled: the result is unused and 'multilabel' is not a valid parameter
    predicted=np.array(knn.predict_proba(validation[features]))
    model=knn.predict(validation[features])
    model1=knn.predict(train[features])

    print "Precision is ",precision_score(validation['Category'].values.tolist(),model,average='macro')
    print "Recall is ",recall_score(validation['Category'].values.tolist(),model,average='macro')
    print "Accuracy is ", accuracy_score(validation['Category'].values.tolist(),model)
    print "Training Accuracy is ", accuracy_score(train['Category'].values.tolist(),model1)


    result=pd.DataFrame(predicted, columns=le_crime.classes_)
    result['Predicted']=model
    result.to_csv('knnProbabilities.csv', index = True, index_label = 'Id' )
Example #27
File: run_model.py  Project: t36li/FINRA
def KNN(x_train,y_train,x_test, udf_kneighbors=100, do_CV=False):
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.cross_validation import train_test_split
	from sklearn.metrics import roc_auc_score

	### variables may be in different scales, so mean standardize the variables ###
	### Mean Normalize variables before regression ###
	from sklearn.preprocessing import StandardScaler
	ss=StandardScaler()
	x_train=ss.fit_transform(x_train)
	x_test=ss.transform(x_test)

	neigh=KNeighborsClassifier(weights='distance')	
	if do_CV:
		k_list=[25,125,255,387] #important to have odd numbers

		### Try different parameters of K for optimal value ###
		### Randomly divide training set into 80/20 split ###
		cv_score=list()		
		for k in k_list:
			neigh.n_neighbors=k
			x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(x_train,y_train,test_size=0.20, random_state=42)

			neigh.fit(x_train_cv,y_train_cv)
			y_pred=neigh.predict_proba(x_test_cv)[:,1]
			cv_score.append(roc_auc_score(y_test_cv,y_pred))			

		neigh.fit(x_train,y_train)
		y_pred=neigh.predict_proba(x_test)[:,1]

		print 'Cross Validation KNN Results........'
		print 'Parameters, CV_Scores'
		for i in range(len(cv_score)):
			print k_list[i], cv_score[i]
	else:
		print 'Making Prediction with optimal K neighbors...'
		neigh.n_neighbors=udf_kneighbors
		neigh.fit(x_train,y_train)
		y_pred=neigh.predict_proba(x_test)[:,1]
		print 'Writing submission file....'
		with open('KNN_Submission.csv','wb') as testfile:
			w=csv.writer(testfile)
			w.writerow(('Id','Probability'))
			for i in range(len(y_pred)):
				w.writerow(((i+1),y_pred[i]))
		testfile.close()
		print 'File written to disk...'
Example #28
def do_knn(which = ''):
  (trX, trY, teX) = draft(which)
  (trX, teX) = normalize(trX, teX)

  clf = KNeighborsClassifier()  # k-NN always exposes predict_proba; there is no 'probabilities' parameter
  clf.fit(trX, trY)
  teY = clf.predict_proba(teX)[:,1]
  return teY
Example #29
File: knn.py  Project: cxlove/RPPredict
def knn_solver(train_data, train_label, validation, test, dimreduce, convertbinary):
    """
    """
    logging.info('begin to train the knn classifier')

    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]
    train_data, validation, test = dimreduce(train_data, train_label, validation, test)
    # print new_train_data.shape
    # train_data, validation, test = convertbinary(train_data, validation, test)

    knn = KNeighborsClassifier(algorithm='auto', n_neighbors=10, p=3)
    knn.fit(train_data, train_label)
    tools.get_auc(knn.predict_proba(validation)[:, 1])
    return knn.predict_proba(test)[:, 1]
Example #30
def setTrainDataAndMakeModel(X_train,Y_train,X_test):
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=50, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=128, p=2,
           weights='uniform')
    knn.fit(X_train,Y_train)
    ##NO USE OF CALIBRATED CV OR BAGGING
    output = knn.predict_proba(X_test)
    return output
    
Example #31
# In[19]:

df4 = pd.DataFrame({
    "Prediction": predictions,
    "Actual": y_test
}).reset_index(drop=True)

# In[20]:

df4.head(50)

# # Passing Real Time Feature Data for Testing on the Model.

# In[ ]:

inputs = {'country_India': 1, 'gender_male': 1, 'activity_Agriculture': 1}

test = pd.Series(index=df2.columns)
for key in inputs.keys():
    test[key] = inputs[key]

test.fillna(0, inplace=True)

# In[ ]:

test1 = test.drop(['status', 'loan_amount', 'funded_amount'])

predictions = knn.predict_proba(test1.values.reshape(1, -1))
print(predictions)
Example #32
simple_reg_model_y = sm.OLS(Height, Weight).fit()
simple_reg_model_y.summary()

import pandas as pd
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['a', 'b', 'c', 'd'])
print(data)

data.drop('b', axis=1)

import scikitplot as skplt
import matplotlib.pyplot as plt

skplt.metrics.plot_roc(ytest, knn.predict_proba(Xtest))
plt.show()

import seaborn as sns

from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha=0.1).fit(Xtrain, ytrain)
predict = mnb.predict(Xtest)
print("Test Accuracy Score is : %f" % mnb.score(Xtest, ytest))

print(mnb)

# generate confusion_matrix to check misclassification

from sklearn.metrics import confusion_matrix
confusion_matrix(ytest, predict)
Example #33
def main():
    print("Enter main()")
    #==================================================================================
    #   Nonlinear separation of the iris data (3 classes) with the k-NN method
    #==================================================================================

    #====================================================
    #   Pre Process
    #====================================================
    #----------------------------------------------------
    #   read & set data
    #----------------------------------------------------
    print("reading data")

    # Load the iris data from the scikit-learn library
    iris = datasets.load_iris()

    # Extract the features in columns 3 and 4 and store them in dat_X
    dat_X = iris.data[:, [2, 3]]

    # Get the class labels (supervised targets)
    dat_y = iris.target

    print('Class labels:', numpy.unique(dat_y))
    # Note: most ML libraries store class labels as integers rather than strings (for best performance)
    print("finishing reading data")

    #---------------------------------------------------------------------
    # To evaluate the trained model on unseen data, split the dataset
    # into a training set and a test set
    #---------------------------------------------------------------------
    # Use train_test_split() from scikit-learn's cross_validation module for a 70% training / 30% test split
    train_test = train_test_split(  # return value: list
        dat_X,
        dat_y,
        test_size=0.3,  # a fraction between 0.0 and 1.0
        random_state=0
    )

    X_train = train_test[0]
    X_test = train_test[1]
    y_train = train_test[2]
    y_test = train_test[3]

    #----------------------------------------------------------------------------------------------------
    # Scale the data with the StandardScaler class from scikit-learn's preprocessing module
    #----------------------------------------------------------------------------------------------------
    stdScaler = StandardScaler()

    # Compute the mean and standard deviation of X_train
    stdScaler.fit(X_train)

    # Standardize using the computed mean and standard deviation
    X_train_std = stdScaler.transform(X_train)
    X_test_std = stdScaler.transform(X_test)

    # Stack the split data back together row-wise (used later for plotting)
    X_combined_std = numpy.vstack(
        (X_train_std, X_test_std))  # pass as the tuple (X_train_std, X_test_std)
    y_combined = numpy.hstack((y_train, y_test))

    # Normalize the full dataset as well (used later for plotting)
    dat_X_std = numpy.copy(dat_X)  # deep copy (not a reference copy)
    dat_X_std[:, 0] = (dat_X[:, 0] - dat_X[:, 0].mean()) / dat_X[:, 0].std()  # [:, 0] selects all of column 0
    dat_X_std[:, 1] = (dat_X[:, 1] - dat_X[:, 1].mean()) / dat_X[:, 1].std()

    #====================================================
    #   Learning Process
    #====================================================
    # classifier1 : kNN1 (k=1)
    kNN1 = KNeighborsClassifier(
        n_neighbors=1,  # the k value of the k-NN method
        p=2,  # euclidean_distance (l2) for p = 2
        metric='minkowski'  # distance measure: Minkowski distance
    )
    kNN1.fit(X_train_std, y_train)

    # classifier2 : kNN2 (k=5)
    kNN2 = KNeighborsClassifier(
        n_neighbors=5,  # the k value of the k-NN method
        p=2,  # euclidean_distance (l2) for p = 2
        metric='minkowski',  # distance measure: Minkowski distance
    )
    kNN2.fit(X_train_std, y_train)

    # classifier3 : kNN3 (k=10)
    kNN3 = KNeighborsClassifier(
        n_neighbors=10,  # the k value of the k-NN method
        p=2,  # euclidean_distance (l2) for p = 2
        metric='minkowski',  # distance measure: Minkowski distance
    )
    kNN3.fit(X_train_std, y_train)

    #====================================================
    #   Evaluating generalization performance
    #====================================================
    #-------------------------------
    # Plot the sample data
    #-------------------------------
    # plt.subplot(rows, cols, plot index)
    plt.subplot(2, 2, 1)
    plt.grid(linestyle='-')

    # Plot the setosa samples (red squares)
    plt.scatter(dat_X_std[0:50, 0],
                dat_X_std[0:50, 1],
                color="red",
                edgecolor='black',
                marker="s",
                label="setosa")
    # Plot the versicolor samples (blue x)
    plt.scatter(dat_X_std[50:100, 0],
                dat_X_std[50:100, 1],
                color="blue",
                edgecolor='black',
                marker="x",
                label="versicolor")
    # Plot the virginica samples (green +)
    plt.scatter(dat_X_std[100:150, 0],
                dat_X_std[100:150, 1],
                color="green",
                edgecolor='black',
                marker="+",
                label="virginica")

    plt.title("iris data [Normalized]")  # title
    plt.xlabel("petal length [Normalized]")  # label x-axis
    plt.ylabel("petal width [Normalized]")  # label y-axis
    plt.legend(loc="upper left")  # legend
    plt.tight_layout()  # shrink the subplots so their labels do not overlap

    #-------------------------------
    # Show the classification results & decision regions
    #-------------------------------
    # classifier1 : kNN1 (k=1)
    plt.subplot(2, 2, 2)
    Plot2D.Plot2D.drawDiscriminantRegions(dat_X=X_combined_std,
                                          dat_y=y_combined,
                                          classifier=kNN1,
                                          list_test_idx=range(101, 150))
    plt.title("Identification Result (k=1)")  # title
    plt.xlabel("petal length [Normalized]")  # label x-axis
    plt.ylabel("petal width [Normalized]")  # label y-axis
    plt.legend(loc="upper left")  # legend
    plt.tight_layout()  # shrink the subplots so their labels do not overlap

    # classifier2 : kNN2 (k=5)
    plt.subplot(2, 2, 3)
    Plot2D.Plot2D.drawDiscriminantRegions(dat_X=X_combined_std,
                                          dat_y=y_combined,
                                          classifier=kNN2,
                                          list_test_idx=range(101, 150))
    plt.title("Identification Result (k=5)")  # title
    plt.xlabel("petal length [Normalized]")  # label x-axis
    plt.ylabel("petal width [Normalized]")  # label y-axis
    plt.legend(loc="upper left")  # legend
    plt.tight_layout()  # shrink the subplots so their labels do not overlap

    # classifier3 : kNN3 (k=10)
    plt.subplot(2, 2, 4)
    Plot2D.Plot2D.drawDiscriminantRegions(dat_X=X_combined_std,
                                          dat_y=y_combined,
                                          classifier=kNN3,
                                          list_test_idx=range(101, 150))
    plt.title("Identification Result (k=10)")  # title
    plt.xlabel("petal length [Normalized]")  # label x-axis
    plt.ylabel("petal width [Normalized]")  # label y-axis
    plt.legend(loc="upper left")  # legend
    plt.tight_layout()  # shrink the subplots so their labels do not overlap

    # Save & show the figure
    plt.savefig("./kNN_scikit-learn_1.png", dpi=300)
    plt.show()

    #-------------------------------
    # Compute & print the classification accuracy
    #-------------------------------
    y_predict1 = kNN1.predict(X_test_std)
    y_predict2 = kNN2.predict(X_test_std)
    y_predict3 = kNN3.predict(X_test_std)

    print("<Classification results on the test data>")

    print("classifier1 : kNN1 (k=1)")
    # Print the number of misclassified samples
    print("Misclassified samples : %d" % (y_test != y_predict1).sum())
    # Print the classification accuracy
    print("Accuracy : %.2f" % accuracy_score(y_test, y_predict1))

    print("classifier2 : kNN2 (k=5)")
    # Print the number of misclassified samples
    print("Misclassified samples : %d" % (y_test != y_predict2).sum())
    # Print the classification accuracy
    print("Accuracy : %.2f" % accuracy_score(y_test, y_predict2))

    print("classifier3 : kNN3 (k=10)")
    # Print the number of misclassified samples
    print("Misclassified samples : %d" % (y_test != y_predict3).sum())
    # Print the classification accuracy
    print("Accuracy : %.2f" % accuracy_score(y_test, y_predict3))

    #--------------------------------------------------------------------------------------------------------
    # Use predict_proba() to estimate the class membership of selected samples.
    # The return value holds the probabilities that a sample belongs to
    # Iris-Setosa, Iris-Versicolor and Iris-Virginica, in that order.
    #--------------------------------------------------------------------------------------------------------
    preProb = []

    # classifier1 : kNN1 (k=1)
    preProb.append(kNN1.predict_proba(X_test_std[0, :].reshape(1, -1)))  # test sample 0, reshaped to 2D
    preProb.append(kNN1.predict_proba(X_test_std[1, :].reshape(1, -1)))  # test sample 1, reshaped to 2D
    preProb.append(kNN1.predict_proba(X_test_std[2, :].reshape(1, -1)))  # test sample 2, reshaped to 2D

    # classifier2 : kNN2 (k=5)
    preProb.append(kNN2.predict_proba(X_test_std[0, :].reshape(1, -1)))  # test sample 0, reshaped to 2D
    preProb.append(kNN2.predict_proba(X_test_std[1, :].reshape(1, -1)))  # test sample 1, reshaped to 2D
    preProb.append(kNN2.predict_proba(X_test_std[2, :].reshape(1, -1)))  # test sample 2, reshaped to 2D

    # classifier3 : kNN3 (k=10)
    preProb.append(kNN3.predict_proba(X_test_std[0, :].reshape(1, -1)))  # test sample 0, reshaped to 2D
    preProb.append(kNN3.predict_proba(X_test_std[1, :].reshape(1, -1)))  # test sample 1, reshaped to 2D
    preProb.append(kNN3.predict_proba(X_test_std[2, :].reshape(1, -1)))  # test sample 2, reshaped to 2D

    # Print each sample's class membership probabilities
    print("classifier1 : kNN1 (k=1)")
    print("Class probabilities of sample 0 [%] :", preProb[0] * 100)
    print("Class probabilities of sample 1 [%] :", preProb[1] * 100)
    print("Class probabilities of sample 2 [%] :", preProb[2] * 100)

    print("classifier2 : kNN2 (k=5)")
    print("Class probabilities of sample 0 [%] :", preProb[3] * 100)
    print("Class probabilities of sample 1 [%] :", preProb[4] * 100)
    print("Class probabilities of sample 2 [%] :", preProb[5] * 100)

    print("classifier3 : kNN3 (k=10)")
    print("Class probabilities of sample 0 [%] :", preProb[6] * 100)
    print("Class probabilities of sample 1 [%] :", preProb[7] * 100)
    print("Class probabilities of sample 2 [%] :", preProb[8] * 100)

    #------------------------------------------------------------------------
    # Plot each sample's class membership probabilities
    #------------------------------------------------------------------------
    # Clear the current figure
    plt.clf()

    # Show the class probabilities as bar charts
    k = 0
    for i in range(3):
        for j in range(3):
            k += 1
            print("generating bar chart (subplot)", i, j, k)
            plt.subplot(3, 3, k)  # plt.subplot(rows, cols, plot index)
            plt.title("samples[ %d ]" % j + " by classifier %d " % (i + 1))  # title
            plt.xlabel("Varieties (Belonging class)")  # label x-axis
            plt.ylabel("probability [%]")  # label y-axis
            plt.ylim(0, 100)  # y-axis range (0 to 100)
            plt.legend(loc="upper left")  # legend

            # Bar chart of the class probabilities
            plt.bar([0, 1, 2],
                    height=preProb[k - 1][0] * 100,
                    tick_label=["Setosa", "Versicolor", "Virginica"])
            plt.tight_layout()  # shrink the subplots so their labels do not overlap

    # Save & show the figure
    plt.savefig("./kNN_scikit-learn_2.png", dpi=300)
    plt.show()
    print("Finish main()")
    return
Example #34
knn_10.fit(X_train, y_train)

# Fit for k = 20
knn_20.fit(X_train, y_train)

# Predict k = 5
y_pred = knn.predict(X_train)

# Predict k = 10
y_pred_10 = knn_10.predict(X_train)

# Predict k = 20
y_pred_20  = knn_20.predict(X_train)

# Predict probability k = 5
y_pred_proba = knn.predict_proba(X_train)

# Predict probability k = 10
y_pred_proba_10 = knn_10.predict_proba(X_train)

# Predict probability k = 20
y_pred_proba_20 = knn_20.predict_proba(X_train)


# Look at the score of the model, k = 5
knn.score(X_train, y_train)

# Look at the score of the model, k = 10
knn_10.score(X_train, y_train)

#Look at the score of the model, k = 20
Example #35
those parameters, make predictions on the test set, and submit those predictions.

BONUS TASK #2: Read the scikit-learn documentation for GridSearchCV to find the
shortcut for accomplishing bonus task #1.
'''

# MAIN TASK
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
from sklearn.grid_search import GridSearchCV
neighbors_range = [20, 40, 60, 80, 100]
weight_options = ['uniform', 'distance']
param_grid = dict(n_neighbors=neighbors_range, weights=weight_options)
grid = GridSearchCV(knn, param_grid, cv=5, scoring='log_loss')
grid.fit(X, y)
grid.grid_scores_
grid.best_score_
grid.best_params_

# BONUS TASK #1
knn = KNeighborsClassifier(n_neighbors=100, weights='uniform')
knn.fit(X, y)
y_prob = knn.predict_proba(test[feature_cols])[:, 1]
sub = pd.DataFrame({'id':test.index, 'OpenStatus':y_prob}).set_index('id')
sub.to_csv('sub.csv')

# BONUS TASK #2
y_prob = grid.predict_proba(test[feature_cols])[:, 1]
sub = pd.DataFrame({'id':test.index, 'OpenStatus':y_prob}).set_index('id')
sub.to_csv('sub.csv')
Example #36
size = comm.size
sum = 0
tmp = rank

# Creating the data
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])

start_time = time.time()
#####

if comm.rank == 0:
    # SVM
    clf = SVC(gamma='auto')
    clf.fit(X, y)
    print("SVM Prediction ", clf.predict([[-0.8, -1]]))
if comm.rank == 1:
    # Random Forest
    RF = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    RF.fit(X, y)
    print("RF Feature importance ", RF.feature_importances_)
    print("RF Prediction ", RF.predict([[-0.8, -1]]))
if comm.rank == 2:
    # KNN
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X, y)
    print("KNN Prediction ", neigh.predict([[-0.8, -1]]))
    print("KNN Probability ", neigh.predict_proba([[-0.8, -1]]))
if comm.rank > 2:
    print("done")
Example #37
#label2=np.zeros((303,1))
label1 = np.ones((1420, 1))  #Value can be changed
label2 = np.zeros((1514, 1))
label = np.append(label1, label2)
data_1, mask = lassodimension(shu, label)
X = data_1
y = label
sepscores = []
ytest = np.ones((1, 2)) * 0.5
yscore = np.ones((1, 2)) * 0.5
cv_clf = KNeighborsClassifier()
skf = StratifiedKFold(n_splits=5)
for train, test in skf.split(X, y):
    y_train = utils.to_categorical(y[train])
    hist = cv_clf.fit(X[train], y[train])
    y_score = cv_clf.predict_proba(X[test])
    yscore = np.vstack((yscore, y_score))
    y_test = utils.to_categorical(y[test])
    ytest = np.vstack((ytest, y_test))
    fpr, tpr, _ = roc_curve(y_test[:, 0], y_score[:, 0])
    roc_auc = auc(fpr, tpr)
    y_class = utils.categorical_probas_to_classes(y_score)
    y_test_tmp = y[test]
    acc, precision, npv, sensitivity, specificity, mcc, f1 = utils.calculate_performace(
        len(y_class), y_class, y_test_tmp)
    sepscores.append(
        [acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc])
    print(
        'KNN:acc=%f,precision=%f,npv=%f,sensitivity=%f,specificity=%f,mcc=%f,f1=%f,roc_auc=%f'
        % (acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc))
scores = np.array(sepscores)
Example #38
# Randomly take 140 samples as the training set
iris_y_train = iris_y[indices[:-10]]
# and take the labels of those 140 samples as the training labels
iris_x_test = iris_x[indices[-10:]]
# The remaining 10 samples form the test set
iris_y_test = iris_y[indices[-10:]]
# and the labels of those 10 samples form the test labels

knn = KNeighborsClassifier()
# Create a knn classifier object
knn.fit(iris_x_train, iris_y_train)
# Call its fit method, which takes two arguments: the training data and the training labels

iris_y_predict = knn.predict(iris_x_test)
# Call its predict method, which takes one argument: the test data
probility = knn.predict_proba(iris_x_test)
# Probability-based predictions for each test sample
neighborpoint = knn.kneighbors(iris_x_test[-1:], 5, False)
# Find the 5 points nearest to the last test sample; returns an array of their indices
score = knn.score(iris_x_test, iris_y_test, sample_weight=None)
# Call its score method to compute the accuracy

print('iris_y_predict = ')
print(iris_y_predict)
# Print the predictions

print('iris_y_test = ')
print(iris_y_test)
# Print the true labels of the test set for comparison
print('Accuracy:', score)
# Print the computed accuracy
Example #39
# In[ ]:


# KNN  Model Fitting and Performance Metrics
knn = KNeighborsClassifier(n_neighbors = 25)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
knn_acc_score = round(knn.score(X_train, y_train) * 100, 2)  # training-set accuracy
print("***K Nearest Neighbors***")
print("Accuracy Score:", knn_acc_score)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
y_pred_prob = knn.predict_proba(X_test)[:, 1]
print("ROC_AUC Score:")
print(roc_auc_score(y_test, y_pred_prob))


# **Decision Tree**

# In[ ]:


# Decision Tree Hyper parameter Tuning
param_grid = {'max_depth': np.arange(1, 20)}
decision_tree = DecisionTreeClassifier()
decision_tree_cv = GridSearchCV(decision_tree, param_grid, cv=5)
decision_tree_cv.fit(X, y)
print("best params", decision_tree_cv.best_params_)
Example #40
clf = SGDClassifier(loss="hinge")
calibrated_clf = CalibratedClassifierCV(clf, cv=5, method='sigmoid')
calibrated_clf.fit(X_train, y_train)

#Decision Tree
classifiers = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifiers.fit(X_train, y_train)

#Random Forest
model = RandomForestClassifier(n_estimators=100,
                               criterion='entropy',
                               random_state=0)
model.fit(X_train, y_train)

r_probs = [0 for _ in range(len(y_test))]
knn_probs = knn.predict_proba(X_test)
classifier_probs = classifier.predict_proba(X_test)
classifiers_probs = classifiers.predict_proba(X_test)
calibrated_clf_probs = calibrated_clf.predict_proba(X_test)
gnb_probs = gnb.predict_proba(X_test)
model_probs = model.predict_proba(X_test)

knn_probs = knn_probs[:, 1]
classifier_probs = classifier_probs[:, 1]
calibrated_clf_probs = calibrated_clf_probs[:, 1]
gnb_probs = gnb_probs[:, 1]
classifiers_probs = classifiers_probs[:, 1]
model_probs = model_probs[:, 1]

from sklearn.metrics import roc_curve, roc_auc_score
r_auc = roc_auc_score(y_test, r_probs)
Example #41
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
features_train = sc.fit_transform(features_train)
features_test = sc.transform(features_test)

# Fitting K-NN to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(
    n_neighbors=5, p=2
)  #When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2
classifier.fit(features_train, labels_train)

#Calculate Class Probabilities
probability = classifier.predict_proba(features_test)

# Predicting the class labels
labels_pred = classifier.predict(features_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(labels_test, labels_pred)
print(cm)

#Visualization

# Plot the decision boundary. For that, we will assign a color to each

x_min, x_max = features_train[:, 0].min() - 1, features_train[:, 0].max() + 1
y_min, y_max = features_train[:, 1].min() - 1, features_train[:, 1].max() + 1
Example #42
def models(response_col, predicts_col):

    # Train test split
    X_train = predicts_col[:int(predicts_col.shape[0] * 0.7)]
    X_test = predicts_col[int(predicts_col.shape[0] * 0.7):]
    y_train = response_col[:int(response_col.shape[0] * 0.7)]
    y_test = response_col[int(response_col.shape[0] * 0.7):]

    # Replace nans with median
    X_train = X_train.fillna(X_train.median())
    X_test = X_test.fillna(X_test.median())
    y_train = y_train.fillna(y_train.median())
    y_test = y_test.fillna(y_train.median())

    # Normalize
    normalizer = preprocessing.Normalizer(norm="l2")
    X_train_norm = normalizer.fit_transform(X_train)
    X_test_norm = normalizer.fit_transform(X_test)

    # Fit random forest model
    rf_model = RandomForestClassifier(oob_score=True, random_state=1234)
    rf_model.fit(X_train_norm, y_train)

    rf_preds = rf_model.predict(X_test_norm)

    print("Random Forest:\n", classification_report(y_test, rf_preds))

    # RF ROC plot
    model_name = "Random Forest"
    rf_probs = rf_model.predict_proba(X_test_norm)
    prob = rf_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # Logistic regression
    log_reg = LogisticRegression(max_iter=300, fit_intercept=True)
    log_reg_fit = log_reg.fit(X_train_norm, y_train)
    log_preds = log_reg_fit.predict(X_test_norm)

    print("Logistic:\n", classification_report(y_test, log_preds))

    # Logistic ROC plot
    model_name = "Logistic"
    log_probs = log_reg.predict_proba(X_test_norm)
    prob = log_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # SVM
    svm_model = svm.SVC(probability=True)
    svm_fitted = svm_model.fit(X_train_norm, y_train)
    svm_preds = svm_fitted.predict(X_test_norm)

    print("SVM:\n", classification_report(y_test, svm_preds))

    # SVM ROC plot
    model_name = "SVM"
    svm_probs = svm_model.predict_proba(X_test_norm)
    prob = svm_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # KNN
    knn_model = KNeighborsClassifier(n_neighbors=3)
    knn_fitted = knn_model.fit(X_train_norm, y_train)
    knn_preds = knn_fitted.predict(X_test_norm)

    print("KNN:\n", classification_report(y_test, knn_preds))

    # KNN ROC plot
    model_name = "K-Nearest Neighbor"
    knn_probs = knn_model.predict_proba(X_test_norm)
    prob = knn_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # Decision tree classifier
    dtc_model = DecisionTreeClassifier(random_state=1234)
    dtc_fitted = dtc_model.fit(X_train_norm, y_train)
    dtc_preds = dtc_fitted.predict(X_test_norm)

    print("Decision tree classifier:\n",
          classification_report(y_test, dtc_preds))

    # Decision Tree Classifier ROC plot
    model_name = "Decision Tree Classifier"
    dtc_probs = dtc_model.predict_proba(X_test_norm)
    prob = dtc_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # Linear discriminant analysis
    lda_model = LinearDiscriminantAnalysis()
    lda_fitted = lda_model.fit(X_train_norm, y_train)
    lda_preds = lda_fitted.predict(X_test_norm)

    print("Linear discriminant analysis:\n",
          classification_report(y_test, lda_preds))

    # Linear Discriminant Analysis ROC plot
    model_name = "Linear Discriminant Analysis"
    lda_probs = lda_model.predict_proba(X_test_norm)
    prob = lda_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # Gaussian Naive Bayes
    gnb_model = GaussianNB()
    gnb_fitted = gnb_model.fit(X_train_norm, y_train)
    gnb_preds = gnb_fitted.predict(X_test_norm)

    print("Gaussian Naive Bayes:\n", classification_report(y_test, gnb_preds))

    # Gaussian Naive Bayes ROC plot
    model_name = "Gaussian Naive Bayes"
    gnb_probs = gnb_model.predict_proba(X_test_norm)
    prob = gnb_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # XGBoost
    xg_model = xgb.XGBClassifier(
        tree_method="approx",
        predictor="cpu_predictor",
        verbosity=1,
        eval_metric=["merror", "map", "auc"],
        objective="binary:logistic",
        eta=0.3,
        n_estimators=100,
        colsample_bytree=0.95,
        max_depth=3,
        reg_alpha=0.001,
        reg_lambda=150,
        subsample=0.8,
    )

    xgb_model = xg_model.fit(X_train_norm, y_train)
    xgb_preds = xgb_model.predict(X_test_norm)

    print("XGBoost:\n", classification_report(y_test, xgb_preds))

    # XGB ROC plot
    model_name = "XGBoost"
    xgb_probs = xgb_model.predict_proba(X_test_norm)
    prob = xgb_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # Statsmodels logit to get a coefficient summary
    # predictor = sm.add_constant(X_train)
    predictor = X_train

    logit_model = sm.Logit(y_train, predictor)
    logit_fitted = logit_model.fit()

    # ols_model = sm.OLS(y_train, predictor)
    # ols_fitted = ols_model.fit()

    print(logit_fitted.summary())
    # print(ols_fitted.summary())
    # print(ols_fitted.mse_model)
    # print(ols_fitted.mse_resid)
    # print(ols_fitted.mse_total)

    # Create performance table
    model_names = [
        "Random Forest",
        "Logistic",
        "SVM",
        "KNN",
        "Decision Trees",
        "LDA",
        "Gaussian Naive Bayes",
        "XGBoost",
    ]
    predictions = [
        rf_preds,
        log_preds,
        svm_preds,
        knn_preds,
        dtc_preds,
        lda_preds,
        gnb_preds,
        xgb_preds,
    ]

    perf_table(model_names, predictions, y_test)

    return
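# models() calls two helpers, auc_plot and perf_table, that are not defined in
# this snippet. A minimal sketch of what they might look like, inferred from
# the call sites above (signatures and behavior are assumptions):
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve


def auc_plot(prob, y_test, model_name):
    # Plot a single ROC curve with its AUC in the legend.
    fpr, tpr, _ = roc_curve(y_test, prob)
    plt.plot(fpr, tpr,
             label='%s (AUC = %.3f)' % (model_name, roc_auc_score(y_test, prob)))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()


def perf_table(model_names, predictions, y_test):
    # Tabulate accuracy for each model's predictions.
    rows = [(name, accuracy_score(y_test, preds))
            for name, preds in zip(model_names, predictions)]
    print(pd.DataFrame(rows, columns=['model', 'accuracy']))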
示例#43
0
best = result.idxmin()
print(best)
print(result[best])

neigh = KNeighborsClassifier(n_neighbors=best, weights='distance')
neigh.fit(x, y)

data = pd.read_table('objectMatrixTest.txt', sep=',', header=None)

xt = data[list(range(1, 19))]

print(xt)

xt = pd.DataFrame(scale(xt))

yt = neigh.predict_proba(xt)

lab = data[0]

with open("SubmissionKNC.csv", "w") as f:
    f.write(
        'Id,Prediction1,Prediction2,Prediction3,Prediction4,Prediction5,Prediction6,Prediction7,Prediction8,Prediction9\n'
    )
    for i in range(0, yt.shape[0]):
        f.write(lab[i])
        f.write(",")
        for j in range(0, 8):
            f.write(str(yt[i][j]))
            f.write(",")
        f.write(str(yt[i][8]))
        f.write("\n")
示例#44
0
def tune_knn(tree,
             X_train,
             X_train_feature,
             y_train,
             val_frac,
             seed,
             logger=None,
             cv=5):
    """
    Tunes KNN by choosing hyperparameters that give the best pearson
    correlation to the tree predictions.
    """
    n_neighbors_grid = [3, 5, 7, 9, 11, 13, 15, 31, 45, 61]

    if not val_frac:
        # no validation fraction given: skip tuning and return a default model
        knn_clf = KNeighborsClassifier(n_neighbors=3, weights='uniform')
        return knn_clf.fit(X_train_feature, y_train)

    tune_start = time.time()

    # select a fraction of the training data
    n_samples = int(X_train.shape[0] * val_frac)
    np.random.seed(seed)
    val_indices = np.random.choice(np.arange(X_train.shape[0]), size=n_samples)  # note: samples with replacement by default

    X_val = X_train[val_indices]
    X_val_feature = X_train_feature[val_indices]
    y_val = y_train[val_indices]

    # result containers
    results = []
    fold = 0

    # tune n_neighbors
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
    for train_index, test_index in skf.split(X_val_feature, y_val):
        fold += 1

        # obtain fold data
        X_val_train = X_val[train_index]
        X_val_test = X_val[test_index]
        X_val_feature_train = X_val_feature[train_index]
        X_val_feature_test = X_val_feature[test_index]
        y_val_train = y_val[train_index]

        # gridsearch n_neighbors
        correlations = []
        for n_neighbors in n_neighbors_grid:
            start = time.time()

            # fit a tree ensemble and surrogate model
            m1 = clone(tree).fit(X_val_train, y_val_train)
            m2 = KNeighborsClassifier(n_neighbors=n_neighbors,
                                      weights='uniform').fit(
                                          X_val_feature_train, y_val_train)

            # generate predictions
            m1_proba = m1.predict_proba(X_val_test)[:, 1]
            m2_proba = m2.predict_proba(X_val_feature_test)[:, 1]

            # measure correlation
            correlation = pearsonr(m1_proba, m2_proba)[0]
            correlations.append(correlation)

            if logger:
                s = '[Fold {}] n_neighbors={:<2}: {:.3f}s; corr={:.3f}'
                logger.info(
                    s.format(fold, n_neighbors,
                             time.time() - start, correlation))

        results.append(correlations)
    results = np.vstack(results).mean(axis=0)
    best_ndx = np.argmax(results)
    best_n_neighbors = n_neighbors_grid[best_ndx]

    if logger:
        logger.info('chosen n_neighbors: {}'.format(best_n_neighbors))
        logger.info('total tuning time: {:.3f}s'.format(time.time() -
                                                        tune_start))
        logger.info('training...')

    train_start = time.time()
    knn_clf = KNeighborsClassifier(n_neighbors=best_n_neighbors,
                                   weights='uniform')
    knn_clf = knn_clf.fit(X_train_feature, y_train)

    if logger:
        logger.info('total training time: {:.3f}s'.format(time.time() -
                                                          train_start))

    return knn_clf
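# A hypothetical call on synthetic data (tune_knn itself assumes numpy, clone,
# pearsonr, StratifiedKFold and KNeighborsClassifier are imported in its
# module; X_demo_feature stands in for whatever feature representation the
# surrogate is meant to see):
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
X_demo = rng.rand(500, 10)
y_demo = rng.randint(0, 2, 500)
X_demo_feature = X_demo  # stand-in: the surrogate sees the raw features here
tree = RandomForestClassifier(n_estimators=50, random_state=0)
surrogate = tune_knn(tree, X_demo, X_demo_feature, y_demo,
                     val_frac=0.5, seed=0, logger=None, cv=5)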
示例#45
0
row = '{:<16}|{:>11} |{:>8} |{:>8}'
print(row.format('Naive Bayes:', precisionNB, recallNB, f1NB))
print('Support Vector  |')
print(row.format(' Machines:', precisionSVM, recallSVM, f1SVM))
print(row.format('Nearest Neigh:', precisionNN, recallNN, f1NN))
print(row.format('Nearest Neigh 2:', precisionNN2, recallNN2, f1NN2))
print(row.format('Neur Network:', precisionANN, recallANN, f1ANN))
print(row.format('Neur Network 2:', precisionANN2, recallANN2, f1ANN2))

pr_y_test_pred_DT = clfDT.predict_proba(x_test)
pr_y_test_pred_SVM = grid.predict_proba(x_test)
pr_y_test_pred_NN = clfNN.predict_proba(x_test)
pr_y_test_pred_ANN2 = clfANN2.predict_proba(x_test)
pr_y_test_pred_NB = clfNB.predict_proba(x_test)

#clfSVM.predict_proba

#ROC curve
fprDT, tprDT, thresholdsDT = roc_curve(y_test,
                                       pr_y_test_pred_DT[:, 1],
                                       pos_label=None)
fprSVM, tprSVM, thresholdsSVM = roc_curve(y_test,
                                          pr_y_test_pred_SVM[:, 1],
                                          pos_label=None)
fprNN, tprNN, thresholdsNN = roc_curve(y_test,
                                       pr_y_test_pred_NN[:, 1],
                                       pos_label=None)
示例#46
0
#!/usr/bin/env python
# -*- coding=utf-8 -*-
__author__ = "柯博文老師 Powen Ko, www.powenko.com"

X=[[9,9],[9.2,9.2],[9.6,9.2],[9.2,9.2],[6.7,7.1],[7,7.4],[7.6,7.5],
   [7.2,10.3], [7.3,10.5], [7.2,9.2], [7.3,10.2], [7.2,9.7], [7.3,10.1], [7.3,10.1]]
y=[1,1,1,1,1,1,1,
   2,2,2,2,2,2,2]

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, y)
print("預測答案=",neigh.predict([[7,9]]))
print("預測樣本距離=",neigh.predict_proba([[7,9]]))   #      測試數據X的返回概率估計。
from sklearn.metrics import confusion_matrix
con = confusion_matrix(y_test, y_pred)
print(con)

#checking Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

#Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

#ROC and AUC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
clf_probs = reg.predict_proba(x_test)
clf_probs = clf_probs[:, 1]
print(clf_probs)
ras = roc_auc_score(y_test, clf_probs)
print("Logistic : ROC AUC = %.3f" % (ras))
from sklearn.preprocessing import label_binarize
y = label_binarize(y_test, classes=[1, 2])
n_classes = y.shape[1]
fpr, tpr, _ = roc_curve(y, clf_probs)
plt.figure()
lw = 2
plt.plot(fpr,
         tpr,
         color="orange",
         lw=lw,
         label="ROC curve (area = %0.2f)" % ras)
示例#48
0
def knn(X_train, y_train, X_test):
    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
    knn.fit(X_train, y_train)

    return knn.predict(X_test), knn.predict_proba(X_train), knn.predict_proba(
        X_test)
示例#49
0
# In[36]:

classifier_kn.fit(x_train, y_train)

# In[37]:

guesses = classifier_kn.predict(x_test)

# In[38]:

guesses_prob = classifier_kn.predict_proba(x_test)
guesses_prob  # displayed as notebook cell output

# In[39]:

print(accuracy_score(y_test, guesses))
print(recall_score(y_test, guesses))
print(precision_score(y_test, guesses))
print(f1_score(y_test, guesses))

# In[40]:
示例#50
0

            X = np.asarray(features)[:data_size[j], :]
            y = np.asarray(labels)[:data_size[j]]
            # Split train and validation data set
            X_input, X_validate, y_input, y_validate = train_test_split(X, y, test_size=0.1, shuffle=True)
            # Split Train and Test data set
            X_train, X_test, y_train, y_test = train_test_split(X_input, y_input, test_size=0.1, shuffle=True)
            # Optimize classifier using train and test data
            pso = PSO(knn_optimize, [classifier_neighbor_range[1]], [classifier_neighbor_range[0]],
                      fitness_minimize=False, cost_function_args=(X_input, y_input),
                      verbose=False, ndview=False, max_iteration=50)
            knn_particles, knn_global_best, knn_best_costs = pso.run()

            # Classify using test data set
            classifier = KNeighborsClassifier(n_neighbors=int(knn_global_best["position"][0]))
            classifier.fit(X_train, y_train)
            test_probs = classifier.predict_proba(X_test)
            inpsize_test.append(data_size[j])


            # Compute ROC curve and ROC area for each class of test data
            y_test_bin = np.empty((len(y_test), len(label_map)))
            for k in range(y_test_bin.shape[0]):
                arr = [0 for _ in range(len(label_map))]
                arr[y_test[k]] = 1  # one-hot encode the k-th test label
                y_test_bin[k] = np.asarray(arr)
            print('Test Label original shape: ' + str(np.asarray(y_test).shape))
            print('Test Label binary shape: ' + str(y_test_bin.shape))
            print('Test score shape:' + str(test_probs.shape))
            fpr = dict()
            tpr = dict()
            roc_auc = dict()
示例#51
0
                             leaf_size=30,
                             p=2,
                             metric='minkowski',
                             metric_params=None)

# In[68]:

knMod.fit(X_train, y_train)

# In[69]:

knMod.score(X_test, y_test)

# In[70]:

test_probs = knMod.predict_proba(X_test.values)[:, 1]  # positive-class probabilities

# In[71]:

roc_auc_score(y_test, test_probs, average='macro', sample_weight=None)

# In[72]:

glmMod = LogisticRegression(penalty='l1',
                            dual=False,
                            tol=0.0001,
                            C=1.0,
                            fit_intercept=True,
                            intercept_scaling=1,
                            class_weight=None,
                            random_state=None,
示例#52
0
    'time': time_list
})

resultDf['trueMove'] = resultDf['trueMove'].astype(int)
resultDf['equal'] = (resultDf.predictionSVM == resultDf.trueMove.astype(int))

print('--------------Plot ROC-AUC --------------')
from sklearn import metrics
print("NB Accuracy", metrics.accuracy_score(resultDf.trueMove, pred_list_NB))

plt.figure(figsize=(9, 7))

# ROC curves must be scored against the ground truth, not each model's own
# predicted labels
y_true = resultDf.trueMove
y_pred_proba = nb.predict_proba(X_test_array)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_true, y_pred_proba)
auc = metrics.roc_auc_score(y_true, y_pred_proba)
plt.plot(fpr, tpr, label="NB auc=%6.3f" % auc)

y_pred_proba2 = knn.predict_proba(X_test_array)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_true, y_pred_proba2)
auc2 = metrics.roc_auc_score(y_true, y_pred_proba2)
plt.plot(fpr, tpr, label="KNN auc=%6.3f" % auc2)

y_pred_proba3 = svm.predict_proba(X_test_array)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_true, y_pred_proba3)
auc3 = metrics.roc_auc_score(y_true, y_pred_proba3)
plt.plot(fpr, tpr, label="SVM auc=%6.3f" % auc3)

plt.xlabel("false positive")
plt.ylabel("true positive")
plt.legend(loc=4)
plt.show()
示例#53
0
        list_probs_gp = []
        pred_gp = np.zeros(shape=(200, ))
        for i in range(200):
            list_probs_gp.append(
                fun.probabilities_gp([
                    fun.LogCP2(X_test.iloc[i].values, m_scen1_train, ld_scen1,
                               il_scen1, 0.5).value(),
                    fun.LogCP2(X_test.iloc[i].values, m_scen2_train, ld_scen2,
                               il_scen2, 0.5).value()
                ]))
            pred_gp[i] = fun.classif(list_probs_gp[-1])
        proba_gp = np.asarray(list_probs_gp)

        classifier_knn = KNeighborsClassifier(n_neighbors=10)
        classifier_knn.fit(X_train, y_train)
        proba_knn = classifier_knn.predict_proba(X_test)
        pred_knn = classifier_knn.predict(X_test)

        classifier_rf = RandomForestClassifier(n_estimators=100,
                                               max_depth=2,
                                               random_state=0)
        classifier_rf.fit(X_train, y_train)
        proba_rf = classifier_rf.predict_proba(X_test)
        pred_rf = classifier_rf.predict(X_test)

        classifier_mlp = MLPClassifier(solver='lbfgs',
                                       alpha=1e-5,
                                       hidden_layer_sizes=(20, ),
                                       random_state=1)
        classifier_mlp.fit(X_train, y_train)
        proba_mlp = classifier_mlp.predict_proba(X_test)
示例#54
0
                                   min_df=2,
                                   max_features=1000,
                                   stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(clean_data['tidy'])

train_d = tfidf[:7000, :]
test_d = tfidf[7000:, :]

x_train, x_valid, y_train, y_valid = train_test_split(
    train_d, clean_data['score'][:7000], random_state=42, test_size=0.3)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
with open("knn_tfidf.sav", "wb") as f:
    pickle.dump(knn, f)  # persist the fitted KNN model

prediction = knn.predict_proba(x_valid)
prediction_int = prediction[:, 1] >= 0.3  # label positive when P(class 1) >= 0.3
prediction_int = prediction_int.astype(int)  # np.int is deprecated; use the builtin

print("Accuracy= ", accuracy_score(y_valid, prediction_int))
print("Precision= ", precision_score(y_valid, prediction_int))
print("F1 score= ", f1_score(y_valid, prediction_int))

test_pred = knn.predict_proba(test_d)
test_pred_int = test_pred[:, 1] >= 0.3
test_pred_int = test_pred_int.astype(int)
clean_data.loc[clean_data.index[7000:], "score"] = test_pred_int  # .loc avoids chained assignment
submission = clean_data[['id', 'score', 'review']][7000:]
submission.to_csv('outputtfidf_KNN.csv', index=False)
示例#55
0
class KNeighborsClassifier(BaseEstimator, ClassifierMixin):
    """k nearest neighbors classifier.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use.

    weights : str or callable, optional (default = 'uniform')
        weight function used in prediction.  Possible values:

        - 'uniform' : uniform weights.  All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          in this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors. Ignored if ``metric``
        is either 'dtw', 'dtw_classic', 'dtw_sakoechiba', 'dtw_itakura',
        'dtw_multiscale', 'dtw_fast' or 'boss' ('brute' will be used).

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or KDTree.  This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree.  The optimal value depends on the
        nature of the problem.

    metric : string or DistanceMetric object (default = 'minkowski')
        the distance metric to use for the tree.  The default metric is
        minkowski, and with p=2 is equivalent to the standard Euclidean
        metric. See the documentation of the DistanceMetric class from
        scikit-learn for a list of available metrics. For Dynamic Time
        Warping, the available metrics are 'dtw', 'dtw_classic',
        'dtw_sakoechiba', 'dtw_itakura', 'dtw_multiscale', 'dtw_fast'
        and 'boss'.

    p : integer, optional (default = 2)
        Power parameter for the Minkowski metric. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``n_jobs=-1``, then the number of jobs is set to the number of CPU
        cores. Doesn't affect :meth:`fit` method.

    """
    def __init__(self,
                 n_neighbors=1,
                 weights='uniform',
                 algorithm='auto',
                 leaf_size=30,
                 p=2,
                 metric='minkowski',
                 metric_params=None,
                 n_jobs=1,
                 **kwargs):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.metric = metric
        self.metric_params = metric_params
        self.n_jobs = n_jobs
        self.kwargs = kwargs

    def fit(self, X, y):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Training vector.

        y : array-like, shape = (n_samples,)
            Class labels for each data sample.

        Returns
        -------
        self : object

        """
        X, y = check_X_y(X, y)

        if self.metric == 'dtw':
            self._clf = SklearnKNN(n_neighbors=self.n_neighbors,
                                   weights=self.weights,
                                   algorithm='brute',
                                   metric=dtw,
                                   metric_params=self.metric_params,
                                   n_jobs=self.n_jobs,
                                   **self.kwargs)

        elif self.metric == 'dtw_classic':
            self._clf = SklearnKNN(n_neighbors=self.n_neighbors,
                                   weights=self.weights,
                                   algorithm='brute',
                                   metric=dtw_classic,
                                   metric_params=self.metric_params,
                                   n_jobs=self.n_jobs,
                                   **self.kwargs)

        elif self.metric == 'dtw_sakoechiba':
            n_timestamps = X.shape[1]
            if self.metric_params is None:
                region = sakoe_chiba_band(n_timestamps)
            else:
                if 'window_size' not in self.metric_params.keys():
                    window_size = 0.1
                else:
                    window_size = self.metric_params['window_size']
                region = sakoe_chiba_band(n_timestamps, window_size)
            self._clf = SklearnKNN(n_neighbors=self.n_neighbors,
                                   weights=self.weights,
                                   algorithm='brute',
                                   metric=dtw_region,
                                   metric_params={'region': region},
                                   n_jobs=self.n_jobs,
                                   **self.kwargs)

        elif self.metric == 'dtw_itakura':
            n_timestamps = X.shape[1]
            if self.metric_params is None:
                region = itakura_parallelogram(n_timestamps)
            else:
                if 'max_slope' not in self.metric_params.keys():
                    max_slope = 2.
                else:
                    max_slope = self.metric_params['max_slope']
                region = itakura_parallelogram(n_timestamps, max_slope)
            self._clf = SklearnKNN(n_neighbors=self.n_neighbors,
                                   weights=self.weights,
                                   algorithm='brute',
                                   metric=dtw_region,
                                   metric_params={'region': region},
                                   n_jobs=self.n_jobs,
                                   **self.kwargs)

        elif self.metric == 'dtw_multiscale':
            self._clf = SklearnKNN(n_neighbors=self.n_neighbors,
                                   weights=self.weights,
                                   algorithm='brute',
                                   metric=dtw_multiscale,
                                   metric_params=self.metric_params,
                                   n_jobs=self.n_jobs,
                                   **self.kwargs)

        elif self.metric == 'dtw_fast':
            self._clf = SklearnKNN(n_neighbors=self.n_neighbors,
                                   weights=self.weights,
                                   algorithm='brute',
                                   metric=dtw_fast,
                                   metric_params=self.metric_params,
                                   n_jobs=self.n_jobs,
                                   **self.kwargs)

        elif self.metric == 'boss':
            self._clf = SklearnKNN(n_neighbors=self.n_neighbors,
                                   weights=self.weights,
                                   algorithm='brute',
                                   metric=boss,
                                   metric_params=self.metric_params,
                                   n_jobs=self.n_jobs,
                                   **self.kwargs)

        else:
            self._clf = SklearnKNN(n_neighbors=self.n_neighbors,
                                   weights=self.weights,
                                   algorithm=self.algorithm,
                                   leaf_size=self.leaf_size,
                                   p=self.p,
                                   metric=self.metric,
                                   metric_params=self.metric_params,
                                   n_jobs=self.n_jobs,
                                   **self.kwargs)

        self._clf.fit(X, y)
        return self

    def predict_proba(self, X):
        """Return probability estimates for the test data X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Test samples.

        Returns
        -------
        p : array, shape = (n_samples, n_classes)
            Probability estimates.

        """
        check_is_fitted(self, '_clf')
        return self._clf.predict_proba(X)

    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Test samples.

        Returns
        -------
        y : array-like, shape = (n_samples,)
            Class labels for each data sample.

        """
        check_is_fitted(self, '_clf')
        return self._clf.predict(X)
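# A usage sketch on toy series (assuming this class and its DTW metrics are
# importable from the surrounding module, as in pyts-style packages):
import numpy as np

rng = np.random.RandomState(42)
X_toy = rng.randn(40, 30)            # 40 univariate series, 30 timestamps each
y_toy = rng.randint(0, 2, 40)
clf = KNeighborsClassifier(n_neighbors=3, metric='dtw_sakoechiba',
                           metric_params={'window_size': 0.2})
clf.fit(X_toy, y_toy)
print(clf.predict(X_toy[:5]))
print(clf.predict_proba(X_toy[:5]))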
示例#56
0
def cross_val_default(X, y, fold_type='kfold', nr_folds=5):
    """
    Cross validation for multiple classifiers with default settings
    """

    if fold_type == 'stratified':
        kf = StratifiedKFold(n_splits=nr_folds, shuffle=True)
    else:
        kf = KFold(n_splits=nr_folds, shuffle=True)

    lr_train, lr_vals = [], []
    sv_train, sv_vals = [], []
    rf_train, rf_vals = [], []
    gb_train, gb_vals = [], []
    kn_train, kn_vals = [], []
    ens_train, ens_vals = [], []

    for train_ind, val_ind in kf.split(X, y):
        x_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
        x_val, y_val = X.iloc[val_ind], y.iloc[val_ind]

        scaler = MinMaxScaler()
        x_train_scaled = scaler.fit_transform(x_train)
        x_val_scaled = scaler.transform(x_val)

        lr = LogisticRegression()
        lr.fit(x_train_scaled, y_train)
        y_score_val = lr.predict_proba(x_val_scaled)[:, 1]
        y_score_train = lr.predict_proba(x_train_scaled)[:, 1]
        lr_vals.append(roc_auc_score(y_val, y_score_val))
        lr_train.append(roc_auc_score(y_train, y_score_train))

        sv = SVC(probability=True)
        sv.fit(x_train_scaled, y_train)
        y_score_val = sv.predict_proba(x_val_scaled)[:, 1]
        y_score_train = sv.predict_proba(x_train_scaled)[:, 1]
        sv_vals.append(roc_auc_score(y_val, y_score_val))
        sv_train.append(roc_auc_score(y_train, y_score_train))

        rf = RandomForestClassifier()
        rf.fit(x_train, y_train)
        y_score_val = rf.predict_proba(x_val)[:, 1]
        y_score_train = rf.predict_proba(x_train)[:, 1]
        rf_vals.append(roc_auc_score(y_val, y_score_val))
        rf_train.append(roc_auc_score(y_train, y_score_train))

        gb = GradientBoostingClassifier()
        gb.fit(x_train, y_train)
        y_score_val = gb.predict_proba(x_val)[:, 1]
        y_score_train = gb.predict_proba(x_train)[:, 1]
        gb_vals.append(roc_auc_score(y_val, y_score_val))
        gb_train.append(roc_auc_score(y_train, y_score_train))

        kn = KNeighborsClassifier()
        kn.fit(x_train_scaled, y_train)
        y_score_val = kn.predict_proba(x_val_scaled)[:, 1]
        y_score_train = kn.predict_proba(x_train_scaled)[:, 1]
        kn_vals.append(roc_auc_score(y_val, y_score_val))
        kn_train.append(roc_auc_score(y_train, y_score_train))

        # Note: VotingClassifier clones and refits its estimators, so the
        # ensemble members are trained here on the unscaled features
        voting_classifer = VotingClassifier(estimators=[('kn', kn), ('gb', gb),
                                                        ('rf', rf),
                                                        ('lr', lr)],
                                            voting='soft',
                                            n_jobs=-1)
        ens = voting_classifer.fit(x_train, y_train)
        y_score_val = ens.predict_proba(x_val)[:, 1]
        y_score_train = ens.predict_proba(x_train)[:, 1]
        ens_vals.append(roc_auc_score(y_val, y_score_val))
        ens_train.append(roc_auc_score(y_train, y_score_train))

    print('ROC_AUC scores: \n')
    print(
        f'log reg:        val {np.mean(lr_vals):.3f} +- {np.std(lr_vals):.3f} | '
        f'train {np.mean(lr_train):.3f} +- {np.std(lr_train):.3f}')
    print(
        f'random forest:  val {np.mean(rf_vals):.3f} +- {np.std(rf_vals):.3f} | '
        f'train {np.mean(rf_train):.3f} +- {np.std(rf_train):.3f}')
    print(
        f'gradient boost: val {np.mean(gb_vals):.3f} +- {np.std(gb_vals):.3f} | '
        f'train {np.mean(gb_train):.3f} +- {np.std(gb_train):.3f}')
    print(
        f'knn:            val {np.mean(kn_vals):.3f} +- {np.std(kn_vals):.3f} | '
        f'train {np.mean(kn_train):.3f} +- {np.std(kn_train):.3f}')
    print(
        f'SVC:            val {np.mean(sv_vals):.3f} +- {np.std(sv_vals):.3f} | '
        f'train {np.mean(sv_train):.3f} +- {np.std(sv_train):.3f}')
    print(
        f'ensemble:       val {np.mean(ens_vals):.3f} +- {np.std(ens_vals):.3f} | '
        f'train {np.mean(ens_train):.3f} +- {np.std(ens_train):.3f}')

    return None
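# Example call; the function indexes with .iloc, so X and y should be a
# DataFrame and a Series (synthetic data here just to show the shapes):
import pandas as pd
from sklearn.datasets import make_classification

X_arr, y_arr = make_classification(n_samples=300, n_features=10, random_state=0)
cross_val_default(pd.DataFrame(X_arr), pd.Series(y_arr),
                  fold_type='stratified', nr_folds=5)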
示例#57
0
                        cv=cv,
                        refit=False,
                        verbose=1)
kNNgsEda.fit(featuresEda[includeRowsTrain, :], labels[includeRowsTrain],
             groups[includeRowsTrain])
bestneighbors_Eda = kNNgsEda.best_params_['n_neighbors']

knnCpredAll = np.zeros(np.shape(labels))
knnCpredAcc = np.zeros(np.shape(labels))
knnCpredEda = np.zeros(np.shape(labels))
for train, test in cv.split(featuresAll, labels, groups):
    knnCAll = KNeighborsClassifier(n_neighbors=bestneighbors_All,
                                   algorithm='auto',
                                   metric='euclidean')
    knnCAll.fit(featuresAll[train, :], labels[train])
    knnCpredAll[test] = knnCAll.predict_proba(featuresAll[test, :])[:, 1]

    knnCAcc = KNeighborsClassifier(n_neighbors=bestneighbors_Acc,
                                   algorithm='auto',
                                   metric='euclidean')
    knnCAcc.fit(featuresAcc[train, :], labels[train])
    knnCpredAcc[test] = knnCAcc.predict_proba(featuresAcc[test, :])[:, 1]

    knnCEda = KNeighborsClassifier(n_neighbors=bestneighbors_Eda,
                                   algorithm='auto',
                                   metric='euclidean')
    knnCEda.fit(featuresEda[train, :], labels[train])
    knnCpredEda[test] = knnCEda.predict_proba(featuresEda[test, :])[:, 1]

# Save the scores for further analysis
#np.save('knnCpredAllScores_UTD',knnCpredAll)
示例#58
0
plt.ylabel('misclassification rate')
pml.savefig('knnClassifyErrVsK.pdf')
plt.show()

#cross_validate
scores = []
for k in ks:
    knn = KNN(n_neighbors=k)
    score = cross_val_score(knn, x_train, y_train, cv=5)
    scores.append(1 - score.mean())
plt.figure()
plt.plot(ks, scores, 'ko-')
min_k = ks[np.argmin(scores)]
plt.plot([min_k, min_k], [0, 1.0], 'b-')
plt.xlabel('k')
plt.ylabel('misclassification rate')
plt.title('5-fold cross validation, n-train = 200')

# draw a heat map showing the predicted probability of each class
knn = KNN(n_neighbors=10)
knn.fit(x_train, y_train)
xy_predic = knn.predict_proba(xy)
levels = np.arange(0, 1.01, 0.1)
for i in range(3):
    plt.figure()
    plt.contourf(xy_predic[:, i].ravel().reshape(200, 200), levels)
    plt.colorbar()
    plt.title('p(y=%s | data, k=10)' % (i))
    pml.savefig('knnClassifyDemo_hotmap_%s.png' % (i))
plt.show()
示例#59
0
def Co_KNN_SVM(train_Y, train_X, test_Y, test_X, savepath=None):
    # Number of samples added to the other classifier's training set per iteration
    temp_num_svm = 55
    temp_num_knn = 55

    # Number of co-training iterations
    loop_num = 6

    # K for KNN
    K = 4

    # Fixed test samples and labels for KNN and SVM (never modified)
    fixed_test_X = test_X.copy()
    fixed_test_Y = test_Y.copy()

    # Accuracy history for KNN
    accuracy_knn_list = []
    # Accuracy history for SVM
    accuracy_svm_list = []

    # List of (label, features) tuples forming the KNN training set
    train_knn_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        train_Y.copy(), train_X.copy())
    # List of (label, features) tuples forming the KNN test set
    test_knn_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        test_Y.copy(), test_X.copy())
    # List of (label, features) tuples forming the SVM training set
    train_svm_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        train_Y.copy(), train_X.copy())
    # List of (label, features) tuples forming the SVM test set
    test_svm_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        test_Y.copy(), test_X.copy())
    # Co-training loop
    for h in range(1, loop_num + 1):
        print(len(train_knn_Y_X_tuple_list))
        print(len(test_knn_Y_X_tuple_list))
        print(len(train_svm_Y_X_tuple_list))
        print(len(test_svm_Y_X_tuple_list))
        # Get the SVM training labels and features
        train_Y_svm_from_tuple, train_X_svm_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            train_svm_Y_X_tuple_list.copy())
        # Get the KNN training labels and features
        train_Y_knn_from_tuple, train_X_knn_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            train_knn_Y_X_tuple_list)

        # train_X_svm_from_tuple_temp, test_X_svm_temp, train_Y_svm_from_tuple_temp, test_Y_svm_temp = train_test_split(
        #     train_X_svm_from_tuple, train_Y_svm_from_tuple, test_size=11)
        #
        # train_X_knn_from_tuple_temp, test_X_knn_temp, train_Y_knn_from_tuple_temp, test_Y_knn_temp = train_test_split(
        #     train_X_knn_from_tuple, train_Y_knn_from_tuple, test_size=11)

        # Get the SVM test labels and features
        test_Y_svm_from_tuple, test_X_svm_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            test_svm_Y_X_tuple_list.copy())

        # Get the KNN test labels and features
        test_Y_knn_from_tuple, test_X_knn_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            test_knn_Y_X_tuple_list)

        # test_X_svm_from_tuple, test_X_svm_temp, test_Y_svm_from_tuple, test_Y_svm_temp = train_test_split(
        #     test_X_svm_from_tuple, test_Y_svm_from_tuple,
        #     test_size=0.05, random_state=42)
        #
        # test_X_knn_from_tuple, test_X_knn_temp, test_Y_knn_from_tuple, test_Y_knn_temp = train_test_split(
        #     test_X_knn_from_tuple, test_Y_knn_from_tuple,
        #     test_size=0.05, random_state=42)
        # KNN accuracy
        knn = KNeighborsClassifier(n_neighbors=K, weights='distance')
        # Train
        knn.fit(train_X_knn_from_tuple, train_Y_knn_from_tuple)
        # Compute accuracy
        #accuracy_knn = knn.score(fixed_test_X, fixed_test_Y)
        accuracy_knn = knn.score(test_X_knn_from_tuple, test_Y_knn_from_tuple)
        accuracy_knn_list.append(accuracy_knn * 100)

        print("Prediction results (KNN)")
        print(h)
        print(accuracy_knn)

        # SVM accuracy
        svc = SVC(C=15, kernel='rbf', degree=3, gamma=2, probability=True)
        # Train
        svc.fit(train_X_svm_from_tuple, train_Y_svm_from_tuple)
        # Compute accuracy
        #accuracy_svm = svc.score(fixed_test_X, fixed_test_Y)
        accuracy_svm = svc.score(test_X_svm_from_tuple, test_Y_svm_from_tuple)
        accuracy_svm_list.append(accuracy_svm * 100)

        print("Prediction results (SVM)")
        print(h)
        print(accuracy_svm)

        if h == loop_num:
            break
        # Semi-supervised co-training step for KNN and SVM
        # --------------- KNN: predict test samples and compute confidences ---------------
        # Predict the test samples with the current model
        # Get predicted class probabilities
        probility_knn = knn.predict_proba(test_X_knn_from_tuple)
        # KNN confidence list
        confidence_knn_list = []
        for i in range(0, probility_knn.shape[0]):
            probility_knn_temp = probility_knn[i]
            confidence_knn_list.append(
                utilities.get_confidence_knn(probility_knn_temp.copy()))

        # Get predicted labels
        predict_Y_knn = knn.predict(test_X_knn_from_tuple)

        # --------------- SVM: predict test samples and compute confidences ---------------
        # Predict the test samples with the current model
        # Get predicted class probabilities
        probility_svm = svc.predict_proba(test_X_svm_from_tuple)

        # SVM confidence list
        confidence_svm_list = []
        for i in range(0, probility_svm.shape[0]):
            probility_svm_temp = probility_svm[i]
            confidence_svm_list.append(
                utilities.get_confidence_svm(probility_svm_temp.copy()))

        # Get predicted labels
        predict_Y_svm = svc.predict(test_X_svm_from_tuple)

        # Pseudo-label exchange between KNN and SVM
        # --------------------------------------- KNN ---------------------------------------------
        index_svm_label_high_confidence = utilities.get_confidence_svm_index(
            confidence_svm_list.copy(), predict_Y_svm.copy(),
            predict_Y_knn.copy(), temp_num_svm)

        temp_test_X_svm = []
        temp_test_Y_svm = []

        for i in index_svm_label_high_confidence:
            temp_test_X_svm.append(test_X_svm_from_tuple[i])
            temp_test_Y_svm.append(predict_Y_svm[i])

        temp_test_svm_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
            temp_test_Y_svm.copy(), temp_test_X_svm.copy())
        # Add SVM's high-confidence samples to KNN's training set
        train_knn_Y_X_tuple_list.extend(temp_test_svm_Y_X_tuple_list)

        # Build the remaining test set
        index_all_test_svm_Y_X_tuple_list = np.arange(
            0, len(test_svm_Y_X_tuple_list))
        diff_index_test_svm_Y_X_tuple_list = np.setdiff1d(
            index_all_test_svm_Y_X_tuple_list,
            np.array(index_svm_label_high_confidence))
        diff_test_svm_Y_X_tuple_list = []
        for i in diff_index_test_svm_Y_X_tuple_list:
            diff_test_svm_Y_X_tuple_list.append(test_svm_Y_X_tuple_list[i])
        test_svm_Y_X_tuple_list = diff_test_svm_Y_X_tuple_list

        # ---------------------------------------SVM---------------------------------------------
        index_knn_label_high_confidence = utilities.get_confidence_knn_index(
            confidence_knn_list.copy(), predict_Y_svm.copy(),
            predict_Y_knn.copy(), temp_num_knn)

        temp_test_X_knn = []
        temp_test_Y_knn = []

        for i in index_knn_label_high_confidence:
            temp_test_X_knn.append(test_X_knn_from_tuple[i])
            temp_test_Y_knn.append(predict_Y_knn[i])

        temp_test_knn_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
            temp_test_Y_knn.copy(), temp_test_X_knn.copy())
        # Add KNN's high-confidence samples to SVM's training set
        train_svm_Y_X_tuple_list.extend(temp_test_knn_Y_X_tuple_list)
        # Build the remaining test set
        index_all_test_knn_Y_X_tuple_list = np.arange(
            0, len(test_knn_Y_X_tuple_list))
        diff_index_test_knn_Y_X_tuple_list = np.setdiff1d(
            index_all_test_knn_Y_X_tuple_list,
            np.array(index_knn_label_high_confidence))
        diff_test_knn_Y_X_tuple_list = []
        for i in diff_index_test_knn_Y_X_tuple_list:
            diff_test_knn_Y_X_tuple_list.append(test_knn_Y_X_tuple_list[i])
        test_knn_Y_X_tuple_list = diff_test_knn_Y_X_tuple_list

    print("KNN的准确率:")
    print(accuracy_knn_list)
    print("SVM的准确率:")
    print(accuracy_svm_list)
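# Co_KNN_SVM relies on several utilities helpers that are not shown. One
# plausible reading, purely an assumption and not the original implementation:
# confidence is derived from the predicted class probabilities, and the tuple
# helpers pair labels with feature vectors.
def get_Y_X_tuple_list(Y, X):
    # Pair each label with its feature vector.
    return list(zip(Y, X))

def get_Y_and_X_list_from_tuple(tuple_list):
    # Invert get_Y_X_tuple_list.
    Y, X = zip(*tuple_list)
    return list(Y), list(X)

def get_confidence_knn(probs):
    # Margin between the two largest class probabilities (assumed definition).
    top = sorted(probs, reverse=True)
    return top[0] - top[1] if len(top) > 1 else top[0]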
示例#60
0
num_class = 2
loo = LeaveOneOut()
sepscores = []
y_score = np.ones((1, 2)) * 0.5  # placeholder row, removed after the loop
y_class = np.ones((1, 1)) * 0.5  # placeholder row, removed after the loop
for train, test in loo.split(X):
    cv_clf = KNeighborsClassifier(n_neighbors=5)
    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]
    y_sparse = utils.to_categorical(y)
    y_train_sparse = utils.to_categorical(y_train)
    y_test_sparse = utils.to_categorical(y_test)
    hist = cv_clf.fit(X_train, y_train)
    y_predict_score = cv_clf.predict_proba(X_test)
    y_predict_class = utils.categorical_probas_to_classes(y_predict_score)
    y_score = np.vstack((y_score, y_predict_score))
    y_class = np.vstack((y_class, y_predict_class))
    cv_clf = []
y_class = y_class[1:]  # drop the placeholder rows
y_score = y_score[1:]
fpr, tpr, _ = roc_curve(y_sparse[:, 0], y_score[:, 0])
roc_auc = auc(fpr, tpr)
acc, precision, npv, sensitivity, specificity, mcc, f1 = utils.calculate_performace(
    len(y_class), y_class, y)
result = [acc, precision, npv, sensitivity, specificity, mcc, roc_auc]
row = y_score.shape[0]
y_sparse = utils.to_categorical(y)
yscore_sum = pd.DataFrame(data=y_score)
yscore_sum.to_csv('yscore_KNN_1075_knife_no.csv')
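# The utils helpers above mirror Keras-style conversions; plausible stand-ins
# (assumptions, not the original implementations):
import numpy as np

def to_categorical(y, num_classes=None):
    # One-hot encode an integer label vector.
    y = np.asarray(y, dtype=int)
    n = num_classes or (y.max() + 1)
    return np.eye(n)[y]

def categorical_probas_to_classes(p):
    # Argmax over class probabilities.
    return np.argmax(p, axis=1)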