Example #1
    def setUp(self):
        self.train_df, self.test_df = get_train_test_split()
        self.classes = constants["classes"]

        self.KNN = KNN(k=4, classes=self.classes)
        self.KNN.fit(self.train_df)

        self.NaiveBayes = NaiveBayes(n=3, classes=self.classes)
        self.NaiveBayes.fit(self.train_df)

        self.Linear = Linear(classes=self.classes, max_len=40)
        self.Linear.fit(self.train_df, epochs=1)

        self.W2V = W2V(classes=self.classes)
Example #2
import copy  # needed for copy.deepcopy in the ensemble branch below

def run_model(args, X, y, ensembler=False):
    model = None
    if args['model'] == 'logistic':
        logistic = Logistic(X, y, model)
        model = logistic.train_model()
    elif args['model'] == 'knn':
        knn = KNN(X, y, model)
        model = knn.train_model()
    elif args['model'] == 'svm':
        svm = SVM(X, y, model)
        model = svm.train_model()
    elif args['model'] == 'rfa':
        rfa = RandomForest(X, y, model)
        model = rfa.train_model(ensembler)
    elif args['model'] == 'xgb':
        xgb = XGB(X, y, model)
        model = xgb.train_model(ensembler)
    elif args['model'] == 'lgbm':
        lgbm = LightGBM(X, y, model)
        model = lgbm.train_model(ensembler)
    elif args['model'] == 'catboost':
        catboost = CatBoost(X, y, model)
        model = catboost.train_model(ensembler)
    elif len(args['models']) > 1:
        models = [('', None)] * len(args['models'])
        for i in range(len(args['models'])):
            model_name = args['models'][i]
            temp_args = copy.deepcopy(args)
            temp_args['model'] = model_name 
            models[i] = (model_name, run_model(temp_args, X, y, True))

        ensembler = Ensembler(X, y, model, args['ensembler_type'])
        model = ensembler.train_model(models)
        return model
    else:
        print('\nInvalid model name :-|\n')
        exit()
    return model
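A hedged usage sketch of run_model: the args keys mirror those referenced above, but the concrete values ('ensemble', the model list, 'voting') are assumptions, and X, y are presumed to be an already-prepared feature matrix and label vector.

# Hypothetical invocation; the values are illustrative, not from the source.
args = {
    'model': 'ensemble',               # any value not caught by the elif chain
    'models': ['rfa', 'xgb', 'lgbm'],  # base learners to combine
    'ensembler_type': 'voting',        # assumed; the source does not list the options
}
trained = run_model(args, X, y)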
Example #3
File: knn.py Project: likeand/ml
def test_knn():
    import numpy as np
    from models.knn import KNN

    x, y = np.random.randn(3, 200, 2), np.zeros([3, 200])
    x[0] += np.array([2, 2])  # shift right by 2, up by 2
    x[1] += np.array([2, -2])  # shift right by 2, down by 2
    y[1] = 1
    y[2] = 2
    plot_scatter(x, 'Real')

    x = x.reshape(-1, 2)
    y = y.flatten()

    # train
    knn = KNN(3)
    knn.fit(x, y)

    pred = knn.predict(x)
    plot_scatter([x[pred == i] for i in [0, 1, 2]], 'Pred')

    # print accuracy
    acc = np.sum(pred == y) / len(pred)
    print(f'Acc = {100 * acc:.2f}%')
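The snippet assumes a plot_scatter helper from the project. A minimal sketch compatible with both call sites above (a (k, n, 2) array or a list of (m, 2) arrays); the implementation details are an assumption:

import matplotlib.pyplot as plt

def plot_scatter(groups, title):
    # Sketch of the assumed helper: one scatter per group of 2-D points.
    for g in groups:
        plt.scatter(g[:, 0], g[:, 1], s=8)
    plt.title(title)
    plt.show()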
Example #4
    def __init__(
        self,
        root,
        train=True,
        transform=None,
        augment_transform=None,
        target_transform=None,
        download=False,
    ):

        super(AugmentedMNIST, self).__init__(root, train, transform,
                                             target_transform, download)

        self.augment_transform = augment_transform
        self.knn = KNN()
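The class stores augment_transform, but its use is not shown here. A plausible (assumed, not confirmed) __getitem__ would return the augmented view alongside the standard one:

    def __getitem__(self, index):
        # Assumed usage; the real class may apply augment_transform to the
        # raw PIL image rather than after the base transform.
        img, target = super(AugmentedMNIST, self).__getitem__(index)
        if self.augment_transform is not None:
            return img, self.augment_transform(img), target
        return img, target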
Example #5
    def walk_forward_cv(self):
        """
        Runs walk-forward cross-validation, and saves cross-validation
        metrics.
        """
        for output_name in self.output_names:
            print('\t\t\t|--Prediction type: {}'.format(output_name))
            optimal_params_by_model = {}
            cv_metadata_by_model = {}
            cv_predictions_by_model = {}

            print('\t\t\t\t|--KNN Model')
            knn = KNN()
            knn.cv_params = self.cv_params
            knn.test_name = self.test_name
            knn.full_df = self.full_df
            knn.feature_names = self.feature_names
            knn.output_name = output_name
            knn.run_knn_cv()
            optimal_params_by_model['KNN'] = knn.knn_optimal_params
            cv_predictions_by_model['KNN'] = knn.knn_cv_predictions

            print('\t\t\t\t|--Elastic Net Model')
            elastic_net = ElasticNet()
            elastic_net.cv_params = self.cv_params
            elastic_net.test_name = self.test_name
            elastic_net.full_df = self.full_df
            elastic_net.feature_names = self.feature_names
            elastic_net.feature_dict = self.feature_dict
            elastic_net.output_name = output_name
            elastic_net.run_elastic_net_cv()
            optimal_params_by_model[
                'Elastic_Net'] = elastic_net.elastic_net_optimal_params
            cv_metadata_by_model['Elastic_Net'] = elastic_net.metadata
            cv_predictions_by_model[
                'Elastic_Net'] = elastic_net.elastic_net_cv_predictions

            print('\t\t\t\t|--Naive Bayes Model')
            naive_bayes = NaiveBayes()
            naive_bayes.cv_params = self.cv_params
            naive_bayes.test_name = self.test_name
            naive_bayes.full_df = self.full_df
            naive_bayes.feature_names = self.feature_names
            naive_bayes.feature_dict = self.feature_dict
            naive_bayes.output_name = output_name
            naive_bayes.run_bayes_cv()
            cv_predictions_by_model[
                'Naive_Bayes'] = naive_bayes.bayes_cv_predictions
            optimal_params_by_model[
                'Naive_Bayes'] = naive_bayes.bayes_optimal_params

            print('\t\t\t\t|--SVM Model')
            svm = SupportVectorMachine()
            svm.cv_params = self.cv_params
            svm.test_name = self.test_name
            svm.full_df = self.full_df
            svm.feature_names = self.feature_names
            svm.output_name = output_name
            svm.run_svm_cv()
            optimal_params_by_model['SVM'] = svm.svm_optimal_params
            cv_metadata_by_model['SVM'] = svm.metadata
            cv_predictions_by_model['SVM'] = svm.svm_cv_predictions

            print('\t\t\t\t|--Gaussian Process Model')
            gauss = GaussianProcess()
            gauss.cv_params = self.cv_params
            gauss.test_name = self.test_name
            gauss.full_df = self.full_df
            gauss.feature_names = self.feature_names
            gauss.feature_dict = self.feature_dict
            gauss.output_name = output_name
            gauss.run_gauss_cv()
            cv_predictions_by_model[
                'Gaussian_Process'] = gauss.gauss_cv_predictions
            cv_metadata_by_model['Gaussian_Process'] = gauss.metadata
            optimal_params_by_model[
                'Gaussian_Process'] = gauss.gauss_optimal_params

            print('\t\t\t\t|--XGBoost Model')
            xgboost = XGBoost()
            xgboost.cv_params = self.cv_params
            xgboost.test_name = self.test_name
            xgboost.full_df = self.full_df
            xgboost.feature_names = self.feature_names
            xgboost.feature_dict = self.feature_dict
            xgboost.output_name = output_name
            xgboost.run_xgboost_cv()
            optimal_params_by_model['XGBoost'] = xgboost.xgboost_optimal_params
            cv_metadata_by_model['XGBoost'] = xgboost.metadata
            cv_predictions_by_model['XGBoost'] = xgboost.xgboost_cv_predictions

            self.optimal_params_by_output[
                output_name] = optimal_params_by_model
            self.cv_metadata_by_output[output_name] = cv_metadata_by_model
            self.cv_predictions_by_output[
                output_name] = cv_predictions_by_model
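The six per-model blocks above differ only in the attributes they set and the cv method they call. A table-driven sketch of the setup-and-run part (the tuples are read off the code above; the refactor itself is an assumption, not the author's code):

model_specs = [
    ('KNN', KNN, 'run_knn_cv', False),
    ('Elastic_Net', ElasticNet, 'run_elastic_net_cv', True),
    ('Naive_Bayes', NaiveBayes, 'run_bayes_cv', True),
    ('SVM', SupportVectorMachine, 'run_svm_cv', False),
    ('Gaussian_Process', GaussianProcess, 'run_gauss_cv', True),
    ('XGBoost', XGBoost, 'run_xgboost_cv', True),
]
for name, cls, cv_method, needs_feature_dict in model_specs:
    m = cls()
    m.cv_params = self.cv_params
    m.test_name = self.test_name
    m.full_df = self.full_df
    m.feature_names = self.feature_names
    if needs_feature_dict:
        m.feature_dict = self.feature_dict
    m.output_name = output_name
    getattr(m, cv_method)()
# The result attributes (e.g. knn_optimal_params) still vary per model,
# so collecting outputs would need additional columns in the table.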
Example #6
    def walk_forward_prediction(self):
        """
        Runs walk-forward prediction, and saves prediction metrics.
        """
        for output_name in self.output_names:
            print('\t\t\t|--Prediction type: {}'.format(output_name))
            prediction_errors_by_model = {}
            predictions_by_model = {}
            pred_metadata_by_model = {}

            print('\t\t\t\t|--KNN Model')
            knn = KNN()
            knn.pred_indices = self.pred_indices
            knn.full_df = self.full_df
            knn.feature_names = self.feature_names
            knn.output_name = output_name
            knn.knn_optimal_params = self.optimal_params_by_output[
                output_name]['KNN']
            knn.run_knn_prediction()
            prediction_errors_by_model['KNN'] = knn.knn_pred_error
            predictions_by_model['KNN'] = knn.knn_predictions

            print('\t\t\t\t|--Elastic Net Model')
            elastic_net = ElasticNet()
            elastic_net.pred_indices = self.pred_indices
            elastic_net.full_df = self.full_df
            elastic_net.feature_names = self.feature_names
            elastic_net.feature_dict = self.feature_dict
            elastic_net.output_name = output_name
            elastic_net.elastic_net_optimal_params = self.optimal_params_by_output[
                output_name]['Elastic_Net']
            elastic_net.run_elastic_net_prediction()
            prediction_errors_by_model[
                'Elastic_Net'] = elastic_net.elastic_net_pred_error
            predictions_by_model[
                'Elastic_Net'] = elastic_net.elastic_net_predictions
            pred_metadata_by_model['Elastic_Net'] = elastic_net.metadata

            print('\t\t\t\t|--Naive Bayes Model')
            naive_bayes = NaiveBayes()
            naive_bayes.pred_indices = self.pred_indices
            naive_bayes.full_df = self.full_df
            naive_bayes.feature_names = self.feature_names
            naive_bayes.output_name = output_name
            naive_bayes.run_bayes_prediction()
            prediction_errors_by_model[
                'Naive_Bayes'] = naive_bayes.bayes_pred_error
            predictions_by_model['Naive_Bayes'] = naive_bayes.bayes_predictions

            print('\t\t\t\t|--SVM Model')
            svm = SupportVectorMachine()
            svm.pred_indices = self.pred_indices
            svm.full_df = self.full_df
            svm.feature_names = self.feature_names
            svm.output_name = output_name
            svm.svm_optimal_params = self.optimal_params_by_output[
                output_name]['SVM']
            svm.run_svm_prediction()
            prediction_errors_by_model['SVM'] = svm.svm_pred_error
            predictions_by_model['SVM'] = svm.svm_predictions
            pred_metadata_by_model['SVM'] = svm.metadata

            print('\t\t\t\t|--Gaussian Process Model')
            gauss = GaussianProcess()
            gauss.pred_indices = self.pred_indices
            gauss.full_df = self.full_df
            gauss.feature_names = self.feature_names
            gauss.output_name = output_name
            gauss.run_gauss_prediction()
            prediction_errors_by_model[
                'Gaussian_Process'] = gauss.gauss_pred_error
            predictions_by_model['Gaussian_Process'] = gauss.gauss_predictions
            pred_metadata_by_model['Gaussian_Process'] = gauss.metadata

            print('\t\t\t\t|--XGBoost Model')
            xgboost = XGBoost()
            xgboost.pred_indices = self.pred_indices
            xgboost.full_df = self.full_df
            xgboost.feature_names = self.feature_names
            xgboost.feature_dict = self.feature_dict
            xgboost.output_name = output_name
            xgboost.xgboost_optimal_params = self.optimal_params_by_output[
                output_name]['XGBoost']
            xgboost.run_xgboost_prediction()
            prediction_errors_by_model['XGBoost'] = xgboost.xgboost_pred_error
            predictions_by_model['XGBoost'] = xgboost.xgboost_predictions
            pred_metadata_by_model['XGBoost'] = xgboost.metadata

            print('\t\t\t\t|--Weighted Average Model')
            weighted_average = WeightedAverage()
            weighted_average.model_names = self.model_names
            weighted_average.cv_results = self.optimal_params_by_output[
                output_name]
            weighted_average.predictions_by_model = predictions_by_model
            weighted_average.run_weighted_average_prediction()
            predictions_by_model[
                'Weighted_Average'] = weighted_average.weighted_average_predictions
            pred_metadata_by_model[
                'Weighted_Average'] = weighted_average.metadata

            self.prediction_errors_by_output[
                output_name] = prediction_errors_by_model
            self.predictions_by_output[output_name] = predictions_by_model
            self.pred_metadata_by_output[output_name] = pred_metadata_by_model
Example #7
from models.utils import Dataset
from models.knn import KNN, SimilarityMetrics

if __name__ == '__main__':
    dataset = Dataset.from_csv_file('data/ratings.csv')

    trainset, testset = dataset.split_into_train_and_test_sets(0.7)

    knn = KNN()
    knn.fit(trainset)
    print(knn.predict(1, 1))
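predict(1, 1) presumably returns user 1's estimated rating of item 1. A hedged evaluation loop, assuming testset iterates over (user_id, item_id, rating) triples (a structure the source does not show):

squared_errors = []
for user_id, item_id, rating in testset:
    pred = knn.predict(user_id, item_id)
    squared_errors.append((pred - rating) ** 2)
rmse = (sum(squared_errors) / len(squared_errors)) ** 0.5
print(f'RMSE: {rmse:.4f}')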
Example #8
class ModelTests(unittest.TestCase):
    def setUp(self):
        self.train_df, self.test_df = get_train_test_split()
        self.classes = constants["classes"]

        self.KNN = KNN(k=4, classes=self.classes)
        self.KNN.fit(self.train_df)

        self.NaiveBayes = NaiveBayes(n=3, classes=self.classes)
        self.NaiveBayes.fit(self.train_df)

        self.Linear = Linear(classes=self.classes, max_len=40)
        self.Linear.fit(self.train_df, epochs=1)

        self.W2V = W2V(classes=self.classes)

    def test_knn_io(self):
        """
        Test that the KNN model takes the right inputs and outputs a dictionary covering all possible classes
        """
        pred, output = self.KNN("BREST")
        self.assertIsInstance(output, dict)
        self.assertIn(pred, self.classes)
        for label in self.classes:
            self.assertIn(label, output.keys())

    def test_knn_output_probabilities(self):
        """
        Test that KNN model returns probabilities for each possible class
        """
        _, output = self.KNN("RADE DE BREST")
        # sums up to one
        self.assertLess(abs(sum(output.values()) - 1), 1e-3)
        # all values between 0 and 1
        for value in output.values():
            self.assertGreaterEqual(value, 0)
            self.assertLessEqual(value, 1)

    def test_knn_case_insensitive(self):
        pred_upper, output_upper = self.KNN("BREST")
        pred_lower, output_lower = self.KNN("brest")

        self.assertEqual(pred_upper, pred_lower)
        self.assertListEqual(list(output_upper.items()),
                             list(output_lower.items()))

    def test_naive_bayes_io(self):
        """
        Test that the Naive Bayes model takes the right inputs and outputs a dictionary covering all possible classes
        """
        pred, output = self.NaiveBayes("BREST")
        self.assertIn(pred, self.classes)
        self.assertIsInstance(output, dict)

    # def test_naive_bayes_output_probabilities(self):
    #     _, output = self.NaiveBayes("BREST")
    #     self.assertLess(abs(sum(output.values()) - 1), 1e-3)
    #     for label in self.classes:
    #         self.assertIn(label, output.keys())

    def test_linear_io(self):
        """
        Test that the Linear model takes the right inputs and outputs a dictionary covering all possible classes
        """
        pred, output = self.Linear("BREST")
        self.assertIn(pred, self.classes)
        self.assertIsInstance(output, dict)

    def test_linear_output_probabilities(self):
        _, output = self.Linear("BREST")
        self.assertLess(abs(sum(output.values()) - 1), 1e-3)
        for label in self.classes:
            self.assertIn(label, output.keys())

    def test_w2v_io(self):
        """
        Test that the Word2Vec model takes the right inputs and outputs a dictionary covering all possible classes
        """
        pred, output = self.W2V("BREST")
        self.assertIn(pred, self.classes)
        self.assertIsInstance(output, dict)

    def test_w2v_output_probabilities(self):
        _, output = self.W2V("BREST")
        self.assertLess(abs(sum(output.values()) - 1), 1e-3)
        for label in self.classes:
            self.assertIn(label, output.keys())
Example #9
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS
from models.knn import KNN
from src.constants import constants
from src.data import get_train_test_split, regexp_processing

model = KNN(classes=constants['classes'], k=3)
train, _ = get_train_test_split('../10_ports.csv', split=1)
model.fit(train)

app = Flask(__name__)
CORS(app)


@app.route('/predict', methods=['POST'])
def predict():
    if request.method == "POST":
        destination = request.data.decode('utf-8')

        if destination.upper() in model.destinations.keys():
            return model.destinations[destination.upper()]

        pred = model(destination)
        return pred[0]


if __name__ == '__main__':
    # threaded=True lets the development server handle concurrent requests
    app.run(threaded=True, port=5000)
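A quick client-side check of the endpoint, using requests (the destination string is just an example):

import requests

# POST a raw destination string; the server replies with the predicted port.
resp = requests.post('http://localhost:5000/predict', data='BREST')
print(resp.text)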
Example #10
def main(args):

    # Import business dataset
    #business_df = get_POI_df(args.path+'yelp_dataset_business.json')
    
    #Import Cleaned_Toronto_Business
    business_df = get_POI_df_toronto(args.path+'Cleaned_Toronto_Business.json')
    
    # Filter business dataset by city
    # business_df = filter_by_city(business_df, city=args.city)  # skipped: dataset already contains only Toronto restaurants

    # Filter business dataset by restaurants
    # business_df = select_restaurants(business_df)  # skipped: dataset is already cleaned

    # Import review dataset
    #review_df = get_review_df(args.path+'yelp_dataset_review.json')

    #Import Toronto review dataset
    review_df = get_review_df_toronto(args.path+'Cleaned_Toronto_Reviews.json')
    
    # Binarize review stars, adding a new column called review_stars_binary
    review_df = binarized_star(review_df)

    print('review df columns',  review_df.columns)
    print('business_df columns', business_df.columns)
    
    # Merge business df and review df
    rating_df = merge_df(review_df, business_df, on_column='business_id', how_merge='inner', columns=["user_id", "business_id", "date", "review_stars", "review_text", "review_stars_binary", "categories",  "latitude", "longitude"], sort_column='date')
    
    num_cols = rating_df.business_id.nunique()
    num_rows = rating_df.user_id.nunique()
    print('unique businesses:', num_cols, 'unique users', num_rows)
    print('unique user id:', rating_df.user_id.nunique())
    
    # Assign numbers to user_id -> user_num_id
    rating_df['user_num_id'] = rating_df.user_id.astype('category').\
    cat.rename_categories(range(0, rating_df.user_id.nunique()))
    rating_df['user_num_id'] = rating_df['user_num_id'].astype('int')
    
    #Encode business_num_id
    rating_df['business_num_id'] = rating_df.business_id.astype('category').\
       cat.rename_categories(range(0, rating_df.business_id.nunique()))
    rating_df['business_num_id'] = rating_df['business_num_id'].astype('int')
    rating_df = rating_df.reset_index()
    # Get all restaurants latitude and longitude df
    POI_lat_long_df = return_POI_lat_long_df(rating_df)
    
    # Export all restaurants latitude and longitude df
    save_POI_lat_long_df(args.path+"POI_lat_long_df", POI_lat_long_df)
    
    # Get pandas user_id and business_id dict
    user_id_dict = pandas_to_dict(rating_df, "user_id", "user_num_id")

    # Export dict to disk as json
    save_user_id_dict_pickle(args.path, user_id_dict, 'user_id_dict')
    
    # Split into train and test dataset
    train_df, test_df = train_test_split(rating_df)

    # Form train set and test set
    #train_set generate UI matrix 
    '''
        Computing binary rating UI for train and test
        Computing raw rating UI for train and test 
        Combine both for entire dataset 
    '''
    
    #Getting both thresholded and raw user item review (UI) matrix 
    train_set_binary = df_to_sparse(train_df, num_rows, num_cols)
    test_set_binary = df_to_sparse(test_df, num_rows, num_cols)
    train_set_rawRating = df_to_sparse(train_df, num_rows, num_cols, binarySetting=False)
    test_set_rawRating = df_to_sparse(test_df, num_rows, num_cols, binarySetting=False)
    entire_set_binary = train_set_binary + test_set_binary
    entire_set_raw = train_set_rawRating + test_set_rawRating
    
    # Saving the binary, raw-rating, and entire UI matrices
    save_npz_data(args.path+"toronto_train_set_binary.npz", train_set_binary)
    save_npz_data(args.path+"toronto_test_set_binary.npz", test_set_binary)
    save_npz_data(args.path+"toronto_train_set_rawRating.npz", train_set_rawRating)
    save_npz_data(args.path+"toronto_test_set_rawRating.npz", train_set_rawRating)
    save_npz_data(args.path+"toronto_entire_set_binary.npz", entire_set_binary)
    save_npz_data(args.path+"toronto_entire_set_rawRating.npz", entire_set_raw)
    
    
    #To compute item similarity using IK 
    IK_matrix_train = get_I_K(train_df)
    IK_matrix_entire = get_I_K(rating_df)
    
    # Get item similarity
    item_IK_model_train = KNN()
    item_IK_model_train.fit(X=IK_matrix_train.T)
    sparse_item_similarity_train = item_IK_model_train.get_weights()
    save_npz_data(args.path+"item_similarity_train.npz", sparse_item_similarity_train)
    
    item_IK_model_entire = KNN()
    item_IK_model_entire.fit(X=IK_matrix_entire.T)
    sparse_item_similarity_entire = item_IK_model_entire.get_weights()
    save_npz_data(args.path+"item_similarity_entire.npz", sparse_item_similarity_entire)
    
    
    # Get user similarity for train set 
    user_model_trainBinary = KNN()
    user_model_trainBinary.fit(X=train_set_binary)
    
    sparse_user_similarity_train = user_model_trainBinary.get_weights()
    save_npz_data(args.path+"user_similarity_trainSet.npz", sparse_user_similarity_train)
    
    user_model_entireBinary = KNN()
    user_model_entireBinary.fit(X=entire_set_binary)
    
    sparse_user_similarity_entire = user_model_entireBinary.get_weights()
    save_npz_data(args.path+"user_similarity_entireSet.npz", sparse_user_similarity_entire)
Example #11
def classifier_train(model, device, train_dataset, optimizer, criterion, epoch, batch_size=100):
    total_loss = 0
    
    model.to(device)
    knn = KNN().to(device)

    for b in range(1):
        batch_dist = torch.zeros(10).to(device)
        for item_idx, (img, _) in enumerate(random.sample(list(train_dataset), batch_size)):
            # send to device
            img = img.to(device).unsqueeze(0)

            # forward pass
            optimizer.zero_grad()
            img_embedding, _, img_class = model(img)
            print(img_class)

            # get samples
            samples = random.sample(list(train_dataset), 1000)

            for s in range(len(samples)):
                s_img, _ = samples[s]
                embedding, _, s_class = model(s_img.to(device).unsqueeze(0))
                samples[s] = (embedding, s_class)

            # calculate loss
            # initial loss from model forward to one-hot
            desired_one_hot = torch.zeros(img_class.shape, dtype=torch.float).to(device)
            desired_one_hot[0][torch.argmax(img_class, dim=1).item()] = 1
            #desired_one_hot= torch.tensor([torch.argmax(img_class, dim=1).item()]).to(device)
            #invert_one_hot = torch.ones(img_class.shape, dtype=torch.float).to(device)
            #invert_one_hot[0][torch.argmax(img_class, dim=1).item()] = 0
            
            #loss = torch.nn.functional.mse_loss(img_class, desired_one_hot)
            #print(loss.item())
            loss = 0
            # generate stochastic nearest KNN for closest encodings
            neighbors = knn(img_embedding, samples, direction=1)
            n_loss = 0
            for (n_embed, n_class) in neighbors:
                #difference_output = torch.sub(img_embedding, n_embed)
                #loss += torch.nn.functional.mse_loss(difference_output, torch.zeros(*difference_output.shape).to(device))
                #loss += torch.nn.functional.binary_cross_entropy_with_logits(img_class, n_class.detach())
                #n_loss += torch.nn.functional.cross_entropy(n_class, desired_one_hot)
                n_loss += torch.nn.functional.binary_cross_entropy(n_class, desired_one_hot)

            print(n_loss.item())
            
            neighbors = knn(img_embedding, samples, direction=-1)
            f_loss = 0
            
            def random_nn():
                exclude=[torch.argmax(img_class, dim=1).item()]
                randInt = random.randint(0,9)
                return random_nn() if randInt in exclude else randInt 
            
            out = torch.zeros(img_class.shape).to(device)
            out[0][random_nn()] = 1

            for (n_embed, n_class) in neighbors:
                #difference_output = torch.sub(img_embedding, n_embed)
                #loss += torch.nn.functional.mse_loss(difference_output, torch.zeros(*difference_output.shape).to(device))
                f_loss -= 2 * torch.nn.functional.binary_cross_entropy(n_class, desired_one_hot)
                #f_loss += torch.nn.functional.cross_entropy(n_class, out)
                f_loss += torch.nn.functional.binary_cross_entropy(n_class, out)
                #f_loss += 1.5 * torch.nn.functional.mse_loss(n_class, 2 * invert_one_hot)

            print(f_loss.item())

            loss += f_loss + n_loss

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # batch entropy
            batch_dist[torch.argmax(img_class, dim=1).item()] += 1

            #if item_idx % 100 == 0:
            print(f"Epoch: {epoch} - Episode: {(b * 10) + item_idx} - Loss: {loss.item()}")

        optimizer.zero_grad()
        print(batch_dist)
        loss = -100 * batch_size * torch.nn.functional.kl_div(batch_dist/batch_size, torch.ones(batch_dist.shape).to(device)/batch_size)
        print(loss.item())
        loss.requires_grad = True
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss/len(train_dataset)
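Note that torch.nn.functional.kl_div expects its first argument in log-space, and batch_dist as built above (incremented via .item()) is detached from the graph, which is why the code resorts to the loss.requires_grad = True hack. A corrected form of the batch-entropy term for the log-space issue (an assumed fix, not from the source):

import torch.nn.functional as F

# kl_div(input, target) treats input as log-probabilities.
p_log = torch.log_softmax(batch_dist, dim=0)               # predicted distribution, log-space
q = torch.full_like(batch_dist, 1.0 / batch_dist.numel())  # uniform target
loss = -100 * batch_size * F.kl_div(p_log, q, reduction='sum')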
Example #12
# 2. Pretrain Model Encoder on Augmentations
model = MNIST_Classifier().to("cuda")
optimizer = torch.optim.RMSprop(list(model.encoder.parameters()) +
                                list(model.decoder.parameters()),
                                lr=1e-3)
criterion = torch.nn.MSELoss()

model.load_state_dict(torch.load("results/encoder_pretraining_100.pth"))
model.eval()

# 3. Generate KNN for random sample
x, _ = random.sample(list(test_def), 1)[0]
x_embed, _, s_class = model(x.to("cuda").unsqueeze(0))

knn = KNN()
samples = random.sample(list(test_def), 10000)

with torch.no_grad():
    for s in range(len(samples)):
        s_img, _ = samples[s]
        embedding, _, s_class = model(s_img.to("cuda").unsqueeze(0))
        samples[s] = (embedding, s_class, s_img)

# generate stochastic KNN for encodings
neighbors = knn(x_embed, samples)

fig, ax = plt.subplots(1, 6, constrained_layout=True)
np.vectorize(lambda ax: ax.axis('off'))(ax)

ax[0].imshow(x.squeeze(0))
Example #13
from sklearn.datasets import load_boston

from validation import classification as val
from models.decision_tree import DecisionTree
from models.knn import KNN
from models.random_forest import RandomForest
from preprocessing.features_enginering import normalize_dataset
from preprocessing.split import train_test_split
from validation.regression import sqrderr

# %%
X, y = load_boston(return_X_y=True)
normalize_dataset(X)
x_train, y_train, x_test, y_test = train_test_split(X, y, .8)

kclass = KNN(5, mode="regression")
kclass.fit(x_train, y_train)

res = kclass.predict(x_test)
knn_err = sqrderr(res, y_test)

kclass_w = KNN(5, mode="regression", method="weighted")
kclass_w.fit(x_train, y_train)

res = kclass_w.predict(x_test)
knn_w_err = sqrderr(res, y_test)

# %%

forest_w = RandomForest(mode="regression",
                        errfun="mse",
Example #14
def main():
    # Read file names
    parser = argparse.ArgumentParser()
    parser.add_argument("xTrain",
                        help="filename for features of the training data")
    parser.add_argument(
        "yTrain", help="filename for labels associated with training data")
    parser.add_argument("xTest", help="filename for features of the test data")

    args = parser.parse_args()

    # load the train and test data (assumes numpy-compatible CSV files)
    xTrain = pd.read_csv(args.xTrain)
    yTrain = pd.read_csv(args.yTrain)
    xTest = pd.read_csv(args.xTest)
    colNames = list(xTrain.keys())

    # visualize(xTrain, yTrain, colNames)

    models = {
        'boost': Boost(5, .2, 5),
        'dt': DT(25, 1, 'entropy'),
        'knn': KNN(1),
        'nb': NB(),
        'rf': RF(51, 25, 'gini', 25, 1),
        'svm': SVM(.1, 'poly', 3, .01)
    }

    X = xTrain.to_numpy()
    Y = yTrain.to_numpy()

    basePreds = []
    for k in models:
        models[k].train(X, Y)
        basePreds.append(list(models[k].predict(xTrain.to_numpy())))
    basePreds = np.array(basePreds)
    basePreds = np.transpose(basePreds)

    metalearner = Boost(5, .2, 5)

    nfolds = 3
    kf = KFold(nfolds)
    trIndices = []
    tsIndices = []
    for tr, ts in kf.split(X):
        trIndices.append(tr)
        tsIndices.append(ts)

    total = 0

    for i in range(nfolds):
        metalearner.train(X[trIndices[i], :], Y[trIndices[i], :])
        acc = metalearner.predAcc(X[tsIndices[i], :], Y[tsIndices[i], :])
        total += acc / nfolds

    print("ACC: ", total)

    metalearner.train(X, Y)
    testPreds = metalearner.predict(xTest.to_numpy())
    finalPreds = np.array([list(range(len(xTest))), testPreds]).transpose()
    finalPreds = pd.DataFrame(finalPreds, columns=['Id', 'Cover_Type'])
    finalPreds.to_csv('finalPredictions.csv', index=False)
    # print(finalPreds)

    freq = Counter(list(testPreds))
    labelMap = {
        1: 'Spruce/Fir',
        2: 'Lodgepole Pine',
        3: 'Ponderosa Pine',
        4: 'Cottonwood/Willow',
        5: 'Aspen',
        6: 'Douglas-fir',
        7: 'Krummholz'
    }

    label = [labelMap[k] for k in freq.keys()]
    no_trees = [freq[k] for k in freq.keys()]

    index = np.arange(len(label))
    plt.bar(index, no_trees)
    plt.xlabel('Cover type', fontsize=12)
    plt.ylabel('Number of samples', fontsize=12)
    plt.xticks(index, label, fontsize=12, rotation=30)
    plt.title('Class Frequency in prediction')
    plt.show()

    return
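As written, basePreds is computed but never used: the metalearner cross-validates and trains on the raw features, so this is boosting on X rather than stacking. A sketch of the stacked variant (assumed intent, not the author's code) would feed the base models' predictions to the metalearner:

# Assumed stacking variant: the metalearner consumes base-model predictions.
metalearner = Boost(5, .2, 5)
metalearner.train(basePreds, Y)

testBase = np.transpose(np.array(
    [list(models[k].predict(xTest.to_numpy())) for k in models]))
testPreds = metalearner.predict(testBase)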