Пример #1
0
    def run(self):
        """Train an SVM classifier on the extracted features and persist it.

        Features are extracted from ``self.data``, a boolean label vector is
        derived from the category column (the string ``'0'`` becomes ``False``,
        every other value stays ``True``), an ``sklearn.svm.SVC`` is fitted on
        them and the fitted model is dumped with joblib.

        :return: None

        """

        # turn the raw data frame into the feature matrix used for training
        feature_matrix = feature_extractions.feature_extraction(
            self.data,
            input_parameters=self.image_col_name,
            method=self.feature_method,
            extraction_options=self.feature_options,
            image_process_options=self.image_processing_options_for_feature)

        # labels start out all True; rows whose category is '0' are flipped
        labels = pandas.DataFrame(numpy.repeat([True], len(feature_matrix)))
        negative_rows = self.data[self.category_col_name].values == '0'
        labels[negative_rows] = False

        # build the (still unfitted) support vector machine
        print("Initializing classifier")
        svm_options = dict(kernel=self.kernel,
                           gamma=self.gamma,
                           C=self.C,
                           verbose=True,
                           probability=True)
        model = sklearn.svm.SVC(**svm_options)

        # fit on the flattened 1-D label array
        print("Training classifier")
        model.fit(feature_matrix, labels.values.ravel())
        print()

        # persist the fitted model so it can be loaded later
        # NOTE(review): classifier_output is not defined inside this method;
        # presumably a module-level name -- confirm it exists in the file.
        destination = classifier_output
        print("Saving classifier to file, " + str(destination))
        sklearn.externals.joblib.dump(model, destination)
Пример #2
0
    def run(self):
        """Fit a multi-layer perceptron on the training data and pickle it.

        The steps are: optional ROI preprocessing of the raw images, feature
        extraction, construction of the boolean label vector (category
        ``'0'`` becomes ``False``, everything else stays ``True``), training
        of an ``sklearn.neural_network.MLPClassifier`` and a joblib dump to
        ``self.classifier_pkl``.

        :return: None

        """

        # without preprocessing options the data frame is used unchanged
        if not self.image_processing_options:
            training_frame = self.data
        else:
            print("Preprocessing images")
            raw_images = kaggle_reader.load_all_raw_images(self.raw_image_path)
            training_frame = image_processing.process_roi_extractions(
                raw_images, self.data, self.image_processing_options)

        # build the feature matrix
        print("Extracting features")
        feature_matrix = feature_extractions.feature_extraction(
            training_frame,
            input_parameters=self.image_col_name,
            method=self.feature_method,
            extraction_options=self.feature_options)

        # labels start out all True; rows whose category is '0' are flipped
        labels = pandas.DataFrame(numpy.repeat([True], len(feature_matrix)))
        is_negative = self.data[self.category_col_name].values == '0'
        labels[is_negative] = False

        # configure the network and echo its settings
        print("Initializing classifier")
        network = sklearn.neural_network.MLPClassifier(
            solver=self.solver, alpha=self.alpha, tol=self.tol,
            max_iter=self.max_iter,
            hidden_layer_sizes=self.hidden_layer_sizes,
            random_state=self.random_state, verbose=True)
        print(network)
        print()

        # fit on the flattened 1-D label array
        print("Training classifier")
        network.fit(feature_matrix, labels.values.ravel())
        print()

        # persist the fitted model so it can be loaded later
        pickle_path = self.classifier_pkl
        print("Saving classifier: " + str(pickle_path))
        sklearn.externals.joblib.dump(network, pickle_path)
Пример #3
0
    def run(self):
        """Fit a support vector machine on the training data and pickle it.

        Images are optionally preprocessed, features are extracted, a boolean
        label vector is built from the category column (``'0'`` becomes
        ``False``, anything else stays ``True``) and an ``sklearn.svm.SVC``
        is trained and dumped with joblib to ``self.classifier_pkl``.

        :return: None

        """

        # without preprocessing options the data frame is used unchanged
        if not self.image_processing_options:
            prepared = self.data
        else:
            print('Preprocessing images')
            raw_images = kaggle_reader.load_all_raw_images(self.raw_image_path)
            prepared = image_processing.process_roi_extractions(
                raw_images, self.data, self.image_processing_options)

        # build the feature matrix
        print('Extracting features')
        feature_matrix = feature_extractions.feature_extraction(
            prepared,
            input_parameters=self.image_col_name,
            method=self.feature_method,
            extraction_options=self.feature_options)

        # labels start out all True; rows whose category is '0' are flipped
        labels = pandas.DataFrame(numpy.repeat([True], len(feature_matrix)))
        zero_category = self.data[self.category_col_name].values == '0'
        labels[zero_category] = False

        # build the (still unfitted) support vector machine
        print("Initializing classifier")
        svm = sklearn.svm.SVC(
            kernel=self.kernel, gamma=self.gamma, C=self.C, verbose=True)

        # fit on the flattened 1-D label array
        print("Training classifier")
        flat_labels = labels.values.ravel()
        svm.fit(feature_matrix, flat_labels)
        print()

        # persist the fitted model so it can be loaded later
        pickle_path = self.classifier_pkl
        print("Saving classifier: " + str(pickle_path))
        sklearn.externals.joblib.dump(svm, pickle_path)
Пример #4
0
    def run(self):
        """Evaluate a previously trained classifier and report the results.

        The pickled classifier at ``self.classifier_pkl`` is loaded (the
        method prints a hint and returns early when loading raises
        ``OSError``), the test images are optionally preprocessed, features
        are extracted and predictions are compared with the expected labels
        (category ``'0'`` becomes ``False``, everything else stays ``True``).
        A summary is printed and, when ``self.prediction_output_dir`` is not
        ``None``, correct and incorrect predictions are saved as images in
        the ``correct`` and ``incorrect`` sub-directories.

        :return: None

        """

        # load the pre-trained model; give up with a hint if it is unreadable
        print("Loading external classifier: " + self.classifier_pkl)
        try:
            model = sklearn.externals.joblib.load(self.classifier_pkl)
        except OSError:
            print('Please check the file location for the trained classifier.')
            print('Tried: ' + self.classifier_pkl)
            return

        # without preprocessing options the data frame is used unchanged
        if not self.image_processing_options:
            test_frame = self.data
        else:
            print('Preprocessing images')
            raw_images = kaggle_reader.load_all_raw_images(self.raw_image_path)
            test_frame = image_processing.process_roi_extractions(
                raw_images, self.data, self.image_processing_options)

        # build the feature matrix
        print('Extracting features')
        feature_matrix = feature_extractions.feature_extraction(
            test_frame,
            input_parameters=self.image_col_name,
            method=self.feature_method,
            extraction_options=self.feature_options)

        # classify every sample
        print('Running prediction')
        predictions = model.predict(feature_matrix)

        # ground truth starts out all True; category '0' rows are flipped
        print('Organizing expected results')
        truth = pandas.DataFrame(numpy.repeat([True], len(feature_matrix)))
        zero_category = self.data[self.category_col_name].values == '0'
        truth[zero_category] = False

        # hand expected/predicted labels and the test set to the analyzer
        analyze = analyze_predictions.AnalyzePredictions()
        analyze.set_expected(pandas.DataFrame(truth))
        analyze.set_predicted(pandas.DataFrame(predictions))
        analyze.set_test_set(self.data)

        # banner, classifier description and summary
        print()
        for banner_line in ("########################", "# CLASSIFIER", "#"):
            print(banner_line)
        print()
        print(model)
        print()
        analyze.print_summary_results()

        # nothing to save when no output directory was configured
        output_dir = self.prediction_output_dir
        if output_dir is None:
            return

        print('Saving predictions to disk')
        correct = analyze.get_correct_predictions()
        incorrect = analyze.get_incorrect_predictions()
        for subset, folder in ((correct, 'correct'), (incorrect, 'incorrect')):
            analyze.save_predictions_as_images(
                subset, os.path.join(output_dir, folder))
Пример #5
0
def run_moving_window(classifier,
                      image_array,
                      feature_method,
                      feature_options,
                      image_options,
                      window_sizes,
                      step_sizes,
                      nms_threshold,
                      plot=False,
                      padding="constant"):
    """
    Run the moving window approach

    The image is preprocessed, cut into sliding-window sub-images, every
    sub-image is classified and the positive windows are optionally thinned
    with non-maximum suppression (NMS).

    :param classifier: classifier to use for prediction
    :param image_array: input image data
    :param feature_method: the feature to extract
    :param feature_options: options for feature extraction, e.g., for Gaussian kernel this is the kernel sigmas.
                            Current options include:
                            'gkhp': Gaussian kernel Hadamard product feature, requires a list of Gaussian kernel sigmas
                            as extraction options
                            'hog': Histogram of gradients, requires options in a dict if non-default options are desired
                            'pixelval': extracting pixel values as features, no options needed (use None as input)
    :param image_options: options for preprocessing images before feature extractions, e.g., {'rgb2gray': None} to
                          convert RGB image into grayscale. If no option required, use {} (empty dict)
    :param window_sizes: sliding window size (nrow, ncol)
    :param step_sizes: step size for the sliding window (nrow, ncol)
    :param nms_threshold: Threshold parameter for the non-maximum suppression
    algorithm specifying the maximum allowable overlap; NMS is only applied when this value is below 1
    :param plot: when true, display_boxes is called with the image, the reduced boxes and the positive boxes
    :param padding: forwarded as the ``padding`` argument of extract_windowed_subimages_from_image
    :return: tuple ``(positive_boxes, reduced_positive_boxes)``. On the fast path (non-SVC classifier combined
             with the 'pixelval' feature method) the same filtered boxes are returned for both elements and
             no NMS or plotting is performed.
    """

    # the classifier is recognised by class name only, as in the original code
    is_svc = type(classifier).__name__ == "SVC"

    # preprocess the image
    print("Preprocess image")
    processed_image = image_processing.process_image(image_array,
                                                     image_options)

    # cut the processed image into windowed sub-images
    print("Extracting sub images", end=" ")
    start_time = time.time()
    boxes = extract_windowed_subimages_from_image(processed_image,
                                                  window_sizes,
                                                  step_sizes,
                                                  padding=padding)
    print(time.time() - start_time)

    if not is_svc and feature_method == "pixelval":
        # fast path: raw pixels are the features, so each window is scored
        # directly with predict_proba and kept when the probability of the
        # second class exceeds 0.5
        print("Using fast method")
        boxes = boxes[[
            classifier.predict_proba([x.ravel()])[0][1] > 0.5
            for x in boxes.ImageMat
        ]]
        return boxes, boxes

    # extract features according to the feature_method parameter and feature_options
    print("Extracting features", end=" ")
    start_time = time.time()
    features = feature_extractions.feature_extraction(boxes, 'ImageMat',
                                                      feature_method,
                                                      feature_options)
    print(time.time() - start_time)

    # run prediction
    print("Running prediction", end=" ")
    start_time = time.time()
    if is_svc:
        boxes["classification"] = classifier.predict(features)
        # explicit comparison kept on purpose: it is element-wise on the column
        positive_boxes = boxes[boxes.classification == True]  # noqa: E712
    else:
        boxes["prediction"] = classify_boxes(features, classifier)
        # fix: the filtered frame used to be assigned back to ``boxes``, which
        # left ``positive_boxes`` unbound and raised NameError further down
        positive_boxes = boxes[boxes.prediction > 0.5]
    print(time.time() - start_time)

    # drop the large intermediates before the overlap elimination
    start_time = time.time()
    del features
    del boxes
    print("Removing variables time" + str(time.time() - start_time))

    print("eliminating overlap", end=" ")
    start_time = time.time()
    if nms_threshold < 1:
        reduced_positive_boxes = NSM.remove_boxes_with_NSM(
            positive_boxes, nms_threshold)
    else:
        # a threshold of 1 or more disables the suppression step
        reduced_positive_boxes = positive_boxes
    print(time.time() - start_time)
    if plot:
        display_boxes(image_array, reduced_positive_boxes, positive_boxes)

    return positive_boxes, reduced_positive_boxes
Пример #6
0
def tune(main, model, tuned_parameters, feature_method="pixelval", fraction=1, diversity_vars=None, iterations=50,
         bayes=False, verbose=0, n_jobs=1, random_state=1):
    """
    Tune the hyper-parameters of ``model`` with a cross-validated grid search.

    A subset of ``main.data`` is sliced off, optionally preprocessed, turned into features and split into a
    training part (80%) and a held-out part. A ``GridSearchCV`` with a shuffled 5-fold inner cross-validation
    and ``neg_log_loss`` scoring is fitted on the training part; the best parameters are printed and written
    onto ``main`` as attributes.

    :param main: Classifier object; supplies the data, the feature/image options and receives the best parameters
    :param model: Classifier (estimator handed to the grid search)
    :param tuned_parameters: Grid with hyperparameters
    :param feature_method: string describing method to extract features. Currently not used by this function:
                           ``main.feature_method`` is used instead.
    :param fraction: Fraction of data used to tune hyperparameters
    :param diversity_vars: Variable names of which diversity should be maintained
    :param iterations: Amount of iteration for Bayesian search. Currently not used.
    :param bayes: boolean indicating to use Bayesian search (True) or normal search (False). Currently not used:
                  a grid search is always performed.
    :param verbose: How much to output (e.g. 0, 10, 50)
    :param n_jobs: Number of jobs to run in parallel (e.g. 1, 2, 4, 8)
    :param random_state: Basically the random seed (e.g. 1); used for slicing, the stratified split and the
                         shuffled K-fold
    :return: Classifier object and search object
    """

    data = split_data.slice_smaller_subset_of_data(main.data, fraction=fraction, diversity_vars=diversity_vars,
                                                   random_state=random_state)

    # preprocessing images
    if main.image_processing_options:
        print('Preprocessing images')
        image_df = kaggle_reader.load_all_raw_images(main.raw_image_path)
        processed_data = image_processing.process_roi_extractions(image_df, data,
                                                                  main.image_processing_options)
    else:
        processed_data = data

    # extract features from dataframes
    print('Extracting features using: '+main.feature_method)
    features = pandas.DataFrame(feature_extractions.feature_extraction(processed_data,
                                                                       input_parameters=main.image_col_name,
                                                                       method=main.feature_method,
                                                                       extraction_options=main.feature_options))

    # create target array which contains the correct answer:
    # all True, except rows whose category is the string '0'
    target = pandas.DataFrame(numpy.repeat([True], len(features)))
    target[data[main.category_col_name].values == '0'] = False

    # take the sliced data to further split into a training and x-validation (for hyper-parameter tuning) sets
    print('Preparing training and cross-validation sets')
    train_indices = split_data.split_data_stratified(data=data, fraction=0.8, diversity_vars=diversity_vars,
                                                     random_state=random_state)
    # boolean row mask: True for training rows, its complement is the held-out set
    train_mask = features.index.isin(features.index[train_indices])
    test_mask = ~train_mask
    train_features = features[train_mask]
    train_target = target[train_mask].values.ravel()
    test_features = features[test_mask]
    test_target = target[test_mask].values.ravel()

    # honour the caller's seed instead of a hard-coded 1 (the default is still 1)
    inner_cv = sklearn.model_selection.KFold(n_splits=5, shuffle=True, random_state=random_state)

    print('Begin training and tuning')
    opt = sklearn.model_selection.GridSearchCV(model, tuned_parameters, cv=inner_cv, scoring="neg_log_loss",
                                               verbose=verbose, n_jobs=n_jobs)
    opt.fit(train_features, train_target)
    print(opt.best_params_)
    print("Train score: %s" % opt.best_score_)
    print("Test score: %s" % opt.score(test_features, test_target))

    # copy the winning hyper-parameters onto the classifier object
    for name, value in opt.best_params_.items():
        setattr(main, name, value)
    return main, opt