Пример #1
0
class QClassifierImpl:
    """
    A wrapper for question classifier

    Loads labelled questions through a ``FeatureExtractor``, hashes the
    extracted string features with a ``FeatureHasher`` and trains a
    ``QClassifier`` on the result.  After ``train()`` has completed,
    ``get_type()`` maps a question to one of the names in ``self.cate``.
    """

    def __init__(self, train_data_path, pred_qs = None):
        """
        Constructor

        :param train_data_path: path handed to the feature extractor's
            ``load()`` when ``train()`` is called
        :param pred_qs: stored on the instance as ``self.pred_qs``; not used
            by any method of this class
        """
        # NOTE: this reconfigures the root logger and truncates
        # 'qclassifier.log' (filemode='w').
        logging.basicConfig(level = logging.DEBUG,
                format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                datefmt='%a, %d %b %Y %H:%M:%S',
                filename='qclassifier.log',
                filemode='w')
        # The default-encoding hack only exists on Python 2; on Python 3
        # ``reload`` is not a builtin and the call would raise NameError.
        if sys.version_info[0] < 3:
            reload(sys)
            sys.setdefaultencoding('utf8')

        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        # Predicted class index -> human readable category name.
        self.cate = ['Person', 'Number', 'Location', 'Other']

    def train(self):
        """
        Train use all of the given data

        The trained classifier and the vectorizer are only stored on the
        instance once training has finished, so a failure part-way through
        never leaves a half-initialised model behind for ``get_type()``.
        """
        self.extractor.load(path = self.path)
        features = self.extractor.extract_features()
        labels = self.extractor.get_labels()
        self.features = features
        self.labels = labels

        clf = QClassifier(questions = self.extractor.questions)
        assert len(labels) == len(features), \
            'feature/label count mismatch'

        # NOTE(review): ``non_negative`` was deprecated in scikit-learn 0.19
        # and later removed (``alternate_sign=False`` is the replacement) --
        # confirm the pinned scikit-learn version before upgrading.
        vectorizer = FeatureHasher(input_type = 'string', non_negative = True)
        X = vectorizer.transform(features)
        Y = asarray(labels)

        logging.info('start training')
        clf.train(X, Y)
        logging.info('done')

        # Publish the model only after a successful fit.
        self.vectorizer = vectorizer
        self.clf = clf

    def get_type(self, question):
        """
        Get type for a given question

        :param question: the question to classify
        :return: one of the names in ``self.cate``, or ``None`` (with an
            error logged) when the model is not trained or the question is
            empty
        """
        # Check the artefacts prediction actually needs.  Testing the
        # truthiness of the stored features/labels is ambiguous for
        # array-like containers and says nothing about whether a model
        # exists.
        if self.clf is None or self.vectorizer is None:
            logging.error('You need to train model first!')
            return None
        if not question:
            logging.error('Question should not be None')
            return None
        f = [self.extractor.extract_features_aux(question)]
        f = self.vectorizer.transform(f)
        # int() lets the prediction index the list even when the classifier
        # returns a non-integer numeric label (e.g. a numpy float).
        return self.cate[int(self.clf.predict(f)[0])]
Пример #2
0
    def test_extract_features_4x4_returns_correct_dimensions_and_colour(self):
        """An all-zero row with 16 pixel columns must yield one row of 15 zero features."""
        csv_text = "label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,pixel11,pixel12,pixel13,pixel14,pixel15\n0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0"
        pixels_df = pd.read_csv(io.StringIO(csv_text))
        extractor = FeatureExtractor(logging.Logger("FeatureExtractor"), 4, 4, 1)

        result = extractor.extract_features(pixels_df)

        # Shape first, then a cell-by-cell comparison against all zeros.
        expected = pd.DataFrame([[0] * 15])
        self.assertEqual((1,15), result.shape)
        self.assertTrue(expected.compare(result).empty)
Пример #3
0
    def test_extract_features_x10_full_size_returns_correct_features(self):
        """The first row of Data/train.csv must produce the 75 reference feature values."""
        first_row_df = pd.read_csv("Data/train.csv", nrows=1)
        extractor = FeatureExtractor(logging.Logger("FeatureExtractor"), 4, 4, 2)

        result = extractor.extract_features(first_row_df)

        # Reference values for that row, compared as a single Series.
        expected_row = pd.DataFrame([[0, 15.93750, 0, 0, 63.75, 0, 0, 63.75, 0, 1, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                      0, ]]).iloc[0,:]
        self.assertEqual((1, 75), result.shape)
        self.assertTrue(expected_row.compare(result.iloc[0,:]).empty)
Пример #4
0
def test_signature(test_dir):
    """Extract features from every image found for *test_dir* and dump them.

    Each image returned by ``find_image_files(test_dir)`` is opened,
    preprocessed and turned into a feature vector; the vectors are stacked
    into one NumPy array and written with ``ndarray.dump`` to
    ``test_dir + "feature_dump"``.

    :param test_dir: directory prefix; it is concatenated verbatim with the
        dump file name, so it should end with a path separator
    :return: None. If no images are found an empty array is dumped.
    """
    improc = ImageProcessor()
    ftextr = FeatureExtractor()
    img_files = find_image_files(test_dir)
    total = len(img_files)

    F = []
    for position, ifile in enumerate(img_files, start=1):
        # float() keeps the percentage a true division on any interpreter.
        print("Extracting features, " +
              str(round(float(position) / total * 100, 1)) + "% done ...")
        signature = Image.open(ifile)
        processed = improc.preprocess(signature)
        F.append(ftextr.extract_features(processed))

    # np.array() sizes the result itself; no pre-allocation is needed, and
    # none is attempted so that an empty image list does not fail on F[0].
    F = np.array(F)
    F.dump(test_dir + "feature_dump")
Пример #5
0
def save_features(training_dir):
    """Extract features and class labels for the training images and dump both.

    Every image returned by ``find_image_files(training_dir)`` is
    preprocessed and converted to a feature vector while a 100-character
    progress bar is redrawn on the current line.  The label is derived from
    the image path: paths starting with ``dataset/TrainingSet/D``, ``.../G``
    and ``.../R`` map to 1, 2 and 3; anything else maps to 0.  These
    prefixes are fixed and do not depend on *training_dir*.

    Features are written to ``training_dir + "feature_dump"`` and labels to
    ``training_dir + "label_dump"`` using ``ndarray.dump``.

    :param training_dir: directory prefix; it is concatenated verbatim with
        the dump file names, so it should end with a path separator
    :return: None. If no images are found, empty arrays are dumped.
    """
    print("Extracting features ...")
    # Path prefix -> class label; the first matching prefix wins.
    label_by_prefix = (
        ("dataset/TrainingSet/D", 1),
        ("dataset/TrainingSet/G", 2),
        ("dataset/TrainingSet/R", 3),
    )

    improc = ImageProcessor()
    ftextr = FeatureExtractor()
    img_files = find_image_files(training_dir)
    total = len(img_files)

    F = []
    L = []
    for position, ifile in enumerate(img_files, start=1):
        done = float(position) / total * 100
        filled = int(done)
        # "\r" returns to the start of the line so the bar redraws in place.
        print("\r[" + "|" * filled + " " * (100 - filled) + "] " + ifile +
              ", " + str(round(done, 2)) + "%", end="")

        signature = Image.open(ifile)
        processed = improc.preprocess(signature)
        F.append(ftextr.extract_features(processed))

        L.append(next((label for prefix, label in label_by_prefix
                       if ifile.startswith(prefix)), 0))

    # np.array() sizes the results itself; no pre-allocation is needed, and
    # none is attempted so that an empty image list does not fail on F[0].
    F = np.array(F)
    F.dump(training_dir + "feature_dump")

    L = np.array(L)
    L.dump(training_dir + "label_dump")
Пример #6
0
class QClassifierImpl:
    """
    A wrapper for question classifier

    Questions are loaded by a ``FeatureExtractor``, their string features
    are hashed with a ``FeatureHasher`` and a ``QClassifier`` is trained on
    them.  Once ``train()`` has finished, ``get_type()`` returns the
    category name (an entry of ``self.cate``) predicted for a question.
    """
    def __init__(self, train_data_path, pred_qs=None):
        """
        Constructor

        :param train_data_path: path passed to the extractor's ``load()``
            during ``train()``
        :param pred_qs: kept as ``self.pred_qs``; no method of this class
            reads it
        """
        # NOTE: configures the root logger and truncates 'qclassifier.log'
        # on every construction (filemode='w').
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            datefmt='%a, %d %b %Y %H:%M:%S',
            filename='qclassifier.log',
            filemode='w')
        # ``reload``/``setdefaultencoding`` exist on Python 2 only; running
        # them unguarded raises NameError on Python 3.
        if sys.version_info[0] < 3:
            reload(sys)
            sys.setdefaultencoding('utf8')

        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        # Index predicted by the classifier -> category name.
        self.cate = ['Person', 'Number', 'Location', 'Other']

    def train(self):
        """
        Train use all of the given data

        ``self.clf`` and ``self.vectorizer`` are assigned only after the
        classifier has been fitted, so an exception raised while training
        cannot leave a partially initialised model visible to
        ``get_type()``.
        """
        self.extractor.load(path=self.path)
        features = self.extractor.extract_features()
        labels = self.extractor.get_labels()
        self.features = features
        self.labels = labels

        clf = QClassifier(questions=self.extractor.questions)
        assert len(labels) == len(features), 'feature/label count mismatch'

        # NOTE(review): ``non_negative`` was deprecated in scikit-learn 0.19
        # and later removed (``alternate_sign=False`` replaces it) -- confirm
        # the scikit-learn version in use before upgrading.
        vectorizer = FeatureHasher(input_type='string', non_negative=True)
        X = vectorizer.transform(features)
        Y = asarray(labels)

        logging.info('start training')
        clf.train(X, Y)
        logging.info('done')

        # Expose the model only now that fitting has succeeded.
        self.vectorizer = vectorizer
        self.clf = clf

    def get_type(self, question):
        """
        Get type for a given question

        :param question: the question to classify
        :return: a name from ``self.cate``; ``None`` (after logging an
            error) if the model has not been trained or the question is
            empty
        """
        # Guard on what prediction really needs.  Truth-testing the stored
        # features/labels is ambiguous for array-like containers and does
        # not prove that a fitted model exists.
        if self.clf is None or self.vectorizer is None:
            logging.error('You need to train model first!')
            return None
        if not question:
            logging.error('Question should not be None')
            return None
        f = [self.extractor.extract_features_aux(question)]
        f = self.vectorizer.transform(f)
        # int() makes non-integer numeric labels usable as a list index.
        return self.cate[int(self.clf.predict(f)[0])]
class VehicleClassifier():
    '''Linear SVC wrapper that classifies image feature vectors as vehicle / non-vehicle'''

    def __init__(self, trained_model_path=None):
        '''
        Set up an untrained classifier

        :param trained_model_path: directory in which the trained model and scaler are stored
        '''
        self.trained_model_path = trained_model_path
        self.feature_extractor = None
        self.X_scaler = None
        self.clf = None
        self.trained = False

    def load_training_images(self, vehicle_path=None, non_vehicle_path=None):
        '''
        Collect the paths of all training images (*.png, searched recursively)

        :param vehicle_path: directory holding the vehicle training images
        :param non_vehicle_path: directory holding the non-vehicle training images
        :return: two lists with full paths to vehicle and non-vehicle images respectively
        '''
        print("Loading training image names...")
        vehicles = list(glob.glob(vehicle_path + '/**/*.png', recursive=True))
        non_vehicles = list(
            glob.glob(non_vehicle_path + '/**/*.png', recursive=True))

        print('    # of vehicle images: {}'.format(len(vehicles)))
        print('# of non-vehicle images: {}'.format(len(non_vehicles)))

        return vehicles, non_vehicles

    def _features_for(self, image_paths):
        '''Run the feature extractor over image_paths with the module-level extraction settings'''
        return self.feature_extractor.extract_features(
            image_paths,
            cspace=CSPACE,
            spatial_size=(SPATIAL_SIZE, SPATIAL_SIZE),
            hist_bins=HIST_BIN,
            hist_range=HIST_RANGE,
            hog_cell_per_block=HOG_CELL_PER_BLOCK,
            hog_channel=HOG_CHANNEL,
            hog_pix_per_cell=HOG_PIX_PER_CELL,
            hog_orient=HOG_ORIENT_BINS)

    def extract_features(self, vehicles, non_vehicles):
        '''
        Load the training images, extract their features and normalise them

        A fresh StandardScaler is fitted on the stacked features and kept in self.X_scaler.

        :param vehicles: list of paths to vehicle images
        :param non_vehicles: list of paths to non-vehicle images
        :return: scaled_X: normalised feature vector, y: true labels (1 = vehicle, 0 = non-vehicle)
        '''
        self.feature_extractor = FeatureExtractor()

        print("Loading images and extracting features...")
        started = time.time()
        vehicle_features = self._features_for(vehicles)
        non_vehicle_features = self._features_for(non_vehicles)

        # Vehicles first, non-vehicles second; the label vector below follows the same order
        stacked = np.vstack(
            (vehicle_features, non_vehicle_features)).astype(np.float64)
        self.X_scaler = StandardScaler().fit(stacked)
        scaled_X = self.X_scaler.transform(stacked)

        vehicle_labels = np.ones(len(vehicle_features))
        non_vehicle_labels = np.zeros(len(non_vehicle_features))
        y = np.hstack((vehicle_labels, non_vehicle_labels))
        finished = time.time()

        print('Number of features: {}'.format(scaled_X.shape[1]))
        print('Feature extraction time: {}'.format(round(finished - started, 2)))

        return scaled_X, y

    def train(self, X, y):
        '''
        Fit a LinearSVC on a stratified split of X / y and print its held-out accuracy

        :param X: array of feature vectors (normalised)
        :param y: array of true labels
        '''
        # Hold back part of the data so the accuracy is measured on unseen samples
        split = train_test_split(
            X,
            y,
            test_size=TEST_TRAIN_RATIO,
            random_state=RANDOM_STATE,
            stratify=y)
        X_train, X_test, y_train, y_test = split

        print("Training SVC...")
        self.clf = LinearSVC()
        started = time.time()
        self.clf.fit(X_train, y_train)
        elapsed = time.time() - started
        print('Classifier training time: {}'.format(round(elapsed, 2)))

        # score() refuses to run until this flag is set
        self.trained = True

        accuracy = self.score(X_test, y_test)
        print('Classifier test set accuracy: {}'.format(round(accuracy, 4)))

    def save_model(self, name):
        '''
        Write the trained classifier and the scaler to disc

        :param name: model name; "_model.pkl" and "_scaler.pkl" are appended to it
        '''
        if not self.trained:
            print("ERROR: model not yet trained")
            return None

        prefix = self.trained_model_path + '/' + name
        joblib.dump(self.clf, prefix + '_model.pkl')
        joblib.dump(self.X_scaler, prefix + '_scaler.pkl')

    def load_model(self, name):
        '''
        Restore a classifier and scaler previously written by save_model

        :param name: model name; "_model.pkl" and "_scaler.pkl" are appended to it
        '''
        # Reset every attribute to its initial state while keeping the model directory
        self.__init__(self.trained_model_path)

        prefix = self.trained_model_path + '/' + name
        self.clf = joblib.load(prefix + '_model.pkl')
        self.X_scaler = joblib.load(prefix + '_scaler.pkl')
        self.feature_extractor = FeatureExtractor()

        self.trained = True

    def predict(self, X):
        '''
        Predict labels for the feature vectors in X

        :param X: array of feature vectors
        :return: array of predictions, or None (after printing an error) when untrained
        '''
        if not self.trained:
            print("ERROR: model not yet trained")
            return None
        return self.clf.predict(X)

    def score(self, X, y):
        '''
        Compute the classifier accuracy on X against the true labels y

        :param X: array of feature vectors (normalised)
        :param y: array of true labels
        :return: classifier accuracy, or None (after printing an error) when untrained
        '''
        if not self.trained:
            print("ERROR: model not yet trained")
            return None
        return self.clf.score(X, y)
Пример #8
0
import pandas as pd
import csv
import logging

from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from FeatureExtractor import FeatureExtractor

# Script entry point: extract features from the training CSV, fit a
# logistic-regression model on them and report its accuracy, then start
# preparing the test data.
if __name__ == '__main__':
    logger = logging.Logger("MainLogger")
    digit_recogniser = FeatureExtractor(logger, 28, 28,2,True)

    pixel_training_data_df = pd.read_csv("Data/train.csv")
    training_feature_df = digit_recogniser.extract_features(pixel_training_data_df)

    print(training_feature_df.shape)
    # Column 0 of the extracted frame is used as the target; every other
    # column is used as a model input.
    x_data = training_feature_df.iloc[:, 1:]
    y_data = training_feature_df.iloc[:, [0]]

    # Standardise the inputs; the fitted scaler is kept in `scaler`.
    scaler = preprocessing.StandardScaler().fit(x_data)
    x_data_scaled = scaler.transform(x_data)

    logistic_model = LogisticRegression(random_state=0, penalty='l1', solver='saga', tol=0.1).fit(x_data_scaled,
                                                                                                  y_data.values[:, 0])
    # NOTE: the score is computed on the same rows the model was fitted on,
    # so it is a training accuracy, not a held-out estimate.
    accuracy = logistic_model.score(x_data_scaled, y_data.values[:, 0])

    print(f"Training completed. Accuracy Rate: {accuracy * 100}%")

    # create predictions for the test data now
    pixel_test_data_df = pd.read_csv("Data/test.csv")
    # NOTE(review): the training extractor above is built with five
    # positional arguments (..., 28, 28, 2, True) but this one with four
    # (..., 28, 28, False); `False` may be landing in the slot that received
    # `2` above -- confirm against FeatureExtractor's signature.
    digit_recogniser = FeatureExtractor(logger, 28, 28, False)