def __init_data__(self, args):
        '''
        Initialize preprocessing: load the raw dataset and split it into
        training and test sets. The resulting x_train/x_test hold sequences
        of token indices rather than raw text.
        '''
        self.preprocessing = Preprocessing(args)
        self.preprocessing.load_data()
        self.preprocessing.prepare_tokens()

        raw_x_train = self.preprocessing.x_train
        raw_x_test = self.preprocessing.x_test

        self.y_train = self.preprocessing.y_train
        self.y_test = self.preprocessing.y_test

        self.x_train = self.preprocessing.sequence_to_token(raw_x_train)
        self.x_test = self.preprocessing.sequence_to_token(raw_x_test)
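
# The example above assumes a Preprocessing class whose prepare_tokens() fits a
# tokenizer and whose sequence_to_token() turns raw tweets into fixed-length
# sequences of token indices. A minimal sketch of such helpers, assuming the
# Keras Tokenizer API (names and sizes below are illustrative, not taken from
# the original class):
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(["an example tweet", "another example tweet"])  # prepare_tokens()

def sequence_to_token(texts, max_len=20):
    # Map each text to a list of token indices, then pad to a fixed length.
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_len)

print(sequence_to_token(["yet another example tweet"]))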
Example #2
 def test_standardization(self):
     X_train = np.array([[1,2], [2,4], [3,6]])
     X_test = np.array([[4,8], [5,10]])
     pre_X_train, pre_X_test, _ = Preprocessing.standardization(X_train, X_test, mode='zscore')
     print()
     print("test_standardization ===========================")
     print('X_train => \n', X_train)
     print('X_test => \n', X_test)
     print('pre_X_train => \n', pre_X_train)
     print('pre_X_test => \n', pre_X_test)
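
# A minimal sketch of the z-score standardization this test implies: fit the
# mean and standard deviation on the training data only and apply them to both
# splits. Returning the fitted statistics as the third value is an assumption
# about the real Preprocessing.standardization helper.
import numpy as np

def standardization(X_train, X_test, mode='zscore'):
    if mode != 'zscore':
        raise ValueError("only the zscore mode is sketched here")
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    std[std == 0] = 1.0  # guard against constant columns
    return (X_train - mean) / std, (X_test - mean) / std, (mean, std)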
Example #3
 def test_prep_missing_val(self):
     X_train = np.array([[np.nan, 1], [2,3], [np.nan, np.nan]])
     X_test = np.array([[np.nan, 1], [2,3]])
     y_train = np.array([1,2,3])
     y_test = np.array([4,5])
     pre_X_train, pre_X_test, pre_y_train, pre_y_test = Preprocessing.prep_missing_val(X_train, X_test, y_train, y_test, mode='remove')
     print()
     print('test_prep_missing_val ==========================')
     print('X_train => ', X_train, 'y_train => ', y_train)
     print('pre_X_train => ', pre_X_train, 'pre_X_test => ', pre_X_test)
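
# A minimal sketch of prep_missing_val in 'remove' mode: drop every row that
# contains a NaN together with its label. Other modes (e.g. imputation) in the
# real Preprocessing helper are not sketched here.
import numpy as np

def prep_missing_val(X_train, X_test, y_train, y_test, mode='remove'):
    if mode != 'remove':
        raise ValueError("only the remove mode is sketched here")
    keep_train = ~np.isnan(X_train).any(axis=1)
    keep_test = ~np.isnan(X_test).any(axis=1)
    return (X_train[keep_train], X_test[keep_test],
            y_train[keep_train], y_test[keep_test])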
def normalizeData(inputDataClass):
    ######################################## Normalising Data ####################################
    normalizer = Preprocessing.Normalise()
    # Fit the normaliser on the training features and reuse its statistics on the test features;
    # the label column (last column) is left untouched and re-attached after scaling.
    inputDataClass.Train = np.hstack((normalizer.scale(inputDataClass.Train[:, :-1], train=True),
                                      inputDataClass.Train[:, -1].reshape(-1, 1)))
    inputDataClass.Test = np.hstack((normalizer.scale(inputDataClass.Test[:, :-1], train=False),
                                     inputDataClass.Test[:, -1].reshape(-1, 1)))
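
# A minimal sketch of a Normalise helper compatible with the calls above: the
# scaling statistics are fitted when train=True and reused when train=False.
# Min-max scaling is an assumption; the original class may scale differently.
import numpy as np

class Normalise:
    def scale(self, X, train=True):
        if train:
            self.min = X.min(axis=0)
            self.range = X.max(axis=0) - self.min
            self.range = np.where(self.range == 0, 1.0, self.range)  # avoid division by zero
        return (X - self.min) / self.range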
def var_vs_comp(X, start, stop, step):
    print("Making variance v/s components plot. . . ")
    components = []
    variances = []
    i_cols = np.arange(start, stop, step)
    # Fit a PCA for each candidate number of components and record the
    # fraction of variance retained.
    for k in i_cols:
        pca = Preprocessing.PCA(X, k=k, whiten=False)
        components.append(k)
        variances.append(pca.var_retained)

    plt.plot(components, variances)
    plt.ylabel('variance retained')
    plt.xlabel('number of components')
    plt.show()
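
# A minimal sketch of the PCA helper that var_vs_comp relies on: fit on the
# given matrix, keep k components, and expose the fraction of variance
# retained. Parameter names mirror the calls above, but the implementation
# itself is an assumption about the original Preprocessing.PCA class.
import numpy as np

class PCA:
    def __init__(self, X, k, whiten=False):
        self.mean = X.mean(axis=0)
        # SVD of the centred data gives the principal directions and the
        # singular values, whose squares are proportional to the variances.
        _, s, vt = np.linalg.svd(X - self.mean, full_matrices=False)
        var = s ** 2
        self.var_retained = var[:k].sum() / var.sum()
        self.components = vt[:k]
        self.whiten = whiten
        self.scale = s[:k] / np.sqrt(len(X) - 1)

    def reduce(self, X, train=True):
        # The train flag is kept only for signature compatibility; the
        # projection always uses the statistics fitted in __init__.
        Z = (X - self.mean) @ self.components.T
        return Z / self.scale if self.whiten else Z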
def performPCA(inputDataClass, reduced_columns):
    ############################################## PCA Visualisation #############################################
    # #variance v/s n_components : Fashion MNIST
    # start = 10
    # stop = 500
    # step = 15
    # Visualization.var_vs_comp(inputDataClass.Train[:,:-1], start, stop, step)
    ########################################################### PCA #############################################

    ##### Our PCA ####
    # Fit PCA on the training features only; the label column is kept aside
    # and re-attached after the projection.
    pca = Preprocessing.PCA(inputDataClass.Train[:, :-1],
                            k=reduced_columns,
                            whiten=False)  ##### Hyperparameter ####
    reduced_train = pca.reduce(inputDataClass.Train[:, :-1], True)
    inputDataClass.Train = np.hstack((reduced_train, inputDataClass.Train[:, -1].reshape(-1, 1)))
    print("train_data reduced.")
    print("Train data reduced to columns = " + str(reduced_train.shape[1]))

    reduced_test = pca.reduce(inputDataClass.Test[:, :-1], False)
    inputDataClass.Test = np.hstack((reduced_test, inputDataClass.Test[:, -1].reshape(-1, 1)))
    print("test_data reduced.")
    print("Test data reduced to columns = " + str(reduced_test.shape[1]))
# All components of DataGenerator are:
# load file, preprocessing, augmentation, batch creation, encoder output
inputs_file_loader = dict(format=img_format)
file_loader = FileLoader(**inputs_file_loader)

# Generator of inputs for the model
inputs_model = InputsModel(model_name=model_name)
batch_creator = inputs_model.create_batch

# Define preprocessing
# preprocessing inputs
target_size = 250

preprocessing = Preprocessing([
    ("rescale", dict(target_size=target_size)),
])
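
# A minimal sketch of how a step-list Preprocessing like the one above could
# be applied: each (name, kwargs) pair selects a transform and the steps run
# in order. The 'rescale' step (an OpenCV resize to target_size x target_size)
# is an assumption about the real class.
import cv2

class PreprocessingSketch:
    def __init__(self, steps):
        self.steps = steps

    def _rescale(self, image, target_size):
        return cv2.resize(image, (target_size, target_size))

    def __call__(self, image):
        for name, kwargs in self.steps:
            image = getattr(self, "_" + name)(image, **kwargs)
        return image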

# Define augmentation
augmentation = None  #Augmentation()

encoder_output = EncoderOutput(
    order_output_model=order_output_model,
    encode_labels=encode_labels  # dict to map labels in 'data/labels.json' to other classes
)

# shared configuration for both train and test data generators
config_generator = dict(labels=labels,
                        file_loader=file_loader,
                        batch_creator=batch_creator,
                        preprocessing=preprocessing,
                        augmentation=augmentation,
                        encoder_output=encoder_output)

def prepare_data(num_words, seq_len):
    # Preprocessing pipeline
    pr = Preprocessing(num_words, seq_len)
    pr.load_data()
    pr.clean_text()
    pr.text_tokenization()
    pr.build_vocabulary()
    pr.word_to_idx()
    pr.padding_sentences()
    pr.split_data()

    return {
        'x_train': pr.x_train,
        'y_train': pr.y_train,
        'x_test': pr.x_test,
        'y_test': pr.y_test
    }
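
# Hypothetical usage of the pipeline above; the num_words and seq_len values
# are illustrative, not taken from the original project.
data = prepare_data(num_words=2000, seq_len=35)
print(len(data['x_train']), len(data['x_test']))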
class Execute:
    '''
    Class for execution. Initializes the preprocessing pipeline as well as
    the TweetClassifier model.
    '''
    def __init__(self, args):
        self.__init_data__(args)

        self.args = args
        self.batch_size = args.batch_size

        self.model = TweetClassifier(args)

    def __init_data__(self, args):
        '''
        Initialize preprocessing: load the raw dataset and split it into
        training and test sets. The resulting x_train/x_test hold sequences
        of token indices rather than raw text.
        '''
        self.preprocessing = Preprocessing(args)
        self.preprocessing.load_data()
        self.preprocessing.prepare_tokens()

        raw_x_train = self.preprocessing.x_train
        raw_x_test = self.preprocessing.x_test

        self.y_train = self.preprocessing.y_train
        self.y_test = self.preprocessing.y_test

        self.x_train = self.preprocessing.sequence_to_token(raw_x_train)
        self.x_test = self.preprocessing.sequence_to_token(raw_x_test)

    def train(self):

        training_set = DatasetMaper(self.x_train, self.y_train)
        test_set = DatasetMaper(self.x_test, self.y_test)

        self.loader_training = DataLoader(training_set,
                                          batch_size=self.batch_size)
        self.loader_test = DataLoader(test_set)

        optimizer = optim.RMSprop(self.model.parameters(),
                                  lr=self.args.learning_rate)
        for epoch in range(self.args.epochs):

            predictions = []

            self.model.train()

            for x_batch, y_batch in self.loader_training:

                x = x_batch.type(torch.LongTensor)
                y = y_batch.type(torch.FloatTensor)

                y_pred = self.model(x)

                loss = F.binary_cross_entropy(y_pred, y)

                optimizer.zero_grad()

                loss.backward()

                optimizer.step()

                predictions += list(y_pred.squeeze().detach().numpy())

            test_predictions = self.evaluation()

            train_accuracy = self.calculate_accuracy(self.y_train, predictions)
            test_accuracy = self.calculate_accuracy(self.y_test,
                                                    test_predictions)

            print(
                "Epoch: %d, loss: %.5f, Train accuracy: %.5f, Test accuracy: %.5f"
                % (epoch + 1, loss.item(), train_accuracy, test_accuracy))

    def evaluation(self):

        predictions = []
        self.model.eval()
        with torch.no_grad():
            for x_batch, y_batch in self.loader_test:
                x = x_batch.type(torch.LongTensor)
                y = y_batch.type(torch.FloatTensor)

                y_pred = self.model(x)
                predictions += list(y_pred.detach().numpy())

        return predictions

    @staticmethod
    def calculate_accuracy(ground_truth, predictions):
        true_positives = 0
        true_negatives = 0

        for true, pred in zip(ground_truth, predictions):
            if (pred > 0.5) and (true == 1):
                true_positives += 1
            elif (pred < 0.5) and (true == 0):
                true_negatives += 1
            else:
                pass

        return (true_positives + true_negatives) / len(ground_truth)
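
# Hypothetical driver for the Execute class above. The batch_size,
# learning_rate and epochs arguments appear in the code; any further arguments
# that Preprocessing or TweetClassifier expect would need to be added.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--learning_rate", type=float, default=0.001)
    parser.add_argument("--epochs", type=int, default=10)
    args = parser.parse_args()

    execute = Execute(args)
    execute.train()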