Example #1
    def _split(self):
        """
        Splits data and labels into training, validation, and test sets.
        """
        self.train, valtest, self.train_labels, valtest_labels = split_data(
            self.symptoms_data,
            self.disease_data,
            shuffle=True,
            train_size=.7,
            test_size=.3)
        self.valid, self.test, self.valid_labels, self.test_labels = split_data(
            valtest, valtest_labels, shuffle=True, train_size=.5, test_size=.5)
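Every example on this page calls a split_data helper whose keyword arguments (train_size, test_size, shuffle, stratify, random_state) match scikit-learn's train_test_split; the alias in the sketches below is an assumption, not something the snippets themselves confirm. A minimal sketch of the two-stage 70/15/15 split used above:

# Assumption: split_data is sklearn's train_test_split under another name.
from sklearn.model_selection import train_test_split as split_data
import numpy as np

X = np.arange(100).reshape(50, 2)  # toy features
y = np.arange(50)                  # toy labels

# Stage 1: 70% train, 30% held out.
X_train, X_rest, y_train, y_rest = split_data(
    X, y, shuffle=True, train_size=.7, test_size=.3)
# Stage 2: halve the held-out 30% into validation and test.
X_val, X_test, y_val, y_test = split_data(
    X_rest, y_rest, shuffle=True, train_size=.5, test_size=.5)
print(len(X_train), len(X_val), len(X_test))  # 35 7 8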
Example #2
    def run(self):
        print('Loading data and building vocabulary.')
        train_df = self.load_quora()

        # Split into train and validation sets
        validation_size = 2000
        training_size = len(train_df) - validation_size

        X = train_df[self.questions_cols]
        Y = train_df['is_duplicate']

        self.x_train, self.x_val, self.y_train, self.y_val = split_data(
            X, Y, test_size=validation_size)

        # Split to lists
        self.x_train = [self.x_train.question1, self.x_train.question2]
        self.x_val = [self.x_val.question1, self.x_val.question2]
        # self.x_test = [test_df.question1, test_df.question2]

        # Convert labels to their numpy representations
        self.y_train = self.y_train.values
        self.y_val = self.y_val.values

        print('Padding Sequences.')
        self.pad_sequences()
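One detail worth flagging in the call above: an integer test_size (here validation_size = 2000) is an absolute row count, not a fraction, assuming split_data is scikit-learn's train_test_split. A small sketch:

# Assumption: split_data aliases sklearn.model_selection.train_test_split.
from sklearn.model_selection import train_test_split as split_data
import pandas as pd

df = pd.DataFrame({'question1': list('abcdefghij'),
                   'question2': list('klmnopqrst'),
                   'is_duplicate': [0, 1] * 5})
x_train, x_val, y_train, y_val = split_data(
    df[['question1', 'question2']], df['is_duplicate'], test_size=4)
print(len(x_train), len(x_val))  # 6 4 -- exactly 4 validation rows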
Example #3
    def run(self):
        # Loading data and building vocabulary.
        data_df = self.load_data()
        data_size = len(data_df)

        X = data_df[self.sequence_cols]
        Y = data_df[self.score_col]

        self.x_train, self.x_val, self.y_train, self.y_val = split_data(
            X, Y, train_size=self.train_ratio)

        # Convert labels to their numpy representations
        self.y_train = self.y_train.values
        self.y_val = self.y_val.values

        training_pairs = []
        training_scores = []
        validation_pairs = []
        validation_scores = []

        # Split to lists
        i = 0
        for index, row in self.x_train.iterrows():
            sequence_1 = row[self.sequence_cols[0]]
            sequence_2 = row[self.sequence_cols[1]]
            if len(sequence_1) > 0 and len(sequence_2) > 0:
                training_pairs.append([sequence_1, sequence_2])
                training_scores.append(float(self.y_train[i]))
            i += 1
        self.x_train = training_pairs
        self.y_train = training_scores

        print('Number of Training Positive Samples   :', sum(training_scores))
        print('Number of Training Negative Samples   :',
              len(training_scores) - sum(training_scores))

        i = 0
        for index, row in self.x_val.iterrows():
            sequence_1 = row[self.sequence_cols[0]]
            sequence_2 = row[self.sequence_cols[1]]
            if len(sequence_1) > 0 and len(sequence_2) > 0:
                validation_pairs.append([sequence_1, sequence_2])
                validation_scores.append(float(self.y_val[i]))
            i += 1

        self.x_val = validation_pairs
        self.y_val = validation_scores

        print('Number of Validation Positive Samples   :',
              sum(validation_scores))
        print('Number of Validation Negative Samples   :',
              len(validation_scores) - sum(validation_scores))

        assert len(self.x_train) == len(self.y_train)
        assert len(self.x_val) == len(self.y_val)

        self.convert_to_tensors()
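The two counter-driven loops above drop any pair with an empty sequence while keeping labels aligned; the same filtering can be written with a boolean mask. This is an equivalent sketch, not the author's code, and the helper name is hypothetical; it assumes the sequence columns hold strings (or lists, which .str.len() also handles):

import numpy as np
import pandas as pd

def filter_nonempty_pairs(x_df, y, sequence_cols):
    # Hypothetical helper: keep rows where both sequences are non-empty.
    mask = ((x_df[sequence_cols[0]].str.len() > 0)
            & (x_df[sequence_cols[1]].str.len() > 0))
    pairs = x_df.loc[mask, sequence_cols].values.tolist()
    scores = [float(s) for s in np.asarray(y)[mask.to_numpy()]]
    return pairs, scores

x = pd.DataFrame({'s1': ['hi', '', 'yo'], 's2': ['ok', 'x', 'go']})
print(filter_nonempty_pairs(x, np.array([1, 0, 1]), ['s1', 's2']))
# ([['hi', 'ok'], ['yo', 'go']], [1.0, 1.0])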
Example #4
    def split_data(self, data_df):
        data_size = len(data_df)

        X = data_df[self.sequence_cols]
        Y = data_df[self.score_col]

        self.x_train, self.x_val, self.y_train, self.y_val = split_data(
            X, Y, train_size=self.train_ratio)

        self.x_train = [self.x_train[column] for column in self.sequence_cols]
        self.x_val = [self.x_val[column] for column in self.sequence_cols]
        self.y_train = self.y_train.values
        self.y_val = self.y_val.values
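The two list comprehensions above just pull each sequence column out of the split DataFrame as its own Series, the layout a two-branch (e.g. Siamese) model typically takes as input; a minimal illustration:

import pandas as pd

sequence_cols = ['question1', 'question2']
x_train = pd.DataFrame({'question1': ['a', 'b'], 'question2': ['c', 'd']})
inputs = [x_train[column] for column in sequence_cols]
print(len(inputs), list(inputs[0]))  # 2 ['a', 'b']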
Example #5
    def __init__(self, train_test_split=70, filename="data.csv", norm=True):
        self.hp = {
            "num_filt_1": 8,  #Number of filters in first conv layer
            "num_filt_2": 4,
            "num_filt_3": 2,
            "num_fc_1": 12,  #number of neurons in fully connected layer
            "max_iterations": 500,
            "batch_size": 16,
            "dropout": 0.70,
            "learning_rate": 2e-5,
            "input_norm": False
        }
        self.__filename = filename
        self.__ttsplit = train_test_split * 0.01  # e.g. 70 -> 0.70
        self.norm = norm
        # Load the data from CSV.
        self.data = np.loadtxt(self.__filename, delimiter=',')
        self.data_train, self.data_test_val = split_data(
            self.data, train_size=self.__ttsplit)  # 70 means 70% training rows

        self.data_test, self.data_val = np.array_split(self.data_test_val, 2)

        self.X_train = self.data_train[:, 1:]
        self.X_val = self.data_val[:, 1:]
        self.X_test = self.data_test[:, 1:]
        self.__N = self.X_train.shape[0]
        print("N: " + str(self.__N))
        self.__D = self.X_train.shape[1]
        self.y_train = self.data_train[:, 0]
        self.y_val = self.data_val[:, 0]
        self.y_test = self.data_test[:, 0]
        print("We have %s observations with %s dimensions" %
              (self.__N, self.__D))

        self.num_classes = len(np.unique(self.y_train))
        base = np.min(self.y_train)
        if base != 0:  # shift labels so they start at zero
            self.y_train -= base
            self.y_val -= base
            self.y_test -= base

        if self.norm:
            self.input_normalize()
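np.array_split in the constructor above tolerates an odd-sized held-out block, unlike np.split, so the test and validation halves may differ by one row; a quick check:

import numpy as np

held_out = np.arange(14).reshape(7, 2)   # 7 rows: cannot split evenly
test, val = np.array_split(held_out, 2)  # array_split allows unequal halves
print(len(test), len(val))               # 4 3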
Example #6
    def run(self):
        # Loading data and building vocabulary.
        data_df = self.load_data()

        X = data_df[self.sequence_cols]
        Y = data_df[self.score_col]

        self.x_train, self.x_val, self.y_train, self.y_val = split_data(
            X, Y, test_size=self.instances, shuffle=False)

        # Split to lists
        self.x_train = [self.x_train[column] for column in self.sequence_cols]
        self.x_val = [self.x_val[column] for column in self.sequence_cols]

        # Convert labels to their numpy representations
        self.y_train = self.y_train.values
        self.y_val = self.y_val.values

        # Padding Sequences.
        self.pad_sequences()
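Because the call above passes shuffle=False with an integer test_size, the last self.instances rows become the validation set in their original order, which matters if the file is ordered (e.g. by time); a sketch under the same train_test_split assumption:

# Assumption: split_data aliases sklearn.model_selection.train_test_split.
from sklearn.model_selection import train_test_split as split_data
import numpy as np

rows = np.arange(10)
train, val = split_data(rows, test_size=3, shuffle=False)
print(train)  # [0 1 2 3 4 5 6]
print(val)    # [7 8 9]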
Example #7
    def run(self):
        print('Loading data and building vocabulary.')
        train_df = self.load_sick()

        # Split into train and validation sets
        validation_size = int(len(train_df) * 0.2)
        training_size = len(train_df) - validation_size

        X = train_df[self.sentence_cols]
        Y = train_df['relatedness_score']

        self.x_train, self.x_val, self.y_train, self.y_val = split_data(
            X, Y, test_size=validation_size)

        # Split to lists
        self.x_train = [self.x_train.sentence_A, self.x_train.sentence_B]
        self.x_val = [self.x_val.sentence_A, self.x_val.sentence_B]

        # Convert labels to their numpy representations
        self.y_train = self.y_train.values
        self.y_val = self.y_val.values

        print('Padding Sequences.')
        self.pad_sequences()
Example #8
    def create_data(self):
        # Loading data and building vocabulary.
        data_df = self.read_data()
        X = data_df[self.final_name]
        Y = data_df[self.score_col]

        self.question_index = []
        self.x_train, self.x_val_temp, self.y_train, self.y_val_temp = split_data(
            X,
            Y,
            train_size=self.train_ratio,
            stratify=Y,
            random_state=self.seed)
        self.x_test, self.x_val, self.y_test, self.y_val = split_data(
            self.x_val_temp,
            self.y_val_temp,
            train_size=self.test_val_ratio,
            stratify=self.y_val_temp,
            random_state=self.seed)

        # Convert labels to their numpy representations
        self.y_train = self.y_train.values
        self.y_val = self.y_val.values
        self.y_test = self.y_test.values

        bins = np.bincount(Y)
        class_bin = []
        for i in range(0, self.class_number):
            class_bin.append(bins[i])

        self.final_weights = []
        if self.weight_policy == 0:
            max_number = 1
            for i in range(0, self.class_number):
                self.final_weights.append(float(max_number) / class_bin[i])

        elif self.weight_policy == 1:
            max_number = max(class_bin)
            for i in range(0, self.class_number):
                self.final_weights.append(float(max_number) / class_bin[i])

        elif self.weight_policy == 2:
            n_samples = Y.count()
            n_classes = bins.shape[0]
            for i in range(0, self.class_number):  # weight every class, as in the other policies
                self.final_weights.append(
                    float(n_samples) / (n_classes * class_bin[i]))

        features = self.read_features(data=0)
        self.x_train = features[0]
        self.y_train = features[1]

        features = self.read_features(data=1)
        self.x_test = features[0]
        self.y_test = features[1]

        features = self.read_features(data=2)
        self.x_val = features[0]
        self.y_val = features[1]

        assert len(self.x_train) == len(self.y_train)
        assert len(self.x_val) == len(self.y_val)
        assert len(self.x_test) == len(self.y_test)

        self.to_pytorch_tensors()
        return
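The weight_policy == 2 branch above reproduces scikit-learn's 'balanced' class-weight heuristic, n_samples / (n_classes * count_per_class); assuming integer class ids starting at 0, the built-in gives the same numbers:

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0, 0, 0, 1, 1, 2])  # per-class counts: 3, 2, 1
weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
print(weights)  # ~[0.667 1. 2.], i.e. 6 / (3 * count)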