def _split(self):
    """Partition the symptom data and disease labels into training (70%),
    validation (15%), and test (15%) subsets."""
    # First pass: carve off 70% for training; the remaining 30% is a
    # shuffled holdout that will be divided again below.
    first_split = split_data(
        self.symptoms_data, self.disease_data, shuffle=True,
        train_size=.7, test_size=.3)
    self.train, holdout, self.train_labels, holdout_labels = first_split
    # Second pass: halve the holdout into validation and test sets.
    second_split = split_data(
        holdout, holdout_labels, shuffle=True, train_size=.5, test_size=.5)
    self.valid, self.test, self.valid_labels, self.test_labels = second_split
def run(self):
    """Load the Quora duplicate-question data, hold out a validation set,
    and pad the resulting sequences.

    Side effects: sets ``self.x_train`` / ``self.x_val`` (two-element lists
    of the question columns), ``self.y_train`` / ``self.y_val`` (numpy
    label arrays), then calls ``self.pad_sequences()``.
    """
    print('Loading data and building vocabulary.')
    train_df = self.load_quora()

    # Hold out a fixed number of rows for validation.
    # (The previously computed `training_size` was never used — removed.)
    validation_size = 2000
    X = train_df[self.questions_cols]
    Y = train_df['is_duplicate']
    self.x_train, self.x_val, self.y_train, self.y_val = split_data(
        X, Y, test_size=validation_size)

    # Split each frame into its two question columns.
    self.x_train = [self.x_train.question1, self.x_train.question2]
    self.x_val = [self.x_val.question1, self.x_val.question2]

    # Convert labels to their numpy representations.
    self.y_train = self.y_train.values
    self.y_val = self.y_val.values

    print('Padding Sequences.')
    self.pad_sequences()
def run(self):
    """Load the data, split train/validation, drop pairs where either
    sequence is empty, and hand the result to ``convert_to_tensors()``.

    Side effects: ``self.x_train`` / ``self.x_val`` become lists of
    ``[sequence_1, sequence_2]`` pairs; ``self.y_train`` / ``self.y_val``
    become lists of float scores of the same length.
    """
    # Loading data and building vocabulary.
    # (The previously computed `data_size` local was never used — removed.)
    data_df = self.load_data()
    X = data_df[self.sequence_cols]
    Y = data_df[self.score_col]
    self.x_train, self.x_val, self.y_train, self.y_val = split_data(
        X, Y, train_size=self.train_ratio)

    # Convert labels to their numpy representations.
    self.y_train = self.y_train.values
    self.y_val = self.y_val.values

    col_a, col_b = self.sequence_cols[0], self.sequence_cols[1]

    def collect_pairs(frame, labels):
        """Return ``(pairs, scores)`` keeping only rows where both
        sequences are non-empty.  Iterating rows and labels with ``zip``
        keeps them aligned by position, replacing the original fragile
        manual index counter into the label array."""
        pairs, scores = [], []
        for (_, row), label in zip(frame.iterrows(), labels):
            sequence_1 = row[col_a]
            sequence_2 = row[col_b]
            if len(sequence_1) > 0 and len(sequence_2) > 0:
                pairs.append([sequence_1, sequence_2])
                scores.append(float(label))
        return pairs, scores

    self.x_train, self.y_train = collect_pairs(self.x_train, self.y_train)
    print('Number of Training Positive Samples :', sum(self.y_train))
    print('Number of Training Negative Samples :',
          len(self.y_train) - sum(self.y_train))

    self.x_val, self.y_val = collect_pairs(self.x_val, self.y_val)
    print('Number of Validation Positive Samples :', sum(self.y_val))
    print('Number of Validation Negative Samples :',
          len(self.y_val) - sum(self.y_val))

    assert len(self.x_train) == len(self.y_train)
    assert len(self.x_val) == len(self.y_val)
    self.convert_to_tensors()
def split_data(self, data_df):
    """Split ``data_df`` into train/validation features and labels.

    Parameters:
        data_df: frame containing ``self.sequence_cols`` feature columns
            and the ``self.score_col`` label column.

    Side effects: sets ``self.x_train`` / ``self.x_val`` (one list entry
    per sequence column) and ``self.y_train`` / ``self.y_val`` (numpy
    arrays).
    """
    # (The previously computed `data_size` local was never used — removed.)
    X = data_df[self.sequence_cols]
    Y = data_df[self.score_col]
    # NOTE: the unqualified call below resolves to the module-level
    # `split_data` helper, not this method (this method needs `self`).
    self.x_train, self.x_val, self.y_train, self.y_val = split_data(
        X, Y, train_size=self.train_ratio)
    # One list entry per sequence column.
    self.x_train = [self.x_train[column] for column in self.sequence_cols]
    self.x_val = [self.x_val[column] for column in self.sequence_cols]
    # Convert labels to their numpy representations.
    self.y_train = self.y_train.values
    self.y_val = self.y_val.values
def __init__(self, train_test_split=70, filename="data.csv", norm=True):
    """Load a labelled CSV (label in column 0, features after), split it
    into train/validation/test, and optionally normalize the inputs.

    Parameters:
        train_test_split: percentage passed (as a fraction) to the
            splitter's ``test_size`` argument.
        filename: CSV file to load.
        norm: when True, call ``self.input_normalize()`` at the end.
    """
    self.hp = {
        "num_filt_1": 8,     # Number of filters in first conv layer
        "num_filt_2": 4,
        "num_filt_3": 2,
        "num_fc_1": 12,      # number of neurons in fully connected layer
        "max_iterations": 500,
        "batch_size": 16,
        "dropout": 0.70,
        "learning_rate": 2e-5,
        "input_norm": False,
    }
    # BUG FIX: the `filename` argument was previously ignored — the
    # loader always read the hard-coded "data.csv".
    self.__filename = filename
    self.__ttsplit = train_test_split * 0.01
    self.norm = norm

    # Loading data.
    self.data = np.loadtxt(self.__filename, delimiter=',')
    # NOTE(review): __ttsplit is passed as *test_size*, so the default of
    # 70 leaves only 30% of rows in data_train — confirm this matches the
    # intended meaning of `train_test_split` before changing it.
    self.data_train, self.data_test_val = split_data(
        self.data, test_size=self.__ttsplit)
    # Halve the holdout into test and validation partitions.
    self.data_test, self.data_val = np.array_split(self.data_test_val, 2)

    # Column 0 is the label; the remaining columns are features.
    self.X_train = self.data_train[:, 1:]
    self.X_val = self.data_val[:, 1:]
    self.X_test = self.data_test[:, 1:]
    self.__N = self.X_train.shape[0]
    print("N: " + str(self.__N))
    self.__D = self.X_train.shape[1]
    self.y_train = self.data_train[:, 0]
    self.y_val = self.data_val[:, 0]
    self.y_test = self.data_test[:, 0]
    print("We have %s observations with %s dimensions" %
          (self.__N, self.__D))
    self.num_classes = len(np.unique(self.y_train))

    # Shift labels so the smallest class id is 0 (checks if labels are
    # zero based).
    base = np.min(self.y_train)
    if base != 0:
        self.y_train -= base
        self.y_val -= base
        self.y_test -= base
    if self.norm:
        self.input_normalize()
def run(self):
    """Load the data, split off the last ``self.instances`` rows for
    validation (``shuffle=False`` preserves the original order), and pad
    the resulting sequences."""
    # Loading data and building vocabulary.
    frame = self.load_data()
    features = frame[self.sequence_cols]
    targets = frame[self.score_col]
    split_result = split_data(
        features, targets, test_size=self.instances, shuffle=False)
    self.x_train, self.x_val, self.y_train, self.y_val = split_result

    # Split to lists: one entry per sequence column.
    self.x_train = [self.x_train[col] for col in self.sequence_cols]
    self.x_val = [self.x_val[col] for col in self.sequence_cols]

    # Convert labels to their numpy representations.
    self.y_train = self.y_train.values
    self.y_val = self.y_val.values

    # Padding Sequences.
    self.pad_sequences()
def run(self):
    """Load the SICK data, hold out 20% of rows for validation, and pad
    the resulting sequences.

    Side effects: sets ``self.x_train`` / ``self.x_val`` (two-element
    lists of the sentence columns), ``self.y_train`` / ``self.y_val``
    (numpy relatedness-score arrays), then calls ``self.pad_sequences()``.
    """
    print('Loading data and building vocabulary.')
    train_df = self.load_sick()

    # Hold out 20% of the rows for validation.
    # (The previously computed `training_size` was never used — removed.)
    validation_size = int(len(train_df) * 0.2)
    X = train_df[self.sentence_cols]
    Y = train_df['relatedness_score']
    self.x_train, self.x_val, self.y_train, self.y_val = split_data(
        X, Y, test_size=validation_size)

    # Split each frame into its two sentence columns.
    self.x_train = [self.x_train.sentence_A, self.x_train.sentence_B]
    self.x_val = [self.x_val.sentence_A, self.x_val.sentence_B]

    # Convert labels to their numpy representations.
    self.y_train = self.y_train.values
    self.y_val = self.y_val.values

    print('Padding Sequences.')
    self.pad_sequences()
def create_data(self):
    """Build stratified train/val/test splits, compute per-class weights
    according to ``self.weight_policy``, load pre-extracted features, and
    convert everything to PyTorch tensors.

    Side effects: sets x/y train, val, and test attributes,
    ``self.question_index``, and ``self.final_weights``.
    """
    # Loading data and building vocabulary.
    data_df = self.read_data()
    X = data_df[self.final_name]
    Y = data_df[self.score_col]
    self.question_index = []
    # First stratified split: train vs. (val+test), reproducible via seed.
    self.x_train, self.x_val_temp, self.y_train, self.y_val_temp = split_data(
        X, Y, train_size=self.train_ratio, stratify=Y, random_state=self.seed)
    # Second stratified split of the holdout: test vs. validation.
    self.x_test, self.x_val, self.y_test, self.y_val = split_data(
        self.x_val_temp, self.y_val_temp, train_size=self.test_val_ratio,
        stratify=self.y_val_temp, random_state=self.seed)
    # Convert labels to their numpy representations
    self.y_train = self.y_train.values
    self.y_val = self.y_val.values
    self.y_test = self.y_test.values
    # Per-class sample counts over the FULL label column (not just train).
    bins = np.bincount(Y)
    class_bin = []
    for i in range(0, self.class_number):
        class_bin.append(bins[i])
    # Class weights; larger weight for rarer classes under every policy.
    self.final_weights = []
    if self.weight_policy == 0:
        # Inverse frequency: weight_i = 1 / count_i.
        max_number = 1
        for i in range(0, self.class_number):
            self.final_weights.append(float(max_number) / class_bin[i])
    elif self.weight_policy == 1:
        # Normalized to the majority class: weight_i = max_count / count_i.
        max_number = max(class_bin)
        for i in range(0, self.class_number):
            self.final_weights.append(float(max_number) / class_bin[i])
    elif self.weight_policy == 2:
        # sklearn-style "balanced": n_samples / (n_classes * count_i).
        n_samples = Y.count()
        n_classes = bins.shape[0]
        # NOTE(review): this loop stops at class_number - 1, unlike
        # policies 0 and 1 which cover all classes — confirm whether the
        # last class is intentionally left unweighted.
        for i in range(0, self.class_number - 1):
            self.final_weights.append(
                float(n_samples) / (n_classes * class_bin[i]))
    # The split features/labels above are replaced by pre-extracted
    # features; data=0/1/2 presumably select train/test/val — TODO confirm
    # against read_features.
    features = self.read_features(data=0)
    self.x_train = features[0]
    self.y_train = features[1]
    features = self.read_features(data=1)
    self.x_test = features[0]
    self.y_test = features[1]
    features = self.read_features(data=2)
    self.x_val = features[0]
    self.y_val = features[1]
    assert len(self.x_train) == len(self.y_train)
    assert len(self.x_val) == len(self.y_val)
    assert len(self.x_test) == len(self.y_test)
    self.to_pytorch_tensors()
    return