def split_data(self, dataset, y_label, train_size=.8, random_state=0, shuffle=True):
    """
    Split data into train and test sets, with appropriate imports for
    different compute modes.

    CPU compute - Uses sklearn; we manually filter the y_label column in the split call.
    GPU compute - Single GPU uses cuML and multi-GPU uses Dask; both split y_label internally.

    Parameters
    ----------
    dataset : dataframe
        The dataframe on which we wish to perform the split.
    y_label : string
        The name of the label column (not the series itself).
    train_size : float
        The fraction of data used for training. Takes values between 0 and 1.
    random_state : int
        Useful for running reproducible splits.
    shuffle : bool
        Specifies whether the data must be shuffled before splitting.

    Returns
    -------
    X_train : dataframe
        The data to be used for training. Has the same type as the input dataset.
    X_test : dataframe
        The data to be used for testing. Has the same type as the input dataset.
    y_train : dataframe
        The labels to be used for training. Has the same type as the input dataset.
    y_test : dataframe
        The labels to be used for testing. Has the same type as the input dataset.
    duration : float
        The time it took to perform the split.
    """
    self.log_to_file('\n> Splitting train and test data')
    with PerfTimer() as split_timer:
        if 'CPU' in self.compute_type:
            X_train, X_test, y_train, y_test = sklearn_train_test_split(
                dataset.loc[:, dataset.columns != y_label],
                dataset[y_label],
                train_size=train_size,
                shuffle=shuffle,
                random_state=random_state)
        elif 'GPU' in self.compute_type:
            if 'single' in self.compute_type:
                X_train, X_test, y_train, y_test = cuml_train_test_split(
                    X=dataset,
                    y=y_label,
                    train_size=train_size,
                    shuffle=shuffle,
                    random_state=random_state)
            elif 'multi' in self.compute_type:
                X_train, X_test, y_train, y_test = dask_train_test_split(
                    dataset,
                    y_label,
                    train_size=train_size,
                    shuffle=False,  # shuffle not available for dask_cudf yet
                    random_state=random_state)

    self.log_to_file(f'\n\tX_train shape and type: {X_train.shape} {type(X_train)}')
    self.log_to_file(f'\n\tSplit completed in {split_timer.duration}')
    return X_train, X_test, y_train, y_test, split_timer.duration
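# Every snippet below calls the split functions under an alias. A minimal
# sketch of the imports the wrapper above relies on: the sklearn line is the
# standard alias; the cuML and Dask lines are assumptions that depend on a
# RAPIDS installation and on which dask helper the project actually uses.
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from cuml.model_selection import train_test_split as cuml_train_test_split
from dask_ml.model_selection import train_test_split as dask_train_test_split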
def split_data(self, dataset, y_label, train_size=.8, random_state=0, shuffle=True):
    """
    Split the dataset into train and test subsets.

    NOTE: assumes the first column of the dataset holds the classification labels
    ! in the case of sklearn, we manually filter this column in the split call
    ! in the case of cuml, the filtering happens internally
    """
    self.log_to_file('\tsplitting train and test data')
    with PerfTimer() as split_timer:
        if 'CPU' in self.compute_type:
            X_train, X_test, y_train, y_test = sklearn_train_test_split(
                dataset.loc[:, dataset.columns != y_label],
                dataset[y_label],
                train_size=train_size,
                shuffle=shuffle,
                random_state=random_state)
        elif 'GPU' in self.compute_type:
            X_train, X_test, y_train, y_test = cuml_train_test_split(
                X=dataset,
                y=y_label,
                train_size=train_size,
                shuffle=shuffle,
                random_state=random_state)

    self.log_to_file(f'\t> split completed in {split_timer.duration}')
    return X_train, X_test, y_train, y_test, split_timer.duration
def split_data(self, dataset, y_label, train_size=.8, random_state=0, shuffle=True):
    self.log_to_file('\n\t splitting train and test data')
    with PerfTimer() as split_timer:
        if 'CPU' in self.compute_type:
            # exclude the label column from the features; passing the full
            # dataset here would leak y_label into X
            X_train, X_test, y_train, y_test = sklearn_train_test_split(
                dataset.loc[:, dataset.columns != y_label],
                dataset[y_label],
                train_size=train_size,
                shuffle=shuffle,
                random_state=random_state)
        elif 'GPU' in self.compute_type:
            X_train, X_test, y_train, y_test = cuml_train_test_split(
                X=dataset,
                y=y_label,
                train_size=train_size,
                shuffle=shuffle,
                random_state=random_state)

    self.log_to_file(f'\t split completed in {split_timer.duration}')
    return X_train, X_test, y_train, y_test, split_timer.duration
def validation_split(self, annotations_df):
    """Takes an annotations DataFrame and returns the indexes that correspond
    to the validation and training data.

    Parameters
    ----------
    annotations_df : pandas.DataFrame
        DataFrame with data annotations.

    Returns
    -------
    train_idx : list of int
        Indexes of the dataframe that correspond to the training set.
    val_idx : list of int
        Indexes of the dataframe that correspond to the validation set.
        These indexes only point to ISIC2020 data.
    """
    isic_2020_df = annotations_df[annotations_df.source == "ISIC20"]
    external_data = annotations_df[annotations_df.source != "ISIC20"]
    indexes = isic_2020_df.index.tolist()
    train_idx, val_idx = sklearn_train_test_split(
        indexes,
        stratify=isic_2020_df.target,
        test_size=self.validation_fraction,
        shuffle=True,
        random_state=1234,
    )
    # external (non-ISIC20) data is only ever used for training
    train_idx += external_data.index.tolist()
    return train_idx, val_idx
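# A self-contained sketch of the stratified index split above, on a toy
# annotations frame (the enclosing class is not shown, so sklearn is called
# directly and the validation fraction is a literal):
import pandas as pd
from sklearn.model_selection import train_test_split as sklearn_train_test_split

annotations_df = pd.DataFrame({
    "source": ["ISIC20"] * 8 + ["external"] * 4,
    "target": [0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1],
})
isic = annotations_df[annotations_df.source == "ISIC20"]
train_idx, val_idx = sklearn_train_test_split(
    isic.index.tolist(),
    stratify=isic.target,
    test_size=0.25,
    shuffle=True,
    random_state=1234,
)
train_idx += annotations_df[annotations_df.source != "ISIC20"].index.tolist()
# val_idx points only at ISIC20 rows; external rows always land in training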
def train_test_split(df, frac):
    """
    df is a dataframe.
    A train/test split function for a dataframe; returns both the
    training and testing sets. frac refers to the percent of data you
    would like to set aside for training.
    """
    train, val = sklearn_train_test_split(df, train_size=frac, random_state=42)
    return train, val
def train_test_split(labeled_sections):
    # split people: hold out 10% of speakers entirely
    speakers = labeled_sections.original_spk.unique()
    train_speakers, val_speakers = sklearn_train_test_split(speakers, test_size=0.1)

    # split samples: additionally hold out 15% of the remaining speakers' rows
    train_samples, val_samples = sklearn_train_test_split(
        labeled_sections[labeled_sections.original_spk.isin(train_speakers)],
        test_size=0.15)
    # membership must be tested against the sample ids, not the DataFrame
    # itself (`x in df` checks column names, not row contents)
    val_sample_ids = set(val_samples['sample'])

    def label_train_val(row):
        if row['original_spk'] in val_speakers or row['sample'] in val_sample_ids:
            return "val"
        return "train"

    # labels the frame in place via a new 'set' column
    labeled_sections['set'] = labeled_sections.apply(label_train_val, axis=1)
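# Toy demonstration of the speaker-aware labeling above (hypothetical frame;
# column names follow the snippet, and the aliased sklearn import is assumed
# to be in scope):
import pandas as pd

labeled_sections = pd.DataFrame({
    'original_spk': ['a', 'a', 'b', 'b', 'c', 'c', 'd', 'd', 'e', 'e'],
    'sample': list(range(10)),
})
train_test_split(labeled_sections)
print(labeled_sections['set'].value_counts())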
def train_test_split(dataset, test_ratio, shuffle=False, transform=None,
                     test_transform=None, random_state=None):
    if isinstance(transform, Compose):
        transform = InputTransform(transform)
    if isinstance(test_transform, Compose):
        test_transform = InputTransform(test_transform)
    n = len(dataset)
    train_indices, test_indices = sklearn_train_test_split(
        list(range(n)), test_size=test_ratio, shuffle=shuffle,
        random_state=random_state)
    ds_train = Subset(dataset, train_indices, transform)
    ds_test = Subset(dataset, test_indices, test_transform)
    return ds_train, ds_test
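# torch.utils.data.Subset takes no transform argument, so the function above
# must rely on a custom Subset. A minimal sketch of what such a class could
# look like, assuming (x, y) items; this is an assumption, not the project's
# actual implementation:
from torch.utils.data import Dataset

class Subset(Dataset):
    def __init__(self, dataset, indices, transform=None):
        self.dataset = dataset
        self.indices = indices
        self.transform = transform

    def __getitem__(self, idx):
        x, y = self.dataset[self.indices[idx]]
        if self.transform is not None:
            x = self.transform(x)
        return x, y

    def __len__(self):
        return len(self.indices)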
def split_rows(data_matrix, train_size=0.75, columnToSplitOn='pat_id',
               random_state=0):
    # Alternative: sklearn.model_selection.GroupShuffleSplit could produce the
    # same grouped split directly on row indices; here we instead split the
    # unique group ids and filter rows by membership.
    all_possible_ids = sorted(set(data_matrix[columnToSplitOn].values.tolist()))
    train_ids, test_ids = sklearn_train_test_split(
        all_possible_ids, test_size=1. - train_size, random_state=random_state)

    data_matrix_train = data_matrix[data_matrix[columnToSplitOn].isin(train_ids)].copy()
    data_matrix_evalu = data_matrix[data_matrix[columnToSplitOn].isin(test_ids)].copy()

    # sanity checks: no patient appears in both sets and no rows are lost
    pat_ids_train = data_matrix_train['pat_id'].values.tolist()
    pat_ids_evalu = data_matrix_evalu['pat_id'].values.tolist()
    assert (set(pat_ids_train) & set(pat_ids_evalu)) == set()
    assert data_matrix_train.shape[0] + data_matrix_evalu.shape[0] == data_matrix.shape[0]

    return data_matrix_train, data_matrix_evalu
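# The GroupShuffleSplit alternative referenced in the comment above, as a
# self-contained sketch on a toy frame (column names mirror the snippet):
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

data_matrix = pd.DataFrame({
    'pat_id': [1, 1, 2, 2, 3, 3, 4, 4],
    'value':  [10, 11, 20, 21, 30, 31, 40, 41],
})
train_inds, evalu_inds = next(
    GroupShuffleSplit(n_splits=2, test_size=0.25, random_state=0)
    .split(data_matrix, groups=data_matrix['pat_id']))
data_matrix_train = data_matrix.iloc[train_inds]
data_matrix_evalu = data_matrix.iloc[evalu_inds]
# every pat_id lands wholly in one side, mirroring split_rows' guarantee
assert set(data_matrix_train.pat_id) & set(data_matrix_evalu.pat_id) == set()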
def validation_split(self, annotations_df):
    """Takes an annotations DataFrame and returns the indexes that correspond
    to the validation and training data.

    Parameters
    ----------
    annotations_df : pandas.DataFrame
        DataFrame with data annotations.

    Returns
    -------
    train_idx : list of int
        Indexes of the dataframe that correspond to the training set.
    val_idx : list of int
        Indexes of the dataframe that correspond to the validation set.
    """
    indexes = list(range(len(annotations_df)))
    train_indexes, val_indexes = sklearn_train_test_split(
        indexes, test_size=self.validation_fraction, random_state=1234)
    return list(train_indexes), list(val_indexes)
def train_test_split(df, x_cols, y_col, test_percent, isStandardize=False,
                     isNormalize=False):
    df_X = df[x_cols]
    df_y = df[y_col]
    X_train, X_test, y_train, y_test = sklearn_train_test_split(
        df_X, df_y, test_size=test_percent, random_state=None)
    # apply at most one rescaling; standardization takes precedence if both
    # flags are set (matching the original early-return behavior)
    if isStandardize:
        X_train = standardize(X_train)
        X_test = standardize(X_test)
    elif isNormalize:
        X_train = normalize(X_train)
        X_test = normalize(X_test)
    return X_train, y_train, X_test, y_test
def train_test_split(examples, labels, test_size=0.1, verbose=0):
    if verbose:
        print("Train/Test split ")
        print(100 - test_size * 100, "% of training data")
        print(test_size * 100, "% of testing data")

    # split data into train and test sets; use the test_size parameter
    # (it was previously hardcoded to 0.1 in this call)
    train_examples, test_examples, train_labels, test_labels = sklearn_train_test_split(
        examples, labels, test_size=test_size, random_state=42, shuffle=True)

    # user and item ids are the first two columns of each example
    train_users = train_examples[:, 0]
    test_users = test_examples[:, 0]
    train_items = train_examples[:, 1]
    test_items = test_examples[:, 1]

    # final training and test sets
    x_train = np.array(list(zip(train_users, train_items)))
    x_test = np.array(list(zip(test_users, test_items)))
    y_train = train_labels
    y_test = test_labels

    if verbose:
        print()
        print('number of training examples : ', x_train.shape)
        print('number of training labels : ', y_train.shape)
        print('number of test examples : ', x_test.shape)
        print('number of test labels : ', y_test.shape)

    return (x_train, x_test), (y_train, y_test)
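# Hypothetical usage of the wrapper above with a toy interaction array
# (columns: user id, item id; assumes numpy and the aliased sklearn import
# are in scope):
import numpy as np

examples = np.array([[0, 0], [0, 1], [1, 0], [1, 2], [2, 1],
                     [2, 2], [3, 0], [3, 2], [4, 1], [4, 0]])
labels = np.array([5, 3, 4, 1, 2, 5, 3, 4, 1, 2])
(x_train, x_test), (y_train, y_test) = train_test_split(
    examples, labels, test_size=0.2, verbose=1)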
def _data_preprocess(self, inference=False, infer_dataset=None, validate=False,
                     transform_input="standardize", transform_output="standardize",
                     test_size=0.1, shuffle=False, kwargs={}):
    if not inference:
        np_dataset = self._dataset.data_to_numpy()
        data_column_names = self._dataset.data_columns
    else:
        np_dataset = infer_dataset.data_to_numpy()
        if not validate:
            data_column_names = [c[0] for c in infer_dataset.data_columns]
        else:
            data_column_names = infer_dataset.data_columns

    self.input_data_continuous = []
    self.input_data_categorical = []
    self.input_data_descriptors = []
    self.output_data = []
    if not inference:
        self.data_transformation_dict = {}

    # this loop makes sure that the inputs are always in the same order and only
    # data with the same column names as in the domain is considered
    for v in self._domain.variables:
        v_in_dataset = False
        for i, c_name in enumerate(data_column_names):
            if c_name == v.name:
                v_in_dataset = True
                if not v.is_objective:
                    if v.variable_type == "continuous":
                        # standardize continuous inputs
                        tmp_cont_inp = np.asarray(np_dataset[:, i], dtype=float)
                        if not inference:
                            tmp_cont_inp, _reduce, _divide = self._transform_data(
                                data=tmp_cont_inp,
                                transformation_type=transform_input)
                            self.data_transformation_dict[v.name] = [_reduce, _divide]
                        else:
                            tmp_cont_inp, _, _ = self._transform_data(
                                data=tmp_cont_inp,
                                reduce=self.data_transformation_dict[v.name][0],
                                divide=self.data_transformation_dict[v.name][1])
                        self.input_data_continuous.append(tmp_cont_inp)
                    elif v.variable_type == "descriptors" or (
                            v.variable_type == "categorical"
                            and self._cat_to_descr == True):
                        tmp_descr_inp = []
                        for ent in np_dataset[:, i]:
                            tmp_descr_inp.append(v.ds.loc[[ent], :].values[0].tolist())
                        tmp_descr_inp = np.asarray(tmp_descr_inp)
                        # use j here: reusing i would shadow the enumerate index
                        for j in range(len(tmp_descr_inp[0])):
                            if not inference:
                                tmp_descr_inp[:, j], _reduce, _divide = self._transform_data(
                                    data=tmp_descr_inp[:, j],
                                    transformation_type=transform_input)
                                self.data_transformation_dict[v.ds.data_columns[j]] = [_reduce, _divide]
                            else:
                                tmp_descr_inp[:, j], _, _ = self._transform_data(
                                    data=tmp_descr_inp[:, j],
                                    reduce=self.data_transformation_dict[v.ds.data_columns[j]][0],
                                    divide=self.data_transformation_dict[v.ds.data_columns[j]][1])
                        self.input_data_descriptors.append(np.asarray(tmp_descr_inp))
                    elif v.variable_type == "categorical":
                        # create one-hot tensor for categorical inputs
                        one_hot_enc = sklearn.preprocessing.OneHotEncoder(
                            categories=[v.levels])
                        tmp_disc_inp_one_hot = one_hot_enc.fit_transform(
                            np_dataset[:, i].reshape(-1, 1)).toarray()
                        self.input_data_categorical.append(
                            np.asarray(tmp_disc_inp_one_hot))
                    else:
                        raise TypeError(
                            "Unknown variable type: {}.".format(v.variable_type))
                elif not inference:
                    if v.variable_type == "continuous":
                        tmp_cont_out = np.asarray(np_dataset[:, i], dtype=float)
                        tmp_cont_out, _reduce, _divide = self._transform_data(
                            data=tmp_cont_out,
                            transformation_type=transform_output)
                        self.data_transformation_dict[v.name] = [_reduce, _divide]
                        self.output_data.append(tmp_cont_out)
                    elif v.variable_type == "categorical":
                        raise TypeError(
                            "{} is a categorical variable. Regressor not trainable "
                            "for categorical outputs.".format(v.name))
                    elif v.variable_type == "descriptors":
                        raise TypeError(
                            "{} is a descriptor variable. Regressor not trainable "
                            "for descriptor outputs.".format(v.name))
                    else:
                        raise TypeError(
                            "Unknown variable type: {}.".format(v.variable_type))
            elif inference and v.is_objective:
                # objectives need not be present in the dataset at inference time
                v_in_dataset = True
        if v_in_dataset == False:
            raise ValueError(
                "Variable {} defined in the domain is missing in the given "
                "dataset.".format(v.name))

    self.input_data_continuous = np.asarray(self.input_data_continuous).transpose()
    if len(self.input_data_categorical) != 0:
        self.input_data_categorical = np.concatenate(
            [one_hot for one_hot in self.input_data_categorical], axis=1)
    if len(self.input_data_descriptors) != 0:
        self.input_data_descriptors = np.concatenate(
            [d for d in self.input_data_descriptors], axis=1)
    self.output_data = np.asarray(self.output_data).transpose()

    # set up training and test data
    if not inference:
        final_np_dataset = np.concatenate([
            inp for inp in [
                self.input_data_continuous, self.input_data_descriptors,
                self.input_data_categorical, self.output_data
            ] if len(inp) != 0
        ], axis=1)
        X, y = final_np_dataset[:, :-self.output_dim], final_np_dataset[:, -self.output_dim:]
        X_train, X_test, y_train, y_test = sklearn_train_test_split(
            X, y, test_size=test_size, shuffle=shuffle)
        return [X_train.astype(dtype=float), y_train.astype(dtype=float)], \
               [X_test.astype(dtype=float), y_test.astype(dtype=float)]
    else:
        X = np.concatenate([
            inp for inp in [
                self.input_data_continuous, self.input_data_descriptors,
                self.input_data_categorical
            ] if len(inp) != 0
        ], axis=1)
        return X.astype(dtype=float)
def preprocessing(data_df, is_training):
    ### Create embedding matrix
    def text_wordlist(text):
        """
        Pre-process and convert a text to a list of words.
        :param text: text (sentence)
        :return: list of words
        """
        text = str(text)
        text = text.lower()
        # Clean the text
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = text.split()
        return text

    ### Prepare embedding
    vocab = dict()  # word vocabulary
    inverse_vocab = ["<unk>"]
    word_to_vec = utils.WORD_2_VECTOR
    que_cols = ["question1", "question2"]

    def iter_over_df(df, que_cols, text_wordlist, stops, word_to_vec, vocab,
                     inverse_vocab):
        """
        Iterate over the questions and replace each one with its list of
        vocabulary indices.
        :param df: dataframe
        :return: dataframe
        """
        for index, row in df.iterrows():
            for que in que_cols:
                que_to_vec = []
                for word in text_wordlist(row[que]):
                    # skip unwanted words
                    if word in stops and word not in word_to_vec.vocab:
                        continue
                    if word not in vocab:
                        vocab[word] = len(inverse_vocab)
                        que_to_vec.append(len(inverse_vocab))
                        inverse_vocab.append(word)
                    else:
                        que_to_vec.append(vocab[word])
                # DataFrame.set_value was removed from pandas; .at is the
                # supported replacement for scalar cell assignment
                df.at[index, que] = que_to_vec
        return df

    stops = set(utils.STOPWORDS)
    data_df = iter_over_df(data_df, que_cols, text_wordlist, stops,
                           word_to_vec, vocab, inverse_vocab)

    ### Build the embedding matrix
    def build_embed_matr(vocab, word_to_vec, embed_dim):
        embed_mat = 1 * np.random.randn(len(vocab) + 1, embed_dim)
        embed_mat[0] = 0  # to ignore padding
        for word, index in vocab.items():
            if word in word_to_vec.vocab:
                embed_mat[index] = word_to_vec.word_vec(word)
        return embed_mat

    ### Zero padding
    def zero_padding_train(*args, Y_train=None, max_seq_len):
        data = [*args]
        for dataset, side in itertools.product(data, ["left", "right"]):
            dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_len)
        if is_training:
            assert X_train["left"].shape == X_train["right"].shape
            assert len(X_train["left"]) == len(Y_train)
        return data

    # Split into train and validation sets
    if is_training:
        EMBEDDING_MATRIX = build_embed_matr(vocab, word_to_vec, utils.EMBEDDING_DIM)
        validation_size = int(0.1 * data_df.shape[0])
        X = data_df[que_cols]
        Y = data_df["is_duplicate"]
        X_train, X_val, Y_train, Y_val = sklearn_train_test_split(
            X, Y, test_size=validation_size)
        del X, Y, validation_size, data_df, que_cols
        X_train = {"left": X_train.question1, "right": X_train.question2}
        X_val = {"left": X_val.question1, "right": X_val.question2}
        Y_train = Y_train.values
        Y_val = Y_val.values
        X_TRAIN, X_VAL = zero_padding_train(X_train, X_val, Y_train=Y_train,
                                            max_seq_len=utils.MAX_SEQ_LEN)
        Y_TRAIN = Y_train
        Y_VAL = Y_val
        del X_train, X_val, Y_train, Y_val
        return X_TRAIN, X_VAL, Y_TRAIN, Y_VAL, EMBEDDING_MATRIX
    else:
        data_df = {"left": data_df.question1, "right": data_df.question2}
        return zero_padding_train(data_df, Y_train=None,
                                  max_seq_len=utils.MAX_SEQ_LEN)[0]
def train_test_split(X, y, test_size, random_state=None):
    # NB: a default of random.randint(1, 1000) in the signature would be
    # evaluated only once, at function definition, so every call would reuse
    # the same "random" seed; draw a fresh one per call instead
    if random_state is None:
        random_state = random.randint(1, 1000)
    return sklearn_train_test_split(X, y, test_size=test_size,
                                    random_state=random_state)
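# Hypothetical usage of the thin wrapper above:
import random
import numpy as np
from sklearn.model_selection import train_test_split as sklearn_train_test_split

X = np.arange(20).reshape(10, 2)
y = np.arange(10)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)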