def multi_label_split(
        X: np.ndarray, y: np.ndarray
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        X, y, test_size=1 - TRAIN_SIZE)
    X_valid, y_valid, X_test, y_test = iterative_train_test_split(
        X_test, y_test, test_size=TEST_SIZE / (TEST_SIZE + VALID_SIZE))
    return X_train, y_train, X_valid, y_valid, X_test, y_test
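# A quick sanity check of the two-stage split arithmetic above, using
# hypothetical values for the module-level constants (they are defined
# elsewhere in the original project, not in this snippet):
TRAIN_SIZE, VALID_SIZE, TEST_SIZE = 0.70, 0.15, 0.15
# Stage 1 holds out 1 - TRAIN_SIZE = 0.30 of the samples; stage 2 then sends
# TEST_SIZE / (TEST_SIZE + VALID_SIZE) = 0.5 of that holdout to the test set,
# leaving 15% validation and 15% test overall.
assert abs((1 - TRAIN_SIZE) * TEST_SIZE / (TEST_SIZE + VALID_SIZE) - TEST_SIZE) < 1e-9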
def stratify_split_train_val_test(sids, labels):
    """skmultilearn expects X as an (n_samples, n_dim) matrix."""
    np.random.seed(286501567)
    inputs = np.expand_dims(sids, axis=-1)
    from skmultilearn.model_selection import iterative_train_test_split
    X, y, X_test, y_test = iterative_train_test_split(inputs, labels, test_size=0.5)
    X_train, y_train, X_val, y_val = iterative_train_test_split(X, y, test_size=0.2)
    return X_train.squeeze(), X_val.squeeze(), X_test.squeeze()
def stratifyval():
    labels = [
        'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
        'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
        'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
        'Fracture', 'Support Devices'
    ]
    totallabels = ['Path'] + labels
    df1 = pd.read_csv('train_tmp.csv')
    df2 = pd.read_csv('test_tmp.csv')
    df3 = pd.read_csv('val_tmp.csv')
    df = pd.concat([df1, df2, df3])
    totalX = df["Path"].values
    totalY = df[labels].values
    totalX = np.expand_dims(totalX, axis=1)
    print("PRE ITERATIVE")
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        totalX, totalY, test_size=0.2)
    X_train, y_train, X_val, y_val = iterative_train_test_split(
        X_train, y_train, test_size=0.2)
    print("WRITING Test")
    dfTotal2 = pd.DataFrame(columns=totallabels)
    dfTotal2['Path'] = X_test.flatten()
    dfTotal2[labels] = y_test
    dfTotal2.to_csv("test_tmp2.csv")
    print("WRITING Val")
    dfTotal3 = pd.DataFrame(columns=totallabels)
    dfTotal3['Path'] = X_val.flatten()
    dfTotal3[labels] = y_val
    dfTotal3.to_csv("val_tmp2.csv")
    print("WRITING Train")
    dfTotal4 = pd.DataFrame(columns=totallabels)
    dfTotal4['Path'] = X_train.flatten()
    dfTotal4[labels] = y_train
    dfTotal4.to_csv("train_tmp2.csv")
def setup_wiki(self):
    mat = sio.loadmat(self.root / 'wiki' / 'POS.mat')
    self.metric = 'MicroF1'
    self.num_nodes = 4777
    self.num_classes = 40
    adj_t = mat['network'].tocoo()
    self.adj_t = SparseTensor(row=torch.LongTensor(adj_t.row),
                              col=torch.LongTensor(adj_t.col),
                              sparse_sizes=(self.num_nodes, self.num_nodes))
    if self.make_edge_index:
        row = self.adj_t.storage.row()
        col = self.adj_t.storage.col()
        self.edge_index = torch.stack((row, col), dim=0)
    self.y = torch.from_numpy(mat['group'].todense()).float()
    idx = torch.arange(self.y.shape[0]).view(-1, 1)
    train_idx, _, test_idx, _ = iterative_train_test_split(idx, self.y,
                                                           test_size=0.1)
    self.split_idx = {
        'train': train_idx.view(-1),
        'valid': test_idx.view(-1),  # validation reuses the test split
        'test': test_idx.view(-1)
    }
    self.criterion = torch.nn.BCEWithLogitsLoss()  # for multi-label classification
def main():
    start = time.time()
    input_filename = ("../data/frwiki_discussions_categories_processed.csv/"
                      "part-00000-381f0f76-28b9-4da9-8cb0-96958b5ea46e-c000.csv")
    df = load_data(input_filename)
    print("Load took:", (time.time() - start))
    df = preprocess_data(df)
    encoder = MultiLabelBinarizer()
    labels_df = pd.DataFrame(encoder.fit_transform(df.categories.values))
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        df.text.values.reshape(-1, 1), labels_df.values, test_size=0.5)
    train_dataset = make_dataset(X_train, y_train)
    test_dataset = make_dataset(X_test, y_test)
    model = compile_model(verbose=True)
    model, history = train(model, train_dataset, test_dataset, lr=1e-3)
    output_dir = "../tf_model/frwikipedia_10_categories_classifier"
    model.save(output_dir)
    end = time.time()
    print("Complete training took:", (end - start))
def run_test3(normas, n_jobs=1):
    # Corpus and labels:
    corpus = [norma['TextoPreProcessado'] for norma in normas]
    labels = [norma['AssuntoGeral'] for norma in normas]
    # Build X and y:
    X = corpus
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(labels)
    # Shuffle:
    X, y = shuffle(X, y, random_state=42)
    # Vectorize:
    X = TfidfVectorizer(min_df=20, max_df=0.5).fit_transform(X).toarray()
    # Train/test split:
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        X, y, test_size=0.5)
    # Classifier:
    clf = MLPClassifier(hidden_layer_sizes=(150,), activation='relu')
    clf.fit(X_train, y_train)
    # Predict:
    y_pred = clf.predict(X_test)
def split_data(inkml_data):
    print("split data")
    uis_to_symbols = create_ui_symbol_map(inkml_data)
    symbol_index_map, size = create_symbol_index_map(uis_to_symbols)
    print("size:", size)
    array = np.zeros(size)
    uis = []
    for ui in uis_to_symbols:
        print(ui)
        indices = np.zeros(size)
        uis.append(ui)
        for symbol in uis_to_symbols[ui]:
            indices[symbol_index_map[symbol]] = uis_to_symbols[ui][symbol]
        array = np.vstack((array, indices))
    # Drop the all-zeros seed row used to start the vstack accumulation.
    array = np.delete(array, 0, axis=0)
    le = pp.LabelEncoder()
    encoded_uis = le.fit_transform(uis)
    x1 = np.zeros(len(encoded_uis))
    print(len(encoded_uis))
    # iterative_train_test_split needs a 2-D X, so pad the ids with a dummy column.
    x = np.column_stack((np.array(encoded_uis), x1))
    print(array.shape)
    x_train, y_train, x_test, y_test = iterative_train_test_split(
        x, array, test_size=1 / 3)
    x_train = x_train[:, 0].astype(int)
    x_test = x_test[:, 0].astype(int)
    training_ids = le.inverse_transform(x_train)
    testing_ids = le.inverse_transform(x_test)
    print(type(training_ids), type(testing_ids))
    return set(training_ids.tolist()), set(testing_ids.tolist())
def ECC_test_2_fold(data, label, random_state=3071980, ensemble=5):
    # dataset information
    n_label = label.shape[1]
    # split into training and test sets
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        np.matrix(data), np.matrix(label), test_size=0.5)
    X_train = pd.DataFrame(X_train, columns=data.columns)
    X_test = pd.DataFrame(X_test, columns=data.columns)
    y_train = pd.DataFrame(y_train, columns=label.columns)
    y_test = pd.DataFrame(y_test, columns=label.columns)
    performance_df_all = pd.DataFrame()
    for j in range(2):
        # swap folds so each half serves once as train and once as test
        X_test, X_train = X_train, X_test
        y_test, y_train = y_train, y_test
        # ensemble accumulators
        y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),
                                       columns=y_test.columns,
                                       index=y_test.index)
        y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),
                                       columns=y_test.columns,
                                       index=y_test.index)
        for i in range(ensemble):
            # training
            classifier_list, training_time, order = naiveBayes_multi_label_training(
                X_train, y_train)
            # testing
            y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(
                X_test, n_label, classifier_list, order)
            # restore the original label column order
            y_predict.columns = label.columns[order]
            y_prob.columns = label.columns[order]
            y_predict = y_predict[label.columns]
            y_prob = y_prob[label.columns]
            y_pred_ensemble = y_pred_ensemble + y_predict
            y_prob_ensemble = y_prob_ensemble + y_prob
        # majority vote over the ensemble
        y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5) * 1).astype('int')
        y_prob_ensemble = y_prob_ensemble / ensemble
        # evaluation
        performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
        performance_df = pd.DataFrame.from_dict(performance, orient='index')
        performance_df_all = pd.concat([performance_df_all, performance_df],
                                       axis=1)
    return performance_df_all
def BR_test(data, label, dataPath, random_state=3071980):
    # dataset information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    # split into training and test sets
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        np.matrix(data), np.matrix(label), test_size=0.5)
    X_train = pd.DataFrame(X_train, columns=data.columns)
    X_test = pd.DataFrame(X_test, columns=data.columns)
    y_train = pd.DataFrame(y_train, columns=label.columns)
    y_test = pd.DataFrame(y_test, columns=label.columns)
    # training
    classifier_list, training_time = naiveBayes_multi_label_training_BR(
        X_train, y_train)
    # testing
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing_BR(
        X_test, n_label, classifier_list)
    y_predict.columns = label.columns
    return y_predict, y_test
def main():
    if len(sys.argv) == 3:
        database_filepath, model_filepath = sys.argv[1:]
        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        X, Y, category_names = load_data(database_filepath)
        # This part of the script is modified from the original in order to use
        # the iterative_train_test_split function from skmultilearn
        X = X.values
        X = X[:, np.newaxis]
        X_train, Y_train, X_test, Y_test = iterative_train_test_split(
            X, np.array(Y), test_size=0.3)
        X_train = np.squeeze(X_train)
        X_test = np.squeeze(X_test)
        print('Building model...')
        model = build_model()
        print('Training model...')
        model.fit(X_train, Y_train)
        print('Evaluating model...')
        evaluate_model(model, X_test, Y_test, category_names)
        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)
        print('Trained model saved!')
    else:
        print('Please provide the filepath of the disaster messages database '
              'as the first argument and the filepath of the pickle file to '
              'save the model to as the second argument. \n\nExample: python '
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')
def train_and_predict(X, y, train_ratio=0.2, n_trials=10, random_state=None):
    micro, macro, c, std, f1, f1_std = [], [], [], [], [], []
    # Seed once, outside the loop: re-seeding with the same value inside the
    # loop would make every trial produce an identical split.
    np.random.seed(random_state)
    for i in range(n_trials):
        X_train, y_train, X_test, y_test = iterative_train_test_split(
            X, y, test_size=1 - train_ratio)
        clf = MultiOutputClassifier(
            LogisticRegressionCV(max_iter=10000, class_weight='balanced'))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            clf.fit(X_train, y_train.A)
        y_pred = np.array(clf.predict_proba(X_test))[:, :, 1].T
        mi = roc_auc_score(y_test.A, y_pred, average="micro")
        ma = roc_auc_score(y_test.A, y_pred, average="macro")
        y_pred = clf.predict(X_test)
        f = f1_score(y_test.A, y_pred, average="micro")
        std.append(mi)
        f1.append(f)
        f1_std.append(f)
        micro.append(mi)
        macro.append(ma)
        c.append(np.mean([estimator.C_.mean() for estimator in clf.estimators_]))
    return (np.mean(micro), np.mean(macro), np.mean(c),
            np.std(std), np.mean(f1), np.std(f1_std))
def load_and_split(data, dev_size):
    """Loads a data file and uses stratified sampling to create splits."""
    samples, ids, tags = dict(), [], []
    with open(data, "r") as f:
        for line in f:
            sample = json.loads(line.strip())
            samples[sample["id"]] = sample
            ids.append(sample["id"])
            tags.append(sample["tags"])
    tag2id = utils.get_tag_mappings(tags)
    id2tag = {v: k for k, v in tag2id.items()}
    tags = np.array([utils.tags_to_onehot(tag, tag2id) for tag in tags])
    ids = np.array(ids).reshape(-1, 1)
    train_ids, _, dev_ids, _ = iterative_train_test_split(ids, tags, dev_size)
    with open("train.tmp", "w") as f:
        for id in train_ids:
            f.write(f"{json.dumps(samples[id.item()], ensure_ascii=False)}\n")
    with open("dev.tmp", "w") as f:
        for id in dev_ids:
            f.write(f"{json.dumps(samples[id.item()], ensure_ascii=False)}\n")
    dataset = load_dataset('json', data_files={
        "train": "train.tmp",
        "dev": "dev.tmp"
    })
    return dataset, tag2id, id2tag
def two_fold(methods, data, label, dataset, ensemble=1, ordering="random",
             structure="bayes_net", lead=False):
    # setup
    savePath = "../code/temp/" + methods.__name__ + "/" + dataset + "/"
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    print("running", methods.__name__)
    print("setting:", ensemble, ordering, structure, lead)
    performance_df_all = pd.DataFrame()
    time = 20  # number of repetitions of the 2-fold procedure
    for j in range(time):
        print("time:", j)
        X_train, y_train, X_test, y_test = iterative_train_test_split(
            np.matrix(data), np.matrix(label), test_size=0.5)
        X_train = pd.DataFrame(X_train, columns=data.columns)
        X_test = pd.DataFrame(X_test, columns=data.columns)
        y_train = pd.DataFrame(y_train, columns=label.columns)
        y_test = pd.DataFrame(y_test, columns=label.columns)
        """ Plain random alternative:
        X_train, X_test, y_train, y_test = train_test_split(data, label,
                                                            test_size=0.5)
        X_train.reset_index(inplace=True, drop=True)
        X_test.reset_index(inplace=True, drop=True)
        y_train.reset_index(inplace=True, drop=True)
        y_test.reset_index(inplace=True, drop=True)"""
        for i in range(2):
            # swap folds so each half serves once as train and once as test
            X_test, X_train = X_train, X_test
            y_test, y_train = y_train, y_test
            # test
            if methods.__name__ == "BayesianClassifierChain_NB":
                pred_ensemble, prob_ensemble = BayesianClassifierChain_NB(
                    X_train, X_test, y_train, y_test, savePath, ensemble,
                    ordering, structure, lead)
            elif methods.__name__ == "ClassifierChain_NB":
                pred_ensemble, prob_ensemble = ClassifierChain_NB(
                    X_train, X_test, y_train, y_test, savePath, ensemble)
            else:
                raise ValueError("no such function")
            performance = evaluation(pred_ensemble, prob_ensemble, y_test)
            performance_df = pd.DataFrame.from_dict(performance, orient='index')
            performance_df_all = pd.concat(
                [performance_df_all, performance_df], axis=1)
    performance_df_all.columns = list(range(time * 2))
    return performance_df_all
def multilearn_iterative_train_test_split(features, labels, cols, test_size=0.3):
    from skmultilearn.model_selection import iterative_train_test_split
    train_features, train_labels, test_features, test_labels = iterative_train_test_split(
        np.array(features), labels, test_size=test_size)
    train_features = pd.DataFrame(train_features, columns=cols)
    test_features = pd.DataFrame(test_features, columns=cols)
    return train_features, test_features, train_labels, test_labels
def split_train_test(df, train_size=70, test_size=30, folder='../data/action_db'):
    print('Splitting into train-test: ' + str(train_size) + '-' + str(test_size))

    def _convert_index_to_subj_emotion(y):
        print(y)
        y = [subjects[y[0] - 1], EMOTIONS[y[1]]]
        return y

    if 'paco' not in folder:
        EMOTIONS = ['ang', 'fea', 'hap', 'neu', 'sad', 'unt']
        subjects = ['1m', '2f', '3m', '4f', '5m', '6f', '7f', '8m', '9f',
                    '10f', '11f', '12m', '13f', '14f', '15m', '16f', '17f',
                    '18f', '19m', '20f', '21m', '22f', '23f', '24f', '25m',
                    '26f', '27m', '28f', '29f']
        # zero-pad subject codes to four characters
        subjects = ['0' * (4 - len(subject)) + subject for subject in subjects]
    else:
        # emotions = ['ang', 'fea', 'hap', 'sad', 'neu']
        EMOTIONS = ['ang', 'hap', 'sad', 'neu']
        subjects = ['ale', 'ali', 'alx', 'amc', 'bar', 'boo', 'chr', 'dav',
                    'din', 'dun', 'ele', 'emm', 'gra', 'ian', 'jan', 'jen',
                    'jua', 'kat', 'lin', 'mac', 'mar', 'mil', 'ndy', 'pet',
                    'rac', 'ros', 'she', 'shi', 'ste', 'vas']
    df = df.reindex(np.random.permutation(df.index))
    emotions_dict = {EMOTIONS[i]: int(i) for i in range(len(EMOTIONS))}
    subjects_dict = {subjects[i]: int(i + 1) for i in range(len(subjects))}
    y_classes = ['subject', 'emotion']
    y = df[y_classes]
    y['subject'] = y['subject'].map(subjects_dict)
    y['emotion'] = y['emotion'].map(emotions_dict)
    y = np.array(y)
    X = df.drop(y_classes, axis=1)
    columns = X.columns.values
    X = np.array(X)
    columns = np.append(columns, y_classes)
    assert len(X) == len(y)
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        X, y, test_size=test_size / (test_size + train_size))
    y_train = np.apply_along_axis(_convert_index_to_subj_emotion, 1, y_train)
    y_test = np.apply_along_axis(_convert_index_to_subj_emotion, 1, y_test)
    train_df = pd.DataFrame(data=np.hstack((X_train, y_train)), columns=columns)
    test_df = pd.DataFrame(data=np.hstack((X_test, y_test)), columns=columns)
    print('Counts in train set')
    for emotion in EMOTIONS:
        print(emotion + ": " + str(len(train_df[train_df['emotion'] == emotion])))
    print('Counts in test set')
    for emotion in EMOTIONS:
        print(emotion + ": " + str(len(test_df[test_df['emotion'] == emotion])))
    print('Saving training data...')
    train_df.to_csv(folder + '/training/train_data.csv')
    # train_df.to_hdf(folder + '/training/train_data.h5', key='df', mode='w')
    print('Saving test data...')
    test_df.to_hdf(folder + '/test/test_data.h5', key='df', mode='w')
    print('Done.')
    return train_df, test_df
def train_test_split(df, df_true, test_size=0.25):
    # iterative_train_test_split is only deterministic if we call this first
    np.random.seed(RANDOM_SEED)
    # iterative_train_test_split expects a matrix, whereas CountVectorizer
    # needs an iterable over strings
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        df.text.to_frame().values, df_true.values, test_size=test_size)
    X_train, X_test = X_train[:, 0], X_test[:, 0]
    return X_train, y_train, X_test, y_test
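# A minimal check of the determinism note above: the iterative splitter draws
# from NumPy's global RNG, so seeding immediately before each call should
# reproduce the same split. The toy data here is illustrative only.
import numpy as np
from skmultilearn.model_selection import iterative_train_test_split

X_demo = np.arange(12).reshape(-1, 1)
y_demo = np.array([[1, 0], [0, 1], [1, 1]] * 4)

np.random.seed(42)
first = iterative_train_test_split(X_demo, y_demo, test_size=0.25)
np.random.seed(42)
second = iterative_train_test_split(X_demo, y_demo, test_size=0.25)
print(all(np.array_equal(a, b) for a, b in zip(first, second)))  # expect True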
def split_data(self):
    print('Splitting data into training and validation set to train DNNs...')
    X_train, y_train, X_val, y_val = iterative_train_test_split(
        self._X_train, self._y_train, test_size=0.35)
    self._X_train = X_train
    self._y_train = y_train
    self._X_val = X_val
    self._y_val = y_val
    print('Train data:', X_train.shape)
    print('Train labels:', y_train.shape)
    print('Val data:', X_val.shape)
    print('Val labels:', y_val.shape)
def stratify():
    labels = [
        'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
        'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
        'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
        'Fracture', 'Support Devices'
    ]
    totallabels = ['Path'] + labels
    df = pd.read_csv('mimic_chex.csv')
    totalX = df["Path"].values
    totalY = df[labels].values
    totalX = np.expand_dims(totalX, axis=1)
    print("PRE ITERATIVE")
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        totalX, totalY, test_size=0.2)
    print(X_train.shape)
    print("COMBINATION")
    df = pd.DataFrame({
        'train': Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_train, order=2)
            for combination in row),
        'test': Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_test, order=2)
            for combination in row)
    }).T.fillna(0.0)
    print(df.to_string())
    print("WRITING Train")
    dfTotal2 = pd.DataFrame(columns=totallabels)
    dfTotal2['Path'] = X_train.flatten()
    dfTotal2[labels] = y_train
    dfTotal2.to_csv("train_draft.csv")
    print("WRITING Test")
    dfTotal2 = pd.DataFrame(columns=totallabels)
    dfTotal2['Path'] = X_test.flatten()
    dfTotal2[labels] = y_test
    dfTotal2.to_csv("test.csv")
def stratify():
    labels = [
        'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
        'Airspace Opacity', 'Lung Lesion', 'Edema', 'Consolidation',
        'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion',
        'Pleural Other', 'Fracture', 'Support Devices'
    ]
    df = pd.read_csv(PATH_RADIO6, usecols=['TEXT'] + labels, engine='python')
    totalX = df["TEXT"].values
    totalY = df[labels].values
    totalX = np.expand_dims(totalX, axis=1)
    print("PRE ITERATIVE")
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        totalX, totalY, test_size=0.2)
    print("COMBINATION")
    df = pd.DataFrame({
        'train': Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_train, order=2)
            for combination in row),
        'test': Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_test, order=2)
            for combination in row)
    }).T.fillna(0.0)
    print(df.to_string())
    print("WRITING Train")
    dfTotal2 = pd.DataFrame(columns=["TEXT"] + labels)
    dfTotal2['TEXT'] = X_train.flatten()
    dfTotal2[labels] = y_train
    dfTotal2.to_csv("train.csv")
    print("WRITING Test")
    dfTotal2 = pd.DataFrame(columns=["TEXT"] + labels)
    dfTotal2['TEXT'] = X_test.flatten()
    dfTotal2[labels] = y_test
    dfTotal2.to_csv("test.csv")
def fragment_train_test_split(data: list, labels: list, test_size=0.2, shuffle=True):
    """Treat the data as multilabel and do an iterative train/test split.

    Arguments:
        data: list of objects from the `Fragment` dataclass
        labels: list of used labels
        test_size: portion of the data to be used as the test set
        shuffle: if True, shuffle the dataset before splitting.
    """
    def label_to_int(l, labels):
        return labels.index(l)

    if shuffle:
        random.shuffle(data)
    # For train_test_split we use an X containing only the index of the data items
    X = [[index] for index in range(len(data))]
    y = []
    num_labels = len(labels)
    # One-hot encoding of multiple categories
    for d in data:
        label_list = list({label_to_int(f.label, labels) for f in d['fragments']})
        y.append([1 if l in label_list else 0 for l in range(num_labels)])
    nX = np.array(X)
    ny = np.array(y)
    # We use iterative_train_test_split to split with an even division of labels.
    #
    # NOTE!
    # skmultilearn's iterative_train_test_split returns
    #     X_train, y_train, X_test, y_test
    # unlike sklearn's train_test_split, which returns
    #     X_train, X_test, y_train, y_test
    X_train, _, X_test, _ = iterative_train_test_split(nX, ny, test_size=test_size)
    X_train = np.squeeze(X_train)
    X_test = np.squeeze(X_test)
    print("Train:", X_train.shape, "Test:", X_test.shape)
    train = [data[key] for key in list(X_train)]
    test = [data[key] for key in list(X_test)]
    return train, test
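# A minimal, self-contained demonstration of the return-order difference noted
# above; the toy data is illustrative only.
import numpy as np
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split

X = np.arange(8).reshape(-1, 1)
y = np.array([[1, 0], [0, 1]] * 4)

# sklearn: both feature splits first, then both label splits
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25)
# skmultilearn: the train pair first, then the test pair
X_tr, y_tr, X_te, y_te = iterative_train_test_split(X, y, test_size=0.25)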
def splitting_data(self):
    """Shuffle the data, then produce a stratified train/test split over the
    seven feedback categories."""
    data_random = self.data.sample(frac=1)
    X = data_random[['guid', 'txt']].to_numpy()
    Y = data_random[[
        'Safety', 'CleanlinessView', 'Information', 'Service', 'Comfort',
        'PersonnelCard', 'Additional'
    ]].to_numpy()
    self.X_train, self.y_train, self.X_test, self.y_test = iterative_train_test_split(
        X, Y, test_size=self.test_size)
    return self.X_train, self.y_train, self.X_test, self.y_test
def Iterative_Stratifier_Split(df, ratio=0.15):
    img_ids = df.image_id.unique()
    print("Creating one-hot labels ...")
    labels = np.zeros((len(img_ids), len(df.columns) - 1), dtype=np.uint8)
    for i, img_id in enumerate(tqdm(img_ids)):
        # take the label columns of the first row for this image id
        labels[i] = df.loc[df.image_id == img_id, :].to_numpy()[0, 1:]
    print("Done!")
    print("Splitting dataset ...")
    train_image_id, train_class_id, test_image_id, test_class_id = iterative_train_test_split(
        img_ids.reshape(-1, 1), labels, test_size=ratio)
    train_df = df[df.image_id.map(lambda x: x in train_image_id.reshape(-1))]
    test_df = df[df.image_id.map(lambda x: x in test_image_id.reshape(-1))]
    print("Done!\n")
    return train_df, test_df
def test_stratified_split():
    y = np.array([[0, 1], [0, 3], [1, 3], [4, 5], [4, 3], [4, 4], [4, 4]])
    X = np.array([[i, i + 1] for i in range(len(y))])
    assert len(X) == len(y)
    classes, y_indices = np.unique(y, return_inverse=True)
    n_classes = classes.shape[0]
    class_counts = np.bincount(y_indices)
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        X, y, test_size=0.5)
    print('X_train')
    print(X_train)
    print('y_train')
    print(y_train)
    print('X_test')
    print(X_test)
    print('y_test')
    print(y_test)
def multilayer_sample(edges: pd.DataFrame,
                      layer_ids: List[int],
                      hidden_ratio: float = 0.5,
                      random_state: Optional[int] = None) -> MultiLayerSplit:
    """
    Split a multilayer network into hidden and observed parts for the
    specified layers. First split the nodes at random, then split the
    edges accordingly.

    Usage example:
    >>> from fao_data import load_all_layers
    >>> edges = load_all_layers()
    >>> sample = multilayer_sample(edges, [42, 123], random_state=0)
    >>> sample.print_summary()
    Summary of random split. Layer ids: [42, 123]
               total  observed  hidden  obs.ratio
    nodes        136        67      69   0.492647
    edges       1034       248     786   0.239845
    layer 42     773       179     594   0.231565
    layer 123    261        69     192   0.264368
    """
    edges = filter_by_layer(edges, layer_ids)
    nodes = sorted(node_set(edges))
    node_layers = _node_layer_incidence(edges, nodes, layer_ids)
    np.random.seed(random_state)
    nodes_observed, _, nodes_hidden, _ = \
        iterative_train_test_split(np.array(nodes).reshape(-1, 1),
                                   node_layers,
                                   test_size=hidden_ratio)
    nodes_observed = nodes_observed.flatten().tolist()
    nodes_hidden = nodes_hidden.flatten().tolist()
    edges_observed, edges_hidden = partition_into_observed_and_hidden(
        edges, nodes_hidden)
    split = MultiLayerSplit(
        layer_ids=layer_ids,
        node_index=index_elements(nodes),
        observed=GraphData(edges_observed, nodes_observed),
        hidden=GraphData(edges_hidden, nodes_hidden),
        full=GraphData(edges, nodes)
    )
    return split
def main():
    data_df = pd.read_csv('./data/train.csv')
    defects_df = []
    for i in range(0, len(data_df), 4):
        defi = {}
        defi['ImageId_ClassId'] = data_df.loc[i, 'ImageId_ClassId'][:-2]
        defi['1'] = int(not pd.isnull(data_df.loc[i, 'EncodedPixels']))
        defi['2'] = int(not pd.isnull(data_df.loc[i + 1, 'EncodedPixels']))
        defi['3'] = int(not pd.isnull(data_df.loc[i + 2, 'EncodedPixels']))
        defi['4'] = int(not pd.isnull(data_df.loc[i + 3, 'EncodedPixels']))
        defects_df.append(defi)
    defects_df = pd.DataFrame(defects_df)[[
        'ImageId_ClassId', '1', '2', '3', '4'
    ]]
    # defects_df.to_csv('./data/defect_types.csv', index=False)
    Xd = np.expand_dims(np.array(range(len(defects_df))), 1)
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        Xd, defects_df[['1', '2', '3', '4']].to_numpy(), test_size=0.2)
    print("Train set size = {}, Test set size = {}".format(
        len(X_train), len(X_test)))
    print(pd.DataFrame({
        'train': Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_train, order=2)
            for combination in row),
        'test': Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_test, order=2)
            for combination in row)
    }).T.fillna(0.0))
    defects_df['is_valid'] = False
    defects_df.loc[X_test[:, 0], 'is_valid'] = True
    defects_df.to_csv('./data/split.csv', index=False)
def train_test_split(self, meth="random", prob=0.8, nf=5):
    if meth == "stratified":
        # Use scikit-multilearn
        train, _, test, _ = iterative_train_test_split(
            np.arange(0, self.n).reshape(self.n, 1),
            self.targets.values,
            test_size=1 - prob)
        self.idx_train = train[:, 0]
        self.idx_test = test[:, 0]
    elif meth == "random":
        s = np.random.choice(a=2, size=self.n, p=[prob, 1 - prob])
        self.idx_train = np.where(s == 0)[0]
        self.idx_test = np.where(s == 1)[0]
    elif meth == "kfold":
        indices = np.arange(self.n)
        np.random.shuffle(indices)
        foldsize = int(np.round(self.n / nf))
        for i in range(nf):
            self.kfolds.append({
                'fold': i + 1,
                'idx_test': indices[np.arange(start=i * foldsize,
                                              stop=(i + 1) * foldsize).tolist()],
                'idx_train': indices[np.arange(start=0,
                                               stop=i * foldsize).tolist() +
                                     np.arange(start=(i + 1) * foldsize,
                                               stop=self.n).tolist()]
            })
    elif meth == "bootstrap":
        indices = np.arange(self.n)  # stub: only the index array is prepared
    else:
        # do not split
        self.idx_train = np.arange(0, self.n)
        self.idx_test = np.arange(0, self.n)
def split_train_cv(
        data_frame: pd.DataFrame,
        frac: float = 0.9,
        y=None,  # Only for stratified; computes the necessary split
        **kwargs):
    """split_train_cv

    :param data_frame:
    :type data_frame: pd.DataFrame
    :param frac:
    :type frac: float
    """
    if kwargs.get('mode', None) == 'urbansed':
        # Filenames are DATA_-1, DATA_-2, etc.
        data_frame.loc[:, 'id'] = data_frame.groupby(
            data_frame['filename'].str.split('_').apply(
                lambda x: '_'.join(x[:-1]))).ngroup()
        sampler = np.random.permutation(data_frame['id'].nunique())
        num_train = int(frac * len(sampler))
        train_indexes = sampler[:num_train]
        cv_indexes = sampler[num_train:]
        train_data = data_frame[data_frame['id'].isin(train_indexes)]
        cv_data = data_frame[data_frame['id'].isin(cv_indexes)]
        del train_data['id']
        del cv_data['id']
    elif kwargs.get('mode', None) == 'stratified':
        # Use stratified sampling
        from skmultilearn.model_selection import iterative_train_test_split
        index_train, _, index_cv, _ = iterative_train_test_split(
            data_frame.index.values.reshape(-1, 1), y, test_size=1. - frac)
        train_data = data_frame[data_frame.index.isin(index_train.squeeze())]
        cv_data = data_frame[data_frame.index.isin(index_cv.squeeze())]
    else:
        # Simply split train/test
        train_data = data_frame.sample(frac=frac, random_state=10)
        cv_data = data_frame[~data_frame.index.isin(train_data.index)]
    return train_data, cv_data
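# Hypothetical usage of the three modes above (the DataFrame `df` and the
# one-hot label matrix `y_onehot` are assumptions, not part of this function):
#   train_df, cv_df = split_train_cv(df, frac=0.9)                   # plain random
#   train_df, cv_df = split_train_cv(df, frac=0.9, mode='urbansed')  # group by filename stem
#   train_df, cv_df = split_train_cv(df, frac=0.9, y=y_onehot, mode='stratified')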
def run_exam(self):
    log_dir = os.path.join(self.config.tb_path,
                           datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)
    adapter_size = None  # use None to fine-tune all of BERT
    model = create_model(self.config, adapter_size=adapter_size)
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        self.train_x, self.train_y, test_size=self.config.test_ratio)
    if self.config.load_model_name:
        model.load_weights(os.path.join(self.config.epoch_model_path,
                                        self.config.load_model_name))
    model.fit(x=X_train,
              y=y_train,
              validation_data=(X_test, y_test),
              batch_size=self.config.batch_size,
              shuffle=True,
              epochs=self.config.num_epochs,
              initial_epoch=self.config.initial_epoch,
              callbacks=[
                  create_learning_rate_scheduler(
                      max_learn_rate=1e-5,
                      end_learn_rate=1e-7,
                      warmup_epoch_count=self.config.warmup_epoch_count,
                      total_epoch_count=self.config.num_epochs),
                  tensorboard_callback,
                  MyCustomCallback(self.config)
              ])
    model.save_weights(os.path.join(self.config.epoch_model_path,
                                    'sentiments.h5'),
                       overwrite=True)
    self.bot.send_msg('{} train is done'.format(self.config.train_name))
def stratify_val():
    text_cols = ['Path_compr', 'Indication', 'Impression', 'Findings']
    label_cols = [
        'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
        'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
        'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
        'Fracture', 'Support Devices'
    ]
    df = pd.read_csv(
        "physionet.org/files/mimic-cxr-jpg/2.0.0/train_multi2_v3.csv",
        usecols=text_cols + label_cols)
    totalX = df[text_cols].values
    totalY = df[label_cols].values
    print(totalX.shape)
    print(totalY.shape)
    totalX = np.expand_dims(totalX, axis=1)
    print("PRE ITERATIVE")
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        totalX, totalY, test_size=0.2)
    print("COMBINATION")
    df = pd.DataFrame({
        'train': Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_train, order=2)
            for combination in row),
        'test': Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_test, order=2)
            for combination in row)
    }).T.fillna(0.0)
    print(df.to_string())
    X_train = np.squeeze(X_train, axis=1)
    X_test = np.squeeze(X_test, axis=1)
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)
    print("WRITING Train")
    dfTotal2 = pd.DataFrame(columns=text_cols + label_cols)
    print(dfTotal2.shape)
    dfTotal2[text_cols] = pd.DataFrame(X_train)
    dfTotal2[label_cols] = y_train
    with open("physionet.org/files/mimic-cxr-jpg/2.0.0/train_multi_v3.csv",
              mode='w', newline='\n') as f:
        dfTotal2.to_csv(f, sep=",", float_format='%.2f', index=False,
                        line_terminator='\n', encoding='utf-8')
    print("WRITING Test")
    dfTotal2 = pd.DataFrame(columns=text_cols + label_cols)
    dfTotal2[text_cols] = pd.DataFrame(X_test)
    dfTotal2[label_cols] = y_test
    with open("physionet.org/files/mimic-cxr-jpg/2.0.0/val_multi_v3.csv",
              mode='w', newline='\n') as f:
        dfTotal2.to_csv(f, sep=",", float_format='%.2f', index=False,
                        line_terminator='\n', encoding='utf-8')
def split(self):
    """Split a dataset into training and testing (or validation)."""
    import argparse, sys
    parser = self.new_parser('split')
    # prefixing the argument with -- means it's optional
    parser.add_argument('input', type=str, help='Path to input image HDF5 file')
    parser.add_argument('train_output', type=str,
                        help='Path to output HDF5 file (training)')
    parser.add_argument('test_output', type=str,
                        help='Path to output HDF5 file (testing)')
    parser.add_argument('--h5keys', default='images,labels',
                        help='Name of datasets in input and output HDF5 files '
                             '(comma-separated)')
    parser.add_argument('--copy_other_keys', action='store_true',
                        help='Copy all other keys from input file into output '
                             'verbatim')
    parser.add_argument('--random_seed', default=0, type=int,
                        help='Random seed to use for determining split')
    parser.add_argument('--test_size', default=0.25,
                        help='Size of test set. If <= 1, proportion of the '
                             'dataset to use; otherwise, number of samples.')
    parser.add_argument('--stratify_key', default=None,
                        help='Key to use for stratification labels')
    args = parser.parse_args(sys.argv[2:])
    keys = args.h5keys.split(',')
    test_size = float(args.test_size)
    if test_size > 1:
        # if not a proportion, it should be an integer sample count
        test_size = int(args.test_size)
    dataset = H5Dataset(args.input, key=keys)
    stratify = None
    if args.stratify_key is not None:
        # load all the labels
        with h5py.File(args.input, 'r') as f:
            stratify = np.array(f[args.stratify_key])
        if len(stratify.shape) == 2:
            if stratify.shape[1] == 1:
                stratify = stratify.squeeze(1)
        elif len(stratify.shape) > 2:
            raise Exception(
                f"Dimension of dataset {args.stratify_key} cannot be more than two")
    if stratify is None or len(stratify.shape) == 1:
        from sklearn.model_selection import train_test_split
        ix_train, ix_test = train_test_split(range(len(dataset)),
                                             test_size=test_size,
                                             random_state=args.random_seed,
                                             stratify=stratify)
    else:
        from skmultilearn.model_selection import iterative_train_test_split
        # set random seeds manually
        import random
        random.seed(args.random_seed)
        np.random.seed(args.random_seed)
        ix_train, y_train, ix_test, y_test = iterative_train_test_split(
            np.arange(len(dataset), dtype=np.uint32).reshape(-1, 1),
            stratify, test_size=test_size)
    dstrain = SubsetDataset(dataset, ix_train)
    dstest = SubsetDataset(dataset, ix_test)
    write_dataset_h5(dstrain, args.train_output, key=keys)
    with h5py.File(args.train_output, 'a') as f:
        self._stamp_dataset(f[keys[0]], args)
    if args.copy_other_keys:
        self.copy_other_keys(args.input, args.train_output, keys)
    write_dataset_h5(dstest, args.test_output, key=keys)
    with h5py.File(args.test_output, 'a') as f:
        self._stamp_dataset(f[keys[0]], args)
    if args.copy_other_keys:
        self.copy_other_keys(args.input, args.test_output, keys)