def on_dataset_change(args):
    with dispatcher.get_lock('bootenvs'):
        if args['operation'] == 'create':
            bootenvs.propagate(args, convert_bootenv)

        if args['operation'] == 'delete':
            logger.warn(args)
            for i in args['ids']:
                pool, dataset = split_dataset(i)
                if pool != boot_pool_name:
                    continue

                realname = dataset.split('/')[-1]
                ds = bootenvs.query(('realname', '=', realname), single=True)
                if ds:
                    bootenvs.remove(ds['id'])

        if args['operation'] == 'update':
            for i in args['entities']:
                pool, dataset = split_dataset(i['id'])
                if pool != boot_pool_name:
                    continue

                realname = dataset.split('/')[-1]
                ds = bootenvs.query(('realname', '=', realname), single=True)
                if not ds:
                    continue

                nickname = i.get('properties.beadm:nickname.value', realname)
                if nickname and nickname != ds['id']:
                    bootenvs.rename(ds['id'], nickname)

                bootenvs.put(nickname, convert_bootenv(i))
def make_dataset(name):
    inputs, psi, reference = load('{}.data'.format(name))
    # train/test split
    trainset, testset = utils.split_dataset(
        utils.shuffle((inputs, psi)),  # shuffle dataset
        ratio=0.7)
    # test/valid split
    testset, validset = utils.split_dataset(testset, ratio=0.5)
    return (trainset, testset, validset)
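# Note: the utils.split_dataset used above is not shown in this file. A minimal
# sketch of a ratio-based splitter consistent with the call
# split_dataset((inputs, psi), ratio=0.7) might look like this; treating the
# dataset as a tuple of equally long arrays is an assumption.
def split_dataset(dataset, ratio=0.7):
    """Split each array in `dataset` at the given ratio and return two tuples."""
    inputs, targets = dataset
    cut = int(len(inputs) * ratio)
    return (inputs[:cut], targets[:cut]), (inputs[cut:], targets[cut:])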
def transfer(source_model, target_path, frozen_layer):
    # load data of branch
    df = pd.read_pickle(target_path)

    # create dataframe with netto sales, month, weekday, year
    df = pd.DataFrame(data=df.values, index=df.index, columns=['netto'])
    df = df.assign(month=df.index.month)
    df = df.assign(weekday=df.index.weekday)
    df = df.assign(year=df.index.year)

    # split into train and test
    train, test = split_dataset(df.values, 365)

    # prepare input data for branch
    n_input = 365
    train_x, train_y = to_supervised(train, n_input, 365)

    # load pre-trained model of source branch as base model
    base_model = load_model(source_model)

    # freeze specific layers of base model
    for layer in base_model.layers[:frozen_layer]:
        layer.trainable = False
    print("frozen layers: " + str(frozen_layer))

    # compile the model
    base_model.compile(loss='mse', optimizer='adam')

    # fit base_model with new data from branch
    n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
    input_data = [train_x[:, :, i].reshape((train_x.shape[0], n_timesteps, 1))
                  for i in range(n_features)]
    base_model.fit(input_data, train_y, epochs=20, batch_size=16, verbose=0)

    # evaluate fitted model
    mape = evaluate_model(train, test, base_model)
    return mape
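# split_dataset(df.values, 365) above reads like a time-ordered split that
# reserves the last 365 rows for testing; a one-line sketch under that
# assumption (the real helper may additionally reshape into weekly windows):
def split_dataset(values, n_test):
    """Keep the first rows for training and the trailing n_test rows for test."""
    return values[:-n_test], values[-n_test:]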
def get_feats_from_csv_in_partitions():
    """
    Extract the original features that are distributed in the dataset.
    Features are split according to the config.yaml file.
    """
    conf = utils.get_config()
    rows = [row for row in utils.load_csv()
            if utils.check_filter(row, conf['filters'])]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        rows, conf['valid_percent'], conf['test_percent'],
        rng=conf['rng_seed'])

    X_train, y_train, X_valid, y_valid, X_test, y_test = [], [], [], [], [], []
    prefixes = ['t_', 'i_', 's_']  # Feature names' prefixes
    datasets = [(X_train, y_train, train_rows),
                (X_test, y_test, test_rows),
                (X_valid, y_valid, valid_rows)]
    out = []
    for X, y, rows in datasets:
        for row in rows:
            X.append([float(v) for k, v in row.iteritems()
                      if len(filter(k.startswith, prefixes)) > 0])
            y.append(int(row['classification'] == 'Malign'))
        out.extend((np.asarray(X), np.asarray(y)))
    return out
def get_feats_from_imagenet_in_partitions():
    conf = utils.get_config()
    imagenet_data = os.path.join(conf['models_path'], 'decafnet',
                                 'imagenet.decafnet.epoch90')
    imagenet_meta = os.path.join(conf['models_path'], 'decafnet',
                                 'imagenet.decafnet.meta')
    net = DecafNet(imagenet_data, imagenet_meta)
    rows = utils.get_filtered_rows()
    sets = utils.split_dataset(rows, conf['valid_percent'],
                               conf['test_percent'], rng=conf['rng_seed'])
    feats = []
    ys = []
    for s in sets:
        X = np.zeros((len(s), 4096))
        y = np.zeros(len(s))
        for i, row in enumerate(s):
            try:
                log.info('processing %i-th of %i' % (i, len(s)))
                origin, im = utils.extract_roi(row, 30, True)
                scores = net.classify(np.asarray(im), center_only=True)
                X[i] = net.feature('fc7_cudanet_out')
                y[i] = utils.is_positive(row)
            except:
                continue
        feats.append(X)
        ys.append(y)
    return feats[0], ys[0], feats[1], ys[1], feats[2], ys[2]
def get_feats_in_partitions():
    """Extracts features from the whole dataset and splits them into train,
    validation and test sets."""
    conf = utils.get_config()
    paths = utils.get_paths()
    rows = utils.load_csv()
    filters = conf['filters']
    region_size = conf['region_size']
    region_stride = conf['region_stride']
    filtered_rows = [row for row in rows
                     if utils.check_filter(row, conf['filters'])]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        filtered_rows, conf['valid_percent'], conf['test_percent'],
        rng=conf['rng_seed'])

    conv = get_fprop_fn(False)
    print 'Getting features from train...'
    X_train = get_feats_from_rows(train_rows, conv, conf['stride'])
    print 'Getting features from valid...'
    X_valid = get_feats_from_rows(valid_rows, conv, conf['stride'])
    print 'Getting features from test...'
    X_test = get_feats_from_rows(test_rows, conv, conf['stride'])
    y_train = [row['classification'] == 'Malign' for row in train_rows]
    y_valid = [row['classification'] == 'Malign' for row in valid_rows]
    y_test = [row['classification'] == 'Malign' for row in test_rows]
    return X_train, y_train, X_valid, y_valid, X_test, y_test
def initialize_train_test(self, train_ratio=0.75, sample=True, full=False):
    '''
    This function initializes the training and the test sets. The training
    set is initialized to a fraction of the original dataset given by the
    input parameter train_ratio, while the test set is initialized to the
    remaining part of the original dataset. Notice that the training and
    the test sets are copies of the original dataset, so any changes made
    to them will not affect the original one.
    The default value for the parameter train_ratio is 0.75.

    If the input parameter sample is True, then the training set is built
    by sampling rows from the original dataset. Otherwise, it is built from
    the first part of the dataset. The default value for sample is True.

    If the input parameter full is True, then the values of the other
    parameters are ignored and both the training and the test sets are
    initialized as exact copies of the full original dataset.
    '''
    if full:
        self.train = self.dataset.copy()
        self.test = self.dataset.copy()
    else:
        if sample:
            self.train, self.test = utils.split_dataset_sample(
                self.dataset, train_ratio)
        else:
            self.train, self.test = utils.split_dataset(
                self.dataset, train_ratio)
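# Hypothetical implementations of the two helpers referenced above, assuming
# self.dataset is a pandas DataFrame: split_dataset_sample draws a random
# subset for training, while split_dataset simply takes the first fraction of
# rows. Both return copies, matching the docstring.
import pandas as pd

def split_dataset(df, train_ratio=0.75):
    cut = int(len(df) * train_ratio)
    return df.iloc[:cut].copy(), df.iloc[cut:].copy()

def split_dataset_sample(df, train_ratio=0.75, random_state=None):
    train = df.sample(frac=train_ratio, random_state=random_state)
    test = df.drop(train.index)
    return train.copy(), test.copy()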
def xgb_boost_model():
    df_all = pickle.load(open("../output/features/basic_features.pkl", 'r'))
    test_ind = df_all.relevance == -1
    test_data = df_all[test_ind]
    train_data = df_all[~test_ind]
    test_data = test_data.drop(['relevance'], axis=1)
    le = preprocessing.LabelEncoder()
    le.fit(train_data['relevance'])
    ids = test_data['id']
    train, test, hold_out = utils.split_dataset(train_data)
    relevant_columns = ['title_similarity', 'product_desc_similarity',
                        'title_similarity_common',
                        'product_desc_similarity_common',
                        'description_length', 'search_length']
    dTrain = xgb.DMatrix(train['X'][relevant_columns], label=train['Y'])
    dTest = xgb.DMatrix(test['X'][relevant_columns], label=test['Y'])
    dHold_out = xgb.DMatrix(hold_out['X'][relevant_columns], label=hold_out['Y'])
    dSubmit = xgb.DMatrix(test_data[relevant_columns])
    param = {'bst:max_depth': 5, 'bst:eta': 0.05, 'silent': 1,
             'objective': 'reg:linear', 'eval_metric': 'rmse'}
    evallist = [(dTest, 'eval'), (dTrain, 'train')]
    numRound = 200
    bst = xgb.train(param, dTrain, numRound, evallist)
    predHoldout = bst.predict(dHold_out)
    print "Mean square hold out error ", utils.rmse(hold_out['Y'], predHoldout)
    predY = bst.predict(dSubmit)
    utils.debug_model(hold_out['X'], hold_out['Y'], predY)
def run_network(window, model=None, save_model=False, show_plot=False):
    start_time = time.time()

    print('loading and preparing data set...')
    data = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    X_train, y_train, X_test, y_test, mean, std = split_dataset(
        data, window, ratio=0.90, standardize=True)
    print('number of training samples ', len(y_train))
    print('number of test samples ', len(y_test))

    if not model:
        print('initialize model...')
        model = compile_model(
            hidden_neurons=25,
            loss_fn='mse',
            input_dim=sum(1 for x in window if x),
            activation_fn='tanh')
    print('model ', model.summary())

    print('train model...')
    early_stopping = EarlyStopping(monitor='val_loss', patience=2)
    model.fit(X_train, y_train, nb_epoch=500, validation_split=0.1,
              callbacks=[early_stopping])

    print('make predictions...')
    prediction = model.predict(X_test).flatten()

    if show_plot:
        plot_result(prediction, y_test, mean, std)

    print('mase = ', mase(y_train, y_test, prediction))

    if save_model:
        store_model(model)

    print('total duration: {:.2f} seconds'.format(time.time() - start_time))
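# The mase() metric printed above is not defined in this snippet; the usual
# mean absolute scaled error (assumed here) scales the test MAE by the
# in-sample naive one-step forecast error:
import numpy as np

def mase(y_train, y_test, prediction):
    naive_error = np.mean(np.abs(np.diff(y_train)))
    return np.mean(np.abs(y_test - prediction)) / naive_error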
def main():
    model = create_model()
    model.summary()

    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)

    # Expand data dimension for kernel to convolve over
    X_train = np.expand_dims(X_train, axis=2)  # (None, 46, 1)
    X_test = np.expand_dims(X_test, axis=2)    # (None, 46, 1)

    # create model
    model = KerasClassifier(build_fn=create_model, verbose=0)

    # Operational Phase
    scorer = make_scorer(f1_score, pos_label='+')

    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_CNN(model, X_train, Y_train,
                                            X_test, Y_test, scorer)
    Y_pred_grid_search = np.squeeze(Y_pred_grid_search)
    print()
    print()
    print(Y_pred_grid_search)
    print()
    print(Y_test)
    print()
    print_scores(Y_test, Y_pred_grid_search)
def fit(self, X, y):
    """Fit the tree on X, y."""
    try:
        # only for numpy arrays for now
        if not isinstance(X, np.ndarray):
            X = np.array(X)
        if not isinstance(y, np.ndarray):
            y = np.array(y)

        if self.max_depth is None:
            # The absolute maximum depth would be N - 1, where N is the number
            # of training samples.
            # https://stats.stackexchange.com/questions/65893/maximal-depth-of-a-decision-tree
            self.max_depth = X.shape[0] - 1
        if self.n_clasess is None:
            self.n_clasess = len(set(y))

        assert X.shape[0] > self.min_samples_split
        if self.max_depth is not None:
            assert self.max_depth > 0

        gain, column_idx, threshold = self._find_best_split(X, y)
        X_left, X_right, y_left, y_right = split_dataset(X, y,
                                                         column=column_idx,
                                                         t=threshold)
        self.node = Node(feature_idx=column_idx, threshold=threshold,
                         labels=y, gain=gain)

        # build left and right children
        self.node.left = DecisionTree(criterion=self.criterion,
                                      debug=self.debug,
                                      max_depth=self.max_depth - 1)
        # if base class for random forest -> remove that attribute
        self.node.left.x_columns = self.x_columns
        self.node.left.n_clasess = self.n_clasess
        self.node.left.fit(X_left, y_left)

        self.node.right = DecisionTree(criterion=self.criterion,
                                       debug=self.debug,
                                       max_depth=self.max_depth - 1)
        # if base class for random forest -> remove that attribute
        self.node.right.x_columns = self.x_columns
        self.node.right.n_clasess = self.n_clasess
        self.node.right.fit(X_right, y_right)

    # not the best idea, it is impossible to check for other conditions with assert :(
    except AssertionError:
        self.node = Node()
        self._predict_from_leaf(y)

    # test info about predictions
    if self.debug:
        print("Is Last Node: ", self.node.is_last)
        print("Data shapes: ", X.shape, y.shape)
        print("Y: ", y)
        print("Prediction: ", self.node.node_prediction)
        print("Predict proba: ", self.node.node_prob_prediction)
    return self
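# The split_dataset used by the tree is a plain threshold split on a single
# column; a sketch consistent with split_dataset(X, y, column=..., t=...)
# follows (whether the comparison is <= or < is an assumption):
import numpy as np

def split_dataset(X, y, column, t):
    """Partition (X, y) by comparing X[:, column] against threshold t."""
    mask = X[:, column] <= t
    return X[mask], X[~mask], y[mask], y[~mask]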
def on_dataset_change(args):
    if args['operation'] == 'create':
        with dispatcher.get_lock('bootenvs'):
            boot_pool = dispatcher.call_sync('zfs.pool.get_boot_pool')
            bootenvs.propagate(args, lambda x: convert_bootenv(boot_pool, x))

    if args['operation'] == 'delete':
        for i in args['ids']:
            pool, dataset = split_dataset(i)
            if pool != boot_pool_name:
                continue

            with dispatcher.get_lock('bootenvs'):
                realname = dataset.split('/')[-1]
                ds = bootenvs.query(('realname', '=', realname), single=True)
                if ds:
                    bootenvs.remove(ds['id'])

    if args['operation'] == 'update':
        boot_pool = None
        for i in args['entities']:
            pool, dataset = split_dataset(i['id'])
            if pool != boot_pool_name:
                continue

            with dispatcher.get_lock('bootenvs'):
                realname = dataset.split('/')[-1]
                ds = bootenvs.query(('realname', '=', realname), single=True)
                if not ds:
                    continue

                nickname = q.get(i, 'properties.beadm:nickname.value', realname)
                if nickname and nickname != ds['id']:
                    bootenvs.rename(ds['id'], nickname)

                if not boot_pool:
                    boot_pool = dispatcher.call_sync('zfs.pool.get_boot_pool')

                bootenvs.put(nickname, convert_bootenv(boot_pool, i))
def test_split_dataset(self):
    filename = '/path/to/dataset.tsv'
    train_filename = '/path/to/dataset_train.tsv'
    dev_filename = '/path/to/dataset_dev.tsv'
    read_data = 'Rock n Roll is a risk. You rick being ridiculed.\tDo you like rock music?\n' \
                'Rock n Roll is a risk.\tDo you like rock?'
    open_ = patch('utils.open', mock_open(read_data=read_data)).start()
    open_.return_value.__iter__.return_value = read_data.split('\n')
    writer = patch('csv.writer').start()

    split_dataset(filename)

    open_.assert_has_calls([
        call(filename),
        call(train_filename, 'w'),
        call(dev_filename, 'w')
    ], any_order=True)
    writer.assert_called_with(open_.return_value, delimiter='\t')
    writer.return_value.writerow.assert_called()

    patch.stopall()
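# The split_dataset exercised by this test reads a .tsv file and writes
# *_train.tsv and *_dev.tsv next to it. A sketch that would satisfy the mocked
# calls above (the 90/10 ratio is an assumption):
import csv
import random

def split_dataset(filename, dev_fraction=0.1):
    base = filename.rsplit('.tsv', 1)[0]
    with open(filename) as f:
        rows = [line.rstrip('\n').split('\t') for line in f]
    random.shuffle(rows)
    n_dev = max(1, int(len(rows) * dev_fraction))
    splits = [(base + '_train.tsv', rows[n_dev:]), (base + '_dev.tsv', rows[:n_dev])]
    for out_name, subset in splits:
        with open(out_name, 'w') as out:
            writer = csv.writer(out, delimiter='\t')
            for row in subset:
                writer.writerow(row)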
def do_experiment_for_one_year(run_path, year, config):
    """Performs the specified experiments for one year."""
    X, Y = load_dataset(year, shuffle=config['experiment']['shuffle_data'])
    if config['experiment']['type'] == 'single':
        X_train, Y_train, X_test, Y_test = split_dataset(
            X, Y, config['experiment']['test_share'])
        results = perform_one_experiment(X_train, Y_train, X_test, Y_test,
                                         config)
    elif config['experiment']['type'] == 'cv':
        results = perform_cv_runs(X, Y, config)

    results_path = os.path.join(run_path, 'results_year{}.pkl'.format(year))
    with open(results_path, 'wb') as f:
        pickle.dump(results, f)
    show_results(results, year, **config['analysis'])
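# split_dataset here returns (X_train, Y_train, X_test, Y_test) given a test
# share; a minimal sketch under that assumption (the data is already shuffled
# by load_dataset, so a tail split suffices):
def split_dataset(X, Y, test_share):
    cut = int(round(len(X) * (1.0 - test_share)))
    return X[:cut], Y[:cut], X[cut:], Y[cut:]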
def fit(self, samples, labels):
    """Train the model with the samples and labels provided, according to
    the parameters of the model."""
    # Split into train and dev
    x_train, y_train, x_dev, y_dev = split_dataset(samples, labels,
                                                   self.dev_share)

    # Create batch iterator
    if self.batch_iterator_type == 'normal':
        batch_iter = _batch_iter
    elif self.batch_iterator_type == 'oversample':
        batch_iter = _oversampling_batch_iter
    else:
        raise ValueError('{} is not a valid batch_iterator_type'.format(
            self.batch_iterator_type))

    # Train model
    train_batch_nr = []
    train_loss_val = []
    dev_batch_nr = []
    dev_loss_val = []
    for i, (x, y) in enumerate(
            batch_iter(x_train, y_train, self.num_epochs, self.batch_size)):
        # Train
        feed_dict = {
            self.graph_nodes['x_input']: x,
            self.graph_nodes['y_input']: y,
            self.graph_nodes['dropout_keep_prob']: self.dropout_keep_prob
        }
        _, loss_val = self.sess.run(
            [self.graph_nodes['optimize'], self.graph_nodes['loss']],
            feed_dict=feed_dict)
        train_batch_nr.append(i)
        train_loss_val.append(loss_val)

        # Periodically evaluate on the dev set
        if i % self.evaluate_every_n_steps == 0:
            feed_dict = {
                self.graph_nodes['x_input']: x_dev,
                self.graph_nodes['y_input']: y_dev,
                self.graph_nodes['dropout_keep_prob']: 1.
            }
            loss_val = self.sess.run(self.graph_nodes['loss'],
                                     feed_dict=feed_dict)
            dev_batch_nr.append(i)
            dev_loss_val.append(loss_val)

    if self.plot_training:
        plt.plot(train_batch_nr, train_loss_val)
        plt.plot(dev_batch_nr, dev_loss_val)
        plt.show()
def pretrain_model(path):
    df = pd.read_pickle(path)

    # create dataframe with netto sales, month, weekday, year
    df = pd.DataFrame(data=df.values, index=df.index, columns=['netto'])
    df = df.assign(month=df.index.month)
    df = df.assign(weekday=df.index.weekday)
    df = df.assign(year=df.index.year)

    # split into train and test
    train, test = split_dataset(df.values, 365)

    # evaluate model and get scores
    model = evaluate_model(train, test)

    # save model
    model.save("models/" + path + ".h5")
    del model
def run(self, id, updated_fields):
    share = self.datastore.get_by_id('shares', id)
    remove_unchanged(updated_fields, share)

    path = self.dispatcher.call_sync('share.get_directory_path', share['id'])
    try:
        delete_config(
            path,
            '{0}-{1}'.format(share['type'], share['name'])
        )
    except OSError:
        pass

    if 'type' in updated_fields:
        old_share_type = share['type']
        new_share_type = self.dispatcher.call_sync('share.supported_types').get(updated_fields['type'])
        if share['target_type'] == 'DATASET':
            pool, dataset = split_dataset(share['target_path'])
            self.join_subtasks(
                self.run_subtask('volume.dataset.update', dataset, {
                    'permissions_type': new_share_type['perm_type']
                })
            )

        share.update(updated_fields)
        self.join_subtasks(self.run_subtask('share.{0}.delete'.format(old_share_type), id))
        self.join_subtasks(self.run_subtask('share.{0}.create'.format(updated_fields['type']), share))
    else:
        self.join_subtasks(self.run_subtask('share.{0}.update'.format(share['type']), id, updated_fields))

    if 'permissions' in updated_fields:
        path = self.dispatcher.call_sync('share.translate_path', id)
        self.join_subtasks(self.run_subtask('file.set_permissions', path, updated_fields['permissions']))

    self.dispatcher.dispatch_event('share.changed', {
        'operation': 'update',
        'ids': [share['id']]
    })

    updated_share = self.datastore.get_by_id('shares', id)
    path = self.dispatcher.call_sync('share.get_directory_path', updated_share['id'])
    try:
        save_config(
            path,
            '{0}-{1}'.format(updated_share['type'], updated_share['name']),
            updated_share
        )
    except OSError as err:
        self.add_warning(TaskWarning(errno.ENXIO, 'Cannot save backup config file: {0}'.format(str(err))))
def train_and_test(df, preds, seed):
    '''
    Run a single trial:
        Shuffle df and split it into training and testing subsets.
        Train a new model on the training set.
        Test the model with the testing set.
        Add prediction data into the preds array.

    :param df: dataframe with the full set of all available samples
        columns: id, cat1 (primary class), cat2 (secondary), title,
        titlen (cleaned title)
    :param preds: an array of predictions, each prediction is a dictionary
        cat: true category, pred: predicted category,
        conf: model confidence in its prediction (< 1.0),
        title: actual title of the chapter/sample
    :return: average testing accuracy
    '''
    ret = {}

    # PREPS
    # randomly split the dataset
    df = utils.split_dataset(
        df,
        settings.CAT_DEPTH,
        settings.TRAIN_PER_CLASS_MIN,
        settings.TEST_PER_CLASS,
        settings.VALID_PER_CLASS,
    )

    # TRAIN
    classifier = Classifier.from_name(settings.CLASSIFIER, seed)
    classifier.set_datasets(df, titles_out_path)
    classifier.train()

    df_test = classifier.df_test

    if settings.EVALUATE_TRAINING_SET:
        evaluate_model(classifier, classifier.df_train,
                       display_prefix='TRAIN = ')
    accuracy = evaluate_model(classifier, df_test, preds,
                              display_prefix='TEST = ')

    classifier_key = utils.get_exp_key(classifier)

    classifier.release_resources()

    return classifier_key, accuracy, classifier.df_train
def train(self, X_train, y_train):
    """
    Train a Regression Forest using the given training data by training
    a number of Regression Trees, each with a random sample of the
    training data.
    """
    train_dataset = np.c_[(X_train, y_train)]
    for i in range(self.n_estimators):
        bootstrap_sample = train_dataset[np.random.choice(
            train_dataset.shape[0],
            size=int(round(train_dataset.shape[0] * self.split)),
            replace=True)]
        X_train, y_train, _, _ = split_dataset(bootstrap_sample, self.split,
                                               is_print=False)
        tree = RegressionTree(self.n_features, self.max_depth)
        tree.train(X_train, y_train)
        self.random_forest.append(tree)
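# The split_dataset called on each bootstrap sample appears to take a combined
# [X | y] matrix plus a split fraction and to return X_train, y_train, X_test,
# y_test; a sketch under that assumption:
def split_dataset(dataset, split, is_print=True):
    cut = int(round(dataset.shape[0] * split))
    train, test = dataset[:cut], dataset[cut:]
    if is_print:
        print('train: {}, test: {}'.format(len(train), len(test)))
    return train[:, :-1], train[:, -1], test[:, :-1], test[:, -1]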
def __init__(self, hparams=None):
    super().__init__()

    # Metrics
    self.train_acc = pl.metrics.Accuracy()
    self.val_acc = pl.metrics.Accuracy(compute_on_step=False)
    self.test_acc = pl.metrics.Accuracy(compute_on_step=False)

    # Hyperparameters
    self.hparams = hparams

    # Data
    self.train_data, self.test_data, self.val_data = split_dataset()

    # Model initialization
    multiplier = 2 if hparams["bidirectional"] else 1
    self.word_vec_size = 300 * multiplier
    self.amount_classes = 7

    self.rnn = nn.LSTM(input_size=self.word_vec_size,
                       hidden_size=hparams["lstm_hidden_dim"],
                       bidirectional=hparams["bidirectional"],
                       num_layers=hparams["lstm_num_layers"])

    # First FC layer
    modules = [
        nn.Linear(self.word_vec_size, hparams["FC_layer_dims"][0]),
        nn.ReLU(),
        nn.Dropout(hparams["FC_dropouts"][0])
    ]

    # Middle FC layers
    for i, (dim, d_rate) in enumerate(
            zip(hparams["FC_layer_dims"], hparams["FC_dropouts"])):
        if i == len(hparams["FC_layer_dims"]) - 1:
            continue  # we reached the end
        modules.append(nn.Linear(dim, hparams["FC_layer_dims"][i + 1]))
        modules.append(nn.ReLU())
        modules.append(nn.Dropout(d_rate))

    # Last FC layer
    modules.append(nn.Linear(hparams["FC_layer_dims"][-1],
                             self.amount_classes))
    modules.append(nn.ReLU())
    modules.append(nn.Dropout(hparams["FC_dropouts"][-1]))

    self.classifier = nn.Sequential(*modules)
def main():
    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)

    # Operational Phase
    scorer = make_scorer(f1_score, pos_label='+')

    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_mlp(X_train, Y_train, X_test, Y_test,
                                            scorer)
    print()
    print()
    print(Y_pred_grid_search)
    print()
    print(Y_test)
    print()
    print_scores(Y_test, Y_pred_grid_search)
def main(args: Namespace):
    results_path = args.log_dir / str(datetime.now())
    results_path.mkdir(exist_ok=True, parents=True)
    write_args(results_path, vars(args))

    fix_seed(args.seed)

    height_model = 1000
    width_model = 24

    filenames_train, filenames_valid, filenames_test = split_dataset(
        args.data_root, args.fracs_dataset)

    train_set = SeisDataset(filenames_train,
                            height_model=height_model,
                            width_model=width_model,
                            prob_aug=args.prob_aug)
    valid_set = SeisDataset(filenames_valid,
                            height_model=height_model,
                            width_model=width_model,
                            prob_aug=args.prob_aug)
    test_set = SeisDataset(filenames_test,
                           height_model=height_model,
                           width_model=width_model,
                           prob_aug=args.prob_aug)

    net = UNetFB()
    picker = Picker(net)
    stopper = Stopper(args.n_wrongs, args.delta_wrongs)

    trainer = Trainer(picker=picker,
                      results_path=results_path,
                      train_set=train_set,
                      valid_set=valid_set,
                      test_set=test_set,
                      device=args.device,
                      batch_size=args.batch_size,
                      lr=args.lr,
                      freq_valid=args.freq_valid,
                      num_workers=args.num_workers,
                      dt_ms=args.dt_ms,
                      height_model=height_model,
                      width_model=width_model,
                      visual=args.visual,
                      stopper=stopper,
                      weights=torch.tensor(args.weights))

    trainer.train(num_epoch=args.num_epoch)
def run_custom_classifier(weights=None):
    training_images = utils.load_images_from_dir(TRAINING_DIR)
    testing_images = utils.load_images_from_dir(TESTING_DIR)
    X_train, X_test, y_train, y_test = utils.split_dataset(
        training_images, testing_images, TRAINING_FILE, TESTING_FILE)

    # preprocess data
    X_train, y_train = preprocess(X_train, y_train)
    X_test, y_test = preprocess(X_test, y_test)

    # compile model
    model = custom_classifier()

    # train if no weights are passed in
    if weights is None:
        history = model.fit(X_train, y_train, epochs=50, verbose=1,
                            validation_data=(X_test, y_test))
        model.save_weights(os.path.join(WEIGHTS_DIR, 'custom_model.h5'))
    else:
        model.load_weights(weights)

    scores = model.evaluate(X_train, y_train, verbose=1)

    print('Digit 1 loss:', scores[1])
    print('Digit 2 loss:', scores[2])
    print('Digit 3 loss:', scores[3])
    print('Digit 4 loss:', scores[4])
    print('Digit 5 loss:', scores[5])
    average_loss = sum([scores[i] for i in range(1, 6)]) / 5
    print('Average loss:', average_loss)

    print('Digit 1 accuracy:', scores[6])
    print('Digit 2 accuracy:', scores[7])
    print('Digit 3 accuracy:', scores[8])
    print('Digit 4 accuracy:', scores[9])
    print('Digit 5 accuracy:', scores[10])
    average_accuracy = sum([scores[i] for i in range(6, 11)]) / 5
    print('Average accuracy:', average_accuracy)
def obtain_train_test(path, ifilename, ftrain_name, ftest_name, frac_test=0.2):
    df = pd.read_csv(os.path.join(path, ifilename))
    df['date'] = pd.to_datetime(df['date'])
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', axis=1, inplace=True)
    groups = list(df['inlet'].unique())
    # 80/20% split for all inlets
    dftrain, dftest = split_dataset(df, frac_test=frac_test, groups=groups)
    if (dftrain.shape[0] + dftest.shape[0]) != df.shape[0]:
        raise ValueError(
            'The shapes of the resulting files are inconsistent with the shape of the input table!'
        )
    else:
        dftrain.to_csv(os.path.join(path, ftrain_name), index=False)
        dftest.to_csv(os.path.join(path, ftest_name), index=False)
    return dftrain, dftest
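# A sketch of the group-aware split assumed above: for every inlet, hold out
# frac_test of its rows, so each group appears in both output tables. The
# actual helper is not shown; the 'inlet' column name is taken from the call site.
import pandas as pd

def split_dataset(df, frac_test=0.2, groups=None):
    if groups is None:
        groups = df['inlet'].unique()
    test_parts = [df[df['inlet'] == g].sample(frac=frac_test, random_state=0)
                  for g in groups]
    dftest = pd.concat(test_parts)
    dftrain = df.drop(dftest.index)
    return dftrain, dftest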
def create_loaders(dataset_name, dataset_train, dataset_val, dataset_test,
                   train_size, val_size, batch_size, test_batch_size, cuda,
                   num_workers, topk=None, noise=False):
    kwargs = {'num_workers': num_workers, 'pin_memory': True} if cuda else {}

    dataset_train, dataset_val = split_dataset(dataset_train, dataset_val,
                                               train_size, val_size)

    print('Dataset sizes: \t train: {} \t val: {} \t test: {}'.format(
        len(dataset_train), len(dataset_val), len(dataset_test)))

    train_loader = data.DataLoader(dataset_train, batch_size=batch_size,
                                   shuffle=True, **kwargs)
    val_loader = data.DataLoader(dataset_val, batch_size=test_batch_size,
                                 shuffle=False, **kwargs)
    test_loader = data.DataLoader(dataset_test, batch_size=test_batch_size,
                                  shuffle=False, **kwargs)

    train_loader.tag = 'train'
    val_loader.tag = 'val'
    test_loader.tag = 'test'
    return train_loader, val_loader, test_loader
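# dataset_train and dataset_val above typically wrap the same underlying data
# with different transforms; a sketch of a split_dataset helper that carves
# out disjoint subsets of the requested sizes (assumed behaviour):
import torch
from torch.utils import data

def split_dataset(dataset_train, dataset_val, train_size, val_size):
    indices = torch.randperm(len(dataset_train)).tolist()
    train_subset = data.Subset(dataset_train, indices[:train_size])
    val_subset = data.Subset(dataset_val, indices[train_size:train_size + val_size])
    return train_subset, val_subset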
def exercicio2():
    utils.print_header(2)
    data, classes = load_balance_scale(
        os.path.join(constants.DATA_DIR, constants.FILENAME_BALANCE_DATABASE))
    print('Nb samples: {}'.format(data.shape[0]))

    gaussian_accuracy, discrete_accuracy, laplace_accuracy = [], [], []
    np.random.seed(constants.SEED)
    for i in range(10):
        x_train, y_train, x_test, y_test = utils.split_dataset(data)
        params = {'mean': {}, 'std': {}, 'classes': classes,
                  'prior': {}, 'discrete_prob': {}}
        for c in classes:
            params['prior'][c] = sum(y_train == c) / float(x_train.shape[0])
            x_c = x_train[y_train == c]
            params['mean'][c] = np.mean(x_c, axis=0)
            params['std'][c] = np.std(x_c, axis=0)
            params['discrete_prob'][c] = {}
            for j in range(x_c.shape[1]):
                params['discrete_prob'][c][j] = {}
                for k in [1, 2, 3, 4, 5]:
                    params['discrete_prob'][c][j][k] = {
                        'sum': sum(x_c[:, j] == k),
                        'n': x_c.shape[0],
                    }
        gaussian_pred = gaussian_predict(x_test, params)
        gaussian_accuracy.append(utils.accuracy(y_test, gaussian_pred))
        discrete_pred = discrete_predict(x_test, params, laplace=False)
        discrete_accuracy.append(utils.accuracy(y_test, discrete_pred))
        laplace_pred = discrete_predict(x_test, params, laplace=True)
        laplace_accuracy.append(utils.accuracy(y_test, laplace_pred))

    print('a)')
    print('\tGaussian - Accuracy: {:.2f} +- {:.2f}'.format(
        np.mean(gaussian_accuracy), np.std(gaussian_accuracy)))
    print('b)')
    print('\tDiscrete - Accuracy: {:.2f} +- {:.2f}'.format(
        np.mean(discrete_accuracy), np.std(discrete_accuracy)))
    print('c)')
    print('\tDiscrete (with Laplace) - Accuracy: {:.2f} +- {:.2f}'.format(
        np.mean(laplace_accuracy), np.std(laplace_accuracy)))
    exit()
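# utils.split_dataset(data) above returns x_train, y_train, x_test, y_test
# from a single matrix; a sketch assuming the class label sits in the last
# column (it may equally be the first) and a 70/30 shuffled split:
import numpy as np

def split_dataset(data, train_ratio=0.7):
    idx = np.random.permutation(data.shape[0])
    cut = int(data.shape[0] * train_ratio)
    train, test = data[idx[:cut]], data[idx[cut:]]
    return train[:, :-1], train[:, -1], test[:, :-1], test[:, -1]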
def main():
    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)
    clf_entropy = train_using_entropy(X_train, Y_train)

    # Operational Phase
    print("\n### SINGLE TRAIN-TEST SPLIT ###\n")
    Y_pred_entropy = prediction(X_test, clf_entropy)
    print_scores(Y_test, Y_pred_entropy)

    print("\n### CROSS VAL USING STRATIFIED K FOLD ###\n")
    fold_scores = cv_with_entropy(X, Y)
    print("Cross Validate: ", fold_scores)
    print("Best F1_score: ", max(fold_scores) * 100)

    scorer = make_scorer(f1_score, pos_label='+')
    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_DT(X_train, Y_train, X_test, Y_test,
                                           scorer)
    print_scores(Y_test, Y_pred_grid_search)
def objective(params):
    nneurons = params['nneurons']
    # use elif/else so the 'full_day' window is not overwritten by the
    # fallback branch
    if params['season'] == 'full_day':
        window = create_window_array(params['window'], season_lag=288)
    elif params['season'] == 'half_day':
        window = create_window_array(params['window'], season_lag=168)
    else:
        window = create_window_array(params['window'])

    if not any(window) or nneurons < 2:
        return {'status': STATUS_FAIL}

    X_train, y_train, *_ = split_dataset(
        data, window, ratio=0.90, standardize=True)

    model = compile_model(
        nneurons,
        input_dim=sum(1 for x in window if x),
        loss_fn='mse',
        activation_fn=params['activation_function'])

    hist = model.fit(
        X_train, y_train,
        nb_epoch=50,
        validation_split=0.1,
        callbacks=[EarlyStopping(monitor='val_loss', patience=2)],
        verbose=0)

    return {'loss': hist.history['val_loss'][-1], 'status': STATUS_OK}
def main(args): #################### # Arguments gpu = args.gpu model_name = args.model initial_tree_sampling = args.initial_tree_sampling path_config = args.config data_augmentation = args.data_augmentation trial_name = args.name actiontype = args.actiontype max_epoch = args.max_epoch dev_size = args.dev_size # Check assert actiontype in ["train", "evaluate"] if actiontype == "train": assert max_epoch > 0 assert len(initial_tree_sampling.split("_")) == 3 for type_ in initial_tree_sampling.split("_"): assert type_ in ["X", "BU", "TD", "RB", "LB", "RB2"] assert initial_tree_sampling.split("_")[2] != "X" assert initial_tree_sampling.split("_")[1] != "RB2" assert initial_tree_sampling.split("_")[2] != "RB2" if trial_name is None or trial_name == "None": trial_name = utils.get_current_time() #################### # Path setting config = utils.Config(path_config) basename = "%s.%s.%s.aug_%s.%s" \ % (model_name, initial_tree_sampling, utils.get_basename_without_ext(path_config), data_augmentation, trial_name) if actiontype == "train": path_log = os.path.join(config.getpath("results"), basename + ".training.log") elif actiontype == "evaluate": path_log = os.path.join(config.getpath("results"), basename + ".evaluation.log") path_train = os.path.join(config.getpath("results"), basename + ".training.jsonl") path_valid = os.path.join(config.getpath("results"), basename + ".validation.jsonl") path_snapshot = os.path.join(config.getpath("results"), basename + ".model") path_pred = os.path.join(config.getpath("results"), basename + ".evaluation.ctrees") path_eval = os.path.join(config.getpath("results"), basename + ".evaluation.json") utils.set_logger(path_log) #################### # Random seed random_seed = trial_name random_seed = utils.hash_string(random_seed) random.seed(random_seed) np.random.seed(random_seed) cuda.cupy.random.seed(random_seed) #################### # Log so far utils.writelog("gpu=%d" % gpu) utils.writelog("model_name=%s" % model_name) utils.writelog("initial_tree_sampling=%s" % initial_tree_sampling) utils.writelog("path_config=%s" % path_config) utils.writelog("data_augmentation=%s" % data_augmentation) utils.writelog("trial_name=%s" % trial_name) utils.writelog("actiontype=%s" % actiontype) utils.writelog("max_epoch=%s" % max_epoch) utils.writelog("dev_size=%s" % dev_size) utils.writelog("path_log=%s" % path_log) utils.writelog("path_train=%s" % path_train) utils.writelog("path_valid=%s" % path_valid) utils.writelog("path_snapshot=%s" % path_snapshot) utils.writelog("path_pred=%s" % path_pred) utils.writelog("path_eval=%s" % path_eval) utils.writelog("random_seed=%d" % random_seed) #################### # Data preparation begin_time = time.time() train_dataset = dataloader.read_rstdt("train", relation_level="coarse-grained", with_root=False) test_dataset = dataloader.read_rstdt("test", relation_level="coarse-grained", with_root=False) vocab_word = utils.read_vocab( os.path.join(config.getpath("data"), "rstdt-vocab", "words.vocab.txt")) vocab_postag = utils.read_vocab( os.path.join(config.getpath("data"), "rstdt-vocab", "postags.vocab.txt")) vocab_deprel = utils.read_vocab( os.path.join(config.getpath("data"), "rstdt-vocab", "deprels.vocab.txt")) if data_augmentation: external_train_dataset = dataloader.read_ptbwsj_wo_rstdt( with_root=False) # Remove documents with only one leaf node external_train_dataset = utils.filter_dataset( external_train_dataset, condition=lambda data: len(data.edu_ids) > 1) end_time = time.time() utils.writelog("Loaded the corpus. 
%f [sec.]" % (end_time - begin_time)) #################### # Hyper parameters word_dim = config.getint("word_dim") postag_dim = config.getint("postag_dim") deprel_dim = config.getint("deprel_dim") lstm_dim = config.getint("lstm_dim") mlp_dim = config.getint("mlp_dim") n_init_epochs = config.getint("n_init_epochs") negative_size = config.getint("negative_size") batch_size = config.getint("batch_size") weight_decay = config.getfloat("weight_decay") gradient_clipping = config.getfloat("gradient_clipping") optimizer_name = config.getstr("optimizer_name") utils.writelog("word_dim=%d" % word_dim) utils.writelog("postag_dim=%d" % postag_dim) utils.writelog("deprel_dim=%d" % deprel_dim) utils.writelog("lstm_dim=%d" % lstm_dim) utils.writelog("mlp_dim=%d" % mlp_dim) utils.writelog("n_init_epochs=%d" % n_init_epochs) utils.writelog("negative_size=%d" % negative_size) utils.writelog("batch_size=%d" % batch_size) utils.writelog("weight_decay=%f" % weight_decay) utils.writelog("gradient_clipping=%f" % gradient_clipping) utils.writelog("optimizer_name=%s" % optimizer_name) #################### # Model preparation cuda.get_device(gpu).use() # Initialize a model utils.mkdir(os.path.join(config.getpath("data"), "caches")) path_embed = config.getpath("pretrained_word_embeddings") path_caches = os.path.join( config.getpath("data"), "caches", "cached." + os.path.basename(path_embed) + ".npy") if os.path.exists(path_caches): utils.writelog("Loading cached word embeddings ...") initialW = np.load(path_caches) else: initialW = utils.read_word_embedding_matrix(path=path_embed, dim=word_dim, vocab=vocab_word, scale=0.0) np.save(path_caches, initialW) if model_name == "spanbasedmodel": # Span-based model w/ template features template_feature_extractor = models.TemplateFeatureExtractor( dataset=train_dataset) utils.writelog("Template feature size=%d" % template_feature_extractor.feature_size) if actiontype == "train": for template in template_feature_extractor.templates: dim = template_feature_extractor.template2dim[template] utils.writelog("Template feature #%s %s" % (dim, template)) model = models.SpanBasedModel( vocab_word=vocab_word, vocab_postag=vocab_postag, vocab_deprel=vocab_deprel, word_dim=word_dim, postag_dim=postag_dim, deprel_dim=deprel_dim, lstm_dim=lstm_dim, mlp_dim=mlp_dim, initialW=initialW, template_feature_extractor=template_feature_extractor) elif model_name == "spanbasedmodel2": # Span-based model w/o template features model = models.SpanBasedModel2(vocab_word=vocab_word, vocab_postag=vocab_postag, vocab_deprel=vocab_deprel, word_dim=word_dim, postag_dim=postag_dim, deprel_dim=deprel_dim, lstm_dim=lstm_dim, mlp_dim=mlp_dim, initialW=initialW) else: raise ValueError("Invalid model_name=%s" % model_name) utils.writelog("Initialized the model ``%s''" % model_name) # Load pre-trained parameters if actiontype != "train": serializers.load_npz(path_snapshot, model) utils.writelog("Loaded trained parameters from %s" % path_snapshot) model.to_gpu(gpu) #################### # Decoder preparation decoder = decoders.IncrementalCKYDecoder() #################### # Initializer preparation sampler = treesamplers.TreeSampler(initial_tree_sampling.split("_")) #################### # Training / evaluation if actiontype == "train": with chainer.using_config("train", True): if dev_size > 0: # Training with cross validation train_dataset, dev_dataset = utils.split_dataset( dataset=train_dataset, n_dev=dev_size, seed=None) with open( os.path.join(config.getpath("results"), basename + ".valid_gold.ctrees"), "w") as f: 
for data in dev_dataset: f.write("%s\n" % " ".join(data.nary_sexp)) else: # Training with the full training set dev_dataset = None if data_augmentation: train_dataset = np.concatenate( [train_dataset, external_train_dataset], axis=0) train(model=model, decoder=decoder, sampler=sampler, max_epoch=max_epoch, n_init_epochs=n_init_epochs, negative_size=negative_size, batch_size=batch_size, weight_decay=weight_decay, gradient_clipping=gradient_clipping, optimizer_name=optimizer_name, train_dataset=train_dataset, dev_dataset=dev_dataset, path_train=path_train, path_valid=path_valid, path_snapshot=path_snapshot, path_pred=os.path.join(config.getpath("results"), basename + ".valid_pred.ctrees"), path_gold=os.path.join(config.getpath("results"), basename + ".valid_gold.ctrees")) elif actiontype == "evaluate": with chainer.using_config("train", False), chainer.no_backprop_mode(): # Test parse(model=model, decoder=decoder, dataset=test_dataset, path_pred=path_pred) scores = metrics.rst_parseval( pred_path=path_pred, gold_path=os.path.join(config.getpath("data"), "rstdt", "wsj", "test", "gold.labeled.nary.ctrees")) old_scores = metrics.old_rst_parseval( pred_path=path_pred, gold_path=os.path.join(config.getpath("data"), "rstdt", "wsj", "test", "gold.labeled.nary.ctrees")) out = { "Morey2018": { "Unlabeled Precision": scores["S"]["Precision"] * 100.0, "Precision_info": scores["S"]["Precision_info"], "Unlabeled Recall": scores["S"]["Recall"] * 100.0, "Recall_info": scores["S"]["Recall_info"], "Micro F1": scores["S"]["Micro F1"] * 100.0 }, "Marcu2000": { "Unlabeled Precision": old_scores["S"]["Precision"] * 100.0, "Precision_info": old_scores["S"]["Precision_info"], "Unlabeled Recall": old_scores["S"]["Recall"] * 100.0, "Recall_info": old_scores["S"]["Recall_info"], "Micro F1": old_scores["S"]["Micro F1"] * 100.0 } } utils.write_json(path_eval, out) utils.writelog(utils.pretty_format_dict(out)) utils.writelog("Done: %s" % basename)
def boston_housing():
    """Trains algorithms for the Boston housing dataset."""
    # boston housing dataset is available in the sklearn library
    boston = load_boston()
    dimension = boston.data.shape[1]
    boston_data = np.column_stack((boston.data, boston.target))

    split = 2 / 3
    iterations = 20
    gammas = np.linspace(math.pow(2, -40), math.pow(2, -26), 15)
    sigmas = np.linspace(math.pow(2, 7), math.pow(2, 13), 14)
    k_fold = 5

    naive_results = np.zeros((iterations, 2))
    single_results = np.zeros((iterations, dimension, 2))
    all_results = np.zeros((iterations, 2))
    k_results = np.zeros((iterations, 2))

    for i in range(iterations):
        print("iterations i", i)
        training_set, testing_set = utils.split_dataset(boston_data, split)

        # naive
        _, train_mse, test_mse = lr.naive(training_set, testing_set)
        naive_results[i, 0] = train_mse
        naive_results[i, 1] = test_mse

        # single attribute
        for attr in range(dimension):
            _, train_mse, test_mse = lr.single_attribute(training_set,
                                                         testing_set, attr)
            single_results[i, attr, 0] = train_mse
            single_results[i, attr, 1] = test_mse

        # all attributes
        _, train_mse, test_mse = lr.all_attributes(training_set, testing_set)
        all_results[i, 0] = train_mse
        all_results[i, 1] = test_mse

        # kernel
        _, train_mse, test_mse = rr.gaussian_kernel_cross_validation(
            training_set, testing_set, gammas, sigmas, k_fold)
        k_results[i, 0] = train_mse
        k_results[i, 1] = test_mse

    # display naive results
    print("Naive MSE train: %s +- %s" % (
        np.mean(naive_results[:, 0]), np.std(naive_results[:, 0])))
    print("Naive MSE test: %s +- %s" % (
        np.mean(naive_results[:, 1]), np.std(naive_results[:, 1])))

    # display single attribute results
    for attr in range(dimension):
        id = attr + 1
        print("Linear (attribute %d) MSE train: %s +- %s" % (
            id, np.mean(single_results[:, attr, 0]),
            np.std(single_results[:, attr, 0])))
        print("Linear (attribute %d) MSE test: %s +- %s" % (
            id, np.mean(single_results[:, attr, 1]),
            np.std(single_results[:, attr, 1])))

    # display all attributes results
    print("Linear (all) MSE train: %s +- %s" % (
        np.mean(all_results[:, 0]), np.std(all_results[:, 0])))
    print("Linear (all) MSE test: %s +- %s" % (
        np.mean(all_results[:, 1]), np.std(all_results[:, 1])))

    # display kernel results
    print("Kernel MSE train: %s +- %s" % (
        np.mean(k_results[:, 0]), np.std(k_results[:, 0])))
    print("Kernel MSE test: %s +- %s" % (
        np.mean(k_results[:, 1]), np.std(k_results[:, 1])))
            self.__get_labels_neighborhood(row, distance)
            for row in X_classifier
        ]


if __name__ == "__main__":
    PATH_FILE = (
        "/home/nobrega/Dados/Documentos/Estudos/notes/dataset/knn_classification.csv"
    )
    df = pd.read_csv(PATH_FILE)
    df.pop("id")

    y = df["class"].values
    y = y.reshape(len(y), 1)
    df.pop("class")
    X = df.values

    X_train, y_train, X_test, y_test = split_dataset(X, y, 0.7)

    n_neighbors = 15
    knn = KNearestNeighbors(X_train, y_train, n_neighbors)

    for type_distance in DistanceTypes:
        try:
            y_hat = np.array(knn.predict(X_test, type_distance.value))
            y_test = np.ndarray.flatten(np.array(y_test))
            acc = sum((y_hat == y_test) * 1.0) / len(y_test)
            print(f"Type of Distance {type_distance.value} Accuracy: {acc}")
        except Exception:
            print(f"This distance type {type_distance.value} can't be calculated")
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
keep_topK = 200
USE_HOLD_OUT = True  # visualization of HOLD-OUT set
# dir_prenms = "../results/preNMS"

# =========================== Dataset ==============================
# file path and make a list
imgs_path = '../data/hw3_mycocodata_img_comp_zlib.h5'
masks_path = '../data/hw3_mycocodata_mask_comp_zlib.h5'
labels_path = "../data/hw3_mycocodata_labels_comp_zlib.npy"
bboxes_path = "../data/hw3_mycocodata_bboxes_comp_zlib.npy"
paths = [imgs_path, masks_path, labels_path, bboxes_path]

dataset = BuildDataset(paths, augmentation=False)
train_dataset, test_dataset = utils.split_dataset(dataset)

# dataset
# train_build_loader = BuildDataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8)
# train_loader = train_build_loader.loader()
test_build_loader = BuildDataLoader(test_dataset, batch_size=batch_size,
                                    shuffle=False, num_workers=0)
test_loader = test_build_loader.loader()

# we will need the ImageList from torchvision
from torchvision.models.detection.image_list import ImageList

do_eval(test_loader, checkpoint_file,
def main(): #torch.manual_seed(42) # ------------ # args # ------------ parser = ArgumentParser() parser.add_argument('--auto_lr', type=U.str2bool, default=False, help="Auto lr finder") parser.add_argument('--learning_rate', type=float, default=10e-4) parser.add_argument('--scheduler', type=U.str2bool, default=False) parser.add_argument('--wd', type=float, default=2e-4) parser.add_argument('--moment', type=float, default=0.9) parser.add_argument('--batch_size', default=5, type=int) parser.add_argument('--n_epochs', default=10, type=int) parser.add_argument('--model', default='FCN', type=str, help="FCN or DLV3 model") parser.add_argument('--pretrained', default=False, type=U.str2bool, help="Use pretrained pytorch model") parser.add_argument('--eval_angle', default=True, type=U.str2bool,help=\ "If true, it'll eval the model with different angle input size") parser.add_argument('--rotate', default=False, type=U.str2bool, help="Use random rotation as data augmentation") parser.add_argument('--scale', default=True, type=U.str2bool, help="Use scale as data augmentation") parser.add_argument('--size_img', default=520, type=int, help="Size of input images") parser.add_argument('--size_crop', default=480, type=int, help="Size of crop image during training") parser.add_argument('--nw', default=0, type=int, help="Num workers for the data loader") parser.add_argument('--pm', default=True, type=U.str2bool, help="Pin memory for the dataloader") parser.add_argument('--gpu', default=0, type=int, help="Wich gpu to select for training") parser.add_argument('--benchmark', default=False, type=U.str2bool, help="enable or disable backends.cudnn") parser.add_argument('--split', default=False, type=U.str2bool, help="Split the dataset") parser.add_argument('--split_ratio', default=0.3, type=float, help="Amount of data we used for training") parser.add_argument('--dataroot_voc', default='/share/DEEPLEARNING/datasets/voc2012/', type=str) parser.add_argument('--dataroot_sbd', default='/share/DEEPLEARNING/datasets/sbd/', type=str) parser.add_argument('--model_name', type=str, help="what name to use for saving") parser.add_argument('--save_dir', default='/data/save_model', type=str) parser.add_argument('--save_all_ep', default=False, type=U.str2bool,help=\ "If true it'll save the model every epoch in save_dir") parser.add_argument('--save_best', default=False, type=U.str2bool, help="If true will only save the best epoch model") args = parser.parse_args() # ------------ # save # ------------ save_dir = U.create_save_directory(args.save_dir) print('model will be saved in', save_dir) U.save_hparams(args, save_dir) # ------------ # device # ------------ device = torch.device( "cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu") print("device used:", device) # ------------ # model # ------------ if args.model.upper() == 'FCN': model = models.segmentation.fcn_resnet101(pretrained=args.pretrained) elif args.model.upper() == 'DLV3': model = models.segmentation.deeplabv3_resnet101( pretrained=args.pretrained) else: raise Exception('model must be "FCN" or "DLV3"') model.to(device) # ------------ # data # ------------ if args.size_img < args.size_crop: raise Exception( 'Cannot have size of input images less than size of crop') size_img = (args.size_img, args.size_img) size_crop = (args.size_crop, args.size_crop) train_dataset_VOC = mdset.VOCSegmentation(args.dataroot_voc,year='2012', image_set='train', \ download=True,rotate=args.rotate,scale=args.scale,size_img=size_img,size_crop=size_crop) val_dataset_VOC = 
mdset.VOCSegmentation(args.dataroot_voc, year='2012', image_set='val', download=True) train_dataset_SBD = mdset.SBDataset(args.dataroot_sbd, image_set='train_noval',mode='segmentation',\ rotate=args.rotate,scale=args.scale,size_img=size_img,size_crop=size_crop) # Concatene dataset train_dataset = tud.ConcatDataset([train_dataset_VOC, train_dataset_SBD]) split = args.split if split == True: train_dataset = U.split_dataset(train_dataset, args.split_ratio) # Print len datasets print("There is", len(train_dataset), "images for training and", len(val_dataset_VOC), "for validation") dataloader_train = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,num_workers=args.nw,\ pin_memory=args.pm,shuffle=True,drop_last=True)#,collate_fn=U.my_collate) dataloader_val = torch.utils.data.DataLoader(val_dataset_VOC,num_workers=args.nw,pin_memory=args.pm,\ batch_size=args.batch_size) # Decide which device we want to run on # ------------ # training # ------------ # Auto lr finding #if args.auto_lr==True: criterion = nn.CrossEntropyLoss( ignore_index=21) # On ignore la classe border. optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.moment, weight_decay=args.wd) ev.train_fully_supervised(model=model,n_epochs=args.n_epochs,train_loader=dataloader_train,val_loader=dataloader_val,\ criterion=criterion,optimizer=optimizer,save_folder=save_dir,scheduler=args.scheduler,model_name=args.model_name,\ benchmark=args.benchmark, save_best=args.save_best,save_all_ep=args.save_all_ep,device=device,num_classes=21) # Final evaluation if args.eval_angle: d_iou = ev.eval_model_all_angle(model, args.size_img, args.dataroot_voc, train=True, device=device) U.save_eval_angle(d_iou, save_dir) d_iou = ev.eval_model_all_angle(model, args.size_img, args.dataroot_voc, train=False, device=device) U.save_eval_angle(d_iou, save_dir)
def train_dvna(A, P, verbose=False): n_epochs = 4000 train_ones_indices, train, val, test = u.split_dataset(A, seed=seed) A_train = u.prepare_train_matrix_dvne(A, train_ones_indices) # sample triplets # nbrs is a dict nbrs[i] -> {j1,j2,...,} nbrs = {} not_nbrs = {} all_nodes = {i for i in range(A.shape[0])} for ij in zip(train_ones_indices[0], train_ones_indices[1]): i, j = int(ij[0]), int(ij[1]) if i in nbrs.keys(): nbrs[i] = nbrs[i].union({j}) else: nbrs[i] = {j} if j in nbrs.keys(): nbrs[j] = nbrs[j].union({i}) else: nbrs[j] = {i} for i in nbrs.keys(): nbrs_set = nbrs[i] not_nbrs[i] = all_nodes - nbrs_set model = DVNE(n_features=A.shape[0]) model.to(device) opt = torch.optim.Adam(lr=0.001, params=model.parameters()) nonzero_ratio = A.sum() / (A.shape[0]**2) zero_ratio = 1 - nonzero_ratio A = A.to(device) P = P.to(device) A_train = A_train.to(device) model.n_samples = len(train) criterion = l.dvne_loss for e in range(n_epochs): t0 = time.time() triplets = u.sample_triplets(nbrs, not_nbrs, 300) i, j, k = triplets model.train() opt.zero_grad() out_i, mi, stdi = model.forward(A_train[i, :]) out_j, mj, stdj = model.forward(A_train[j, :]) out_k, mk, stdk = model.forward(A_train[k, :]) gt_i = P[i, :] gt_j = P[j, :] gt_k = P[k, :] out_reconstruction = torch.cat([out_i, out_j, out_k], dim=0).view(-1) gt = torch.cat([gt_i, gt_j, gt_k], dim=0).view(-1).to(device) a_gt = torch.cat([A[i, :], A[j, :], A[j, :]], dim=0).view(-1).to(device) x = torch.ones(gt.shape[0]).to(device) weights = torch.where(gt == 0.0, x * float(nonzero_ratio), x * float(zero_ratio)).to(device) loss_norm = weights.sum() / len(train) loss_weight = 0.6 # loss = criterion(out_reconstruction, gt_reconsturction, weight=weights) * loss_weight/ loss_norm l2 = criterion(gt, out_reconstruction) * loss_weight # loss = 0.0 w_ij = model.wasserstein((mi, stdi), (mj, stdj)) w_ik = model.wasserstein((mi, stdi), (mk, stdk)) l1 = l.energy_loss(w_ij, w_ik) loss = l1 + loss_weight * l2 loss.backward() opt.step() t1 = time.time() if verbose: if (e + 1) % 100 == 0: if len(val) > 0: with torch.no_grad(): val_loss = float( criterion( (model.forward(A_train)[0]).reshape(-1)[val], A.reshape(-1)[val].data)) val_auc = u.test_auc_dvna(model, A_train, A, val) else: val_auc = np.nan val_loss = np.nan print( "Iteration: {0}; train loss: {1:.4f}; val loss: {2:.4f}; val auc: {3:.4f}; time: {4:.4f}" .format(e + 1, loss, val_loss, val_auc, t1 - t0)) test_auc = u.test_auc_dvna(model, A_train, A, idx=test, test=True) # print("Test auc: ", test_auc) if dataset_name == 'cora' and visualize: with torch.no_grad(): encodings, mean, std = model.encode(P.to(device)) embeddings = torch.cat([mean, std], dim=1) dv.reduct_and_visualize(embeddings.cpu().numpy(), Y.argmax(axis=1)) train, val_test = next( Split(train_size=140, random_state=seed).split(embeddings, Y)) embeddings = embeddings.cpu() x_train, y_train = embeddings[train], Y[train] x_test, y_test = embeddings[val_test], Y[val_test] svm = SVC(C=10.0) svm.fit(x_train, y_train.argmax(axis=1)) y_predicted = svm.predict(x_test) print("SVM Accuracy: ", accuracy_score(y_predicted, y_test.argmax(axis=1))) return test_auc
def run(self, id, updated_fields, enable_service=False): share = self.datastore.get_by_id('shares', id) if not share: raise TaskException(errno.ENOENT, 'Share not found') if share['immutable']: raise TaskException(errno.EACCES, 'Cannot modify immutable share {0}.'.format(id)) if 'name' in updated_fields or 'type' in updated_fields: share.update(updated_fields) if self.datastore.exists( 'shares', ('id', '!=', id), ('type', '=', share['type']), ('name', '=', share['name']) ): raise TaskException(errno.EEXIST, 'Share {0} of type {1} already exists'.format( share['name'], share['type'] )) path_after_update = updated_fields.get('target_path', share['target_path']) type_after_update = updated_fields.get('target_type', share['target_type']) permissions = updated_fields.pop('permissions', None) share_path = self.dispatcher.call_sync('share.expand_path', path_after_update, type_after_update) if not os.path.exists(share_path): raise TaskException( errno.ENOENT, 'Selected share target {0} does not exist'.format(path_after_update) ) share = self.datastore.get_by_id('shares', id) remove_unchanged(updated_fields, share) path = self.dispatcher.call_sync('share.get_directory_path', share['id']) try: delete_config( path, '{0}-{1}'.format(share['type'], share['name']) ) except (OSError, ValueError): pass if 'type' in updated_fields: old_share_type = share['type'] new_share_type = self.dispatcher.call_sync('share.supported_types').get(updated_fields['type']) if share['target_type'] == 'DATASET': pool, dataset = split_dataset(share['target_path']) self.join_subtasks( self.run_subtask('volume.dataset.update', dataset, { 'permissions_type': new_share_type['perm_type'] }) ) share.update(updated_fields) self.run_subtask_sync('share.{0}.delete'.format(old_share_type), id) self.run_subtask_sync('share.{0}.create'.format(updated_fields['type']), share) else: self.run_subtask_sync('share.{0}.update'.format(share['type']), id, updated_fields) if permissions: path = self.dispatcher.call_sync('share.translate_path', id) self.run_subtask_sync('file.set_permissions', path, permissions) self.dispatcher.dispatch_event('share.changed', { 'operation': 'update', 'ids': [share['id']] }) updated_share = self.datastore.get_by_id('shares', id) path = self.dispatcher.call_sync('share.get_directory_path', updated_share['id']) try: save_config( path, '{0}-{1}'.format(updated_share['type'], updated_share['name']), updated_share ) except OSError as err: self.add_warning(TaskWarning(errno.ENXIO, 'Cannot save backup config file: {0}'.format(str(err)))) service_state = self.dispatcher.call_sync('service.query', [('name', '=', share['type'])], {'single': True}) if service_state['state'] != 'RUNNING': if enable_service: config = service_state['config'] config['enable'] = True self.run_subtask_sync('service.update', service_state['id'], {'config': config}) else: self.add_warning(TaskWarning( errno.ENXIO, "Share has been updated but the service {0} is not currently running " "Please enable the {0} service.".format(share['type']) ))
def run(self, id, updated_fields, enable_service=False): share = self.datastore.get_by_id('shares', id) if not share: raise TaskException(errno.ENOENT, 'Share not found') if share['immutable']: raise TaskException(errno.EACCES, 'Cannot modify immutable share {0}.'.format(id)) if 'name' in updated_fields or 'type' in updated_fields: share.update(updated_fields) if self.datastore.exists( 'shares', ('id', '!=', id), ('type', '=', share['type']), ('name', '=', share['name']) ): raise TaskException(errno.EEXIST, 'Share {0} of type {1} already exists'.format( share['name'], share['type'] )) path_after_update = updated_fields.get('target_path', share['target_path']) type_after_update = updated_fields.get('target_type', share['target_type']) permissions = updated_fields.pop('permissions', None) share_path = self.dispatcher.call_sync('share.expand_path', path_after_update, type_after_update) if type_after_update in ('DIRECTORY', 'FILE'): pool_mountpoints = tuple(self.dispatcher.call_sync('volume.query', [], {'select': 'mountpoint'})) if not path_after_update.startswith(pool_mountpoints): raise TaskException(errno.EINVAL, "Provided directory or file has to reside within user defined ZFS pool") if not os.path.exists(share_path): raise TaskException( errno.ENOENT, 'Selected share target {0} does not exist'.format(path_after_update) ) share = self.datastore.get_by_id('shares', id) remove_unchanged(updated_fields, share) path = self.dispatcher.call_sync('share.get_directory_path', share['id']) try: delete_config( path, '{0}-{1}'.format(share['type'], share['name']) ) except (OSError, ValueError): pass if 'type' in updated_fields: old_share_type = share['type'] new_share_type = self.dispatcher.call_sync('share.supported_types').get(updated_fields['type']) if share['target_type'] == 'DATASET': pool, dataset = split_dataset(share['target_path']) self.join_subtasks( self.run_subtask('volume.dataset.update', dataset, { 'permissions_type': new_share_type['perm_type'] }) ) share.update(updated_fields) self.run_subtask_sync('share.{0}.delete'.format(old_share_type), id) self.run_subtask_sync('share.{0}.create'.format(updated_fields['type']), share) else: self.run_subtask_sync('share.{0}.update'.format(share['type']), id, updated_fields) if permissions: path = self.dispatcher.call_sync('share.translate_path', id) self.run_subtask_sync('file.set_permissions', path, permissions) self.dispatcher.dispatch_event('share.changed', { 'operation': 'update', 'ids': [share['id']] }) updated_share = self.datastore.get_by_id('shares', id) path = self.dispatcher.call_sync('share.get_directory_path', updated_share['id']) try: save_config( path, '{0}-{1}'.format(updated_share['type'], updated_share['name']), updated_share, file_perms=0o600 ) except OSError as err: self.add_warning(TaskWarning(errno.ENXIO, 'Cannot save backup config file: {0}'.format(str(err)))) service_state = self.dispatcher.call_sync('service.query', [('name', '=', share['type'])], {'single': True}) if service_state['state'] != 'RUNNING': if enable_service: config = service_state['config'] config['enable'] = True self.run_subtask_sync('service.update', service_state['id'], {'config': config}) else: self.add_warning(TaskWarning( errno.ENXIO, "Share has been updated but the service {0} is not currently running " "Please enable the {0} service.".format(share['type']) ))
import utils
from pylearn2.utils import serial
import h5py
import numpy as np
import sys

if __name__ == "__main__":
    conf_file = sys.argv[1] if len(sys.argv) > 1 else None
    conf = utils.get_config(conf_file)
    paths = utils.get_paths()
    region_size = conf['region_size']
    region_stride = conf['region_stride']
    train_rows, valid_rows, test_rows = utils.split_dataset(
        utils.get_filtered_rows(), conf['valid_percent'],
        conf['test_percent'], rng=conf['rng_seed'])
    rowsdict = {'train': train_rows, 'valid': valid_rows, 'test': test_rows}
    nsamples = {}
    prefixes = ['s_', 'i_', 't_']  # Feature names' prefixes
    for subset, subrows in rowsdict.iteritems():
        X = None
        y = []
        feats = []
        for row in subrows:
            samples = utils.get_samples_from_image(
                row, oversampling=(subset == 'train' and conf['oversampling']))
            print "%i samples to %s taken from %s" % (
                len(samples), subset, row['image_filename'])
            if len(samples) == 0:
import torch
from torchvision import transforms
import os
import argparse
from utils import split_dataset, train, validate
from grayscale import Grayscale
from colorize import Colorize

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True, help="path to input image")
ap.add_argument("-e", "--epochs", type=int, default=100, help="# of epochs")
args = vars(ap.parse_args())

# split the landscape dataset to train and validation folders
split_dataset(args["image"])

# Training
train_transforms = transforms.Compose(
    [transforms.RandomResizedCrop(224),
     transforms.RandomHorizontalFlip()])
train_folder = Grayscale('images/train', train_transforms)
train_loader = torch.utils.data.DataLoader(train_folder, batch_size=64,
                                           shuffle=True)

# Validation
val_transforms = transforms.Compose(
    [transforms.Resize(256),
     transforms.CenterCrop(224)])
val_folder = Grayscale('images/val', val_transforms)
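# split_dataset(args["image"]) above prepares the images/train and images/val
# folders that Grayscale reads from; a sketch of such a helper (the directory
# layout and the 90/10 ratio are assumptions):
import os
import random
import shutil

def split_dataset(image_dir, val_fraction=0.1):
    files = sorted(os.listdir(image_dir))
    random.shuffle(files)
    n_val = int(len(files) * val_fraction)
    for subset, names in (('val', files[:n_val]), ('train', files[n_val:])):
        out_dir = os.path.join('images', subset, 'class')
        os.makedirs(out_dir, exist_ok=True)
        for name in names:
            shutil.copy(os.path.join(image_dir, name), out_dir)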
for i in range(0, len(rows), chunkSize):
    offset = min(i + chunkSize, len(rows))
    f_chunk, y_chunk = fe_extraction.get_feats_from_cnn(
        rows[i:offset], model)
    if feats is None:
        feats = f_chunk
        y = y_chunk
    else:
        feats = np.vstack((feats, f_chunk))
        y = np.hstack((y, y_chunk))

segm_ids = np.asarray([int(row['segmentation_id']) for row in rows])
features = np.hstack((features, feats))

train_rows, valid_rows, test_rows = utils.split_dataset(
    utils.get_filtered_rows(), conf['valid_percent'],
    conf['test_percent'], rng=conf['rng_seed'])
rows = train_rows + valid_rows
patients = utils.rows_to_patients(rows)
for i in range(n_runs):
    train_rows, empty_rows, valid_rows = utils.split_dataset(
        rows, valid_percent=0, test_percent=0.2, rng=rng,
        patients=patients)
    X_train, y_train = get_features(train_rows, features, segm_ids)
    X_valid, y_valid = get_features(valid_rows, features, segm_ids)
    print 'train: %i, valid: %i' % (X_train.shape[0], X_valid.shape[0])
    if scale_feats:
        min_max_scaler = preprocessing.MinMaxScaler()
        X_train = min_max_scaler.fit_transform(X_train)
        X_valid = min_max_scaler.transform(X_valid)