def create_dataset(self):
    """Create the train, validation and test PyTorch Dataset objects."""
    mapper_class = fm(self.preop_patients, self.id_mapping,
                      normalized=self.normalized)
    dataset = mapper_class.generate_mapping()
    with open(self.filter_ids, 'rb') as file:
        filter_ids = pickle.load(file)
    dataset_filtered = [
        entry for entry in dataset if entry['ENT'] is not None
    ]
    self.dataset_filtered = [
        entry for entry in dataset_filtered if entry['id'] not in filter_ids
    ]
    random.seed(4)
    random.shuffle(self.dataset_filtered)
    train_dataset = Dataset(self.dataset_filtered, phase='train',
                            normalize=self.normalization)
    val_dataset = Dataset(self.dataset_filtered, phase='val',
                          normalize=self.normalization)
    test_dataset = dataset_2(self.dataset_filtered, phase='test',
                             normalize=self.normalization)
    return train_dataset, val_dataset, test_dataset
def main():
    """Main method for the script."""

    def mean_absolute_percentage_error(gt_y, pred_y):
        return 100 * np.mean(
            np.abs(np.divide(np.array(gt_y) - np.array(pred_y),
                             np.array(gt_y))))

    def run_on_b_data():
        all_mapes = []
        num_folds = online_b_data.get_specific_workload(
            online_b_data.get_workload_ids()[0]).get_dataframe().values.shape[0]
        for fold_num in range(num_folds):
            b_primer, b_test, b_gt = split_online_b(online_b_data, fold_num)
            b_pred = []
            for curr_wl in tqdm(b_test.get_workload_ids()):
                S = make_s_matrix(curr_wl, offline_data, b_primer,
                                  'outputs/b_s_matrix_{}.npy'.format(fold_num))
                closest_observed_wl = find_closest_observed_wl(
                    curr_wl, offline_data, b_primer, S)
                model, ss_x, ss_y = train_concat_model(
                    curr_wl, offline_data, b_primer, closest_observed_wl)
                b_pred.append(
                    max(eval_model(curr_wl, model, b_test, ss_x, ss_y), 0))
            print('Fold {}:\tMAPE = {:.2f}'.format(
                fold_num + 1, mean_absolute_percentage_error(b_gt, b_pred)))
            all_mapes.append(mean_absolute_percentage_error(b_gt, b_pred))
            both_arrays = np.array([b_gt, b_pred]).T
            np.savetxt("outputs/y_and_y_hat.csv", both_arrays, delimiter=",")
        print()
        print('MAPE:\t{:.2f}'.format(sum(all_mapes) / len(all_mapes)))

    def run_on_test_data():
        latencies = []
        nearest_neighbors = []
        for curr_wl in tqdm(test_data.get_workload_ids()):
            S = make_s_matrix(curr_wl, offline_data, online_c_data,
                              'outputs/s_matrix_test.npy')
            closest_observed_wl = find_closest_observed_wl(
                curr_wl, offline_data, online_c_data, S)
            model, ss_x, ss_y = train_concat_model(
                curr_wl, offline_data, online_c_data, closest_observed_wl)
            latencies.append(eval_model(curr_wl, model, test_data, ss_x, ss_y))
            nearest_neighbors.append(closest_observed_wl)
        output_test_data = test_data.get_dataframe()
        output_test_data['latency prediction'] = latencies
        output_test_data['nearest neighbor'] = nearest_neighbors
        output_test_data.to_csv('outputs/test_filled.csv', index=False)

    offline_data = Dataset(file_path=DATASET_PATHS['offline_workload'])
    online_b_data = Dataset(file_path=DATASET_PATHS['online_workload_B'])
    online_c_data = Dataset(file_path=DATASET_PATHS['online_workload_C'])
    test_data = Dataset(file_path=DATASET_PATHS['test'])

    run_on_b_data()
    run_on_test_data()
def test_init(self):
    # construction from dataframe should not raise
    Dataset(pd.DataFrame(self.test_data))
    # construction with legitimate non-dataframe should not raise
    Dataset(self.test_data)
    # construction with missing columns
    with self.assertRaises(AttributeError):
        _ = Dataset(pd.DataFrame(self.test_data).drop(['x_min'], axis=1))
    # construction with invalid non-dataframe
    with self.assertRaises(AttributeError):
        _ = Dataset((4, 5, 6))
    # calculate length of single dataset
    self.assertEqual(len(Dataset(pd.DataFrame(self.test_data))),
                     len(self.test_data))
    # add two Dataset objects and construct with list
    object_a = Dataset(pd.DataFrame(self.test_data))
    object_b = Dataset(pd.DataFrame(self.test_data))
    self.assertEqual(len(object_a + object_b), 2 * len(self.test_data))
    self.assertEqual(len(Dataset([object_a, object_b])),
                     2 * len(self.test_data))
    # read out of bounds
    with self.assertRaises(IndexError):
        Dataset(pd.DataFrame(self.test_data)).read(3)
    # read() and generator functionality
    test_iterable = Dataset(pd.DataFrame(self.test_data))
    self.assertEqual([item for item in test_iterable],
                     [(item, item['image_id']) for item in self.test_data])
def main():
    TEST_SOURCE_IMAGES = 0
    TEST_BATCH_IMAGES = 1

    config.load_all_images_to_ram = 0
    config.one_batch_overfit = 0

    dataset = Dataset()

    if TEST_SOURCE_IMAGES:
        for rects_image in dataset.images_data:
            rects_image.load()
            rects_image.mask = rects_image.draw_mask()
            images = [
                rects_image.image, rects_image.mask[:, :, 0],
                rects_image.mask[:, :, 1], rects_image.mask[:, :, 2]
            ]
            if not show_images(images):
                break
            rects_image.release()

    if TEST_BATCH_IMAGES:
        batch_shape = config.batch_shape
        batch_shape[0] = 1
        while 1:
            images_batch, masks_batch = dataset.get_batch()
            image = images_batch[0]
            mask = masks_batch[0]
            images = [image, mask[:, :, 0], mask[:, :, 1], mask[:, :, 2]]
            # print(mask[:, :, 0].max(), mask[:, :, 1].max(), mask[:, :, 2].max())
            if not show_images(images):
                break
def train(modelname: str):
    ds = Dataset()
    emails = ds.get_data()
    md = Model()
    md.train(emails)
    md.serialize(modelname)
    return {"Hello": "World"}
def main():
    dataset = Dataset()
    fcn_model_module = loader.get_fcn_model_module()
    fcn_model = fcn_model_module.FCNModel()
    detector = FCNDetector(
        fcn_model.model,
        osp.join(fcn_model.weights_dir, 'best_weights.hdf5'))
    estimate_quality(detector, dataset)
def init(config_file):
    """Initialize the Setup script and do the validations."""
    setuplog.info('Initializing Setup')

    # Configuration
    setuplog.info('Reading configuration file')
    conf = ConfigFile(config_file)
    try:
        setuplog.info('Validating configuration file')
        conf.validate()
    except Exception as e:
        setuplog.error(e)
        sys.exit()
    setuplog.info('Configuration file OK')

    # Dataset
    setuplog.info('Loading Dataset')
    dataset = Dataset(dataset_path=conf.dataset_path)
    try:
        setuplog.info('Checking if receptor and ligands match suffix')
        dataset.check_input_files(receptor_suffix=conf.receptor_suffix,
                                  ligand_suffix=conf.ligand_suffix)
    except Exception as e:
        setuplog.error(e)
        sys.exit()
    setuplog.info('Input files are OK')

    # Haddock
    setuplog.info('Initializing HADDOCK Wrapper')
    haddock = HaddockWrapper(haddock_path=conf.haddock_path, py2=conf.py2_path)
    try:
        setuplog.info('Checking if HADDOCK is executable')
        haddock.check_if_executable()
    except Exception as e:
        setuplog.error(e)
        sys.exit()
    setuplog.info('HADDOCK execution OK')

    # All checks ok!
    setuplog.info('Initialization done!')
    return conf, dataset, haddock
def main():
    # only training GPRs for offline workloads
    dataset = Dataset(file_path=DATASET_PATHS['offline_workload'])

    # load the pruned metric headers
    pruned_metrics = Dataset.load_pruned_metrics()

    # prune the dataset
    dataset = dataset.prune_columns(pruned_metrics + ['workload id'] +
                                    dataset.get_tuning_knob_headers())

    # build the GPRs
    start = time()
    gprs = WorkloadGPR(dataset=dataset)
    LOG.info(f"Finished building GPRs in {round(time() - start)} seconds.")

    # pickle 'em
    LOG.info("Pickling GPRs...")
    start = time()
    gprs.pickle_models()
    LOG.info(f"Finished pickling models in {round(time() - start)} seconds.")
def main():
    dataset = Dataset()
    fcn_model = loader.get_fcn_model_module().FCNModel()
    trainer = Trainer()
    trainer.train(fcn_model, dataset)
    if not config.one_batch_overfit:
        detector = FCNDetector(fcn_model.model)
        detector.weights_path = osp.join(fcn_model.weights_dir,
                                         'best_weights.hdf5')
        estimate_quality(detector, dataset)
def main():
    preop_patients = []
    for path in Path('./data/preoperative').rglob('BMIAXNA*'):
        preop_patients.append(path)
    mapper_class = fm(preop_patients,
                      './data/pickles_jsons/id_surv_mapping.json')
    dataset = mapper_class.generate_mapping()
    dataset_filtered = [entry for entry in dataset if entry['ENT'] is not None]
    train_dataset = Dataset(dataset_filtered, phase='train')
    val_dataset = Dataset(dataset_filtered, phase='val')
    filter_ids = []
    for data in train_dataset:
        if 'BMIAXNAT' in data:
            filter_ids.append(data)
    for data in val_dataset:
        if 'BMIAXNAT' in data:
            filter_ids.append(data)
    with open('./data/pickles_jsons/filter_ids.pkl', 'wb') as file:
        pickle.dump(filter_ids, file)
def _build_models_from_dataset(self, dataset: Dataset, scaler=None):
    """Build all of the GPR models from scratch."""
    df = dataset.get_dataframe()
    metrics = dataset.get_metric_headers()
    workload_ids = dataset.get_workload_ids()
    knob_headers = dataset.get_tuning_knob_headers()
    total_gprs = len(workload_ids) * len(metrics)
    with tqdm(total=total_gprs) as pbar:
        for w in workload_ids:
            workloads = df[df['workload id'] == w]
            for m in metrics:
                X = workloads[knob_headers].values
                if scaler is not None:
                    X = scaler.transform(X)
                y = workloads[m].values
                m_file_name = m \
                    .replace('_', '-') \
                    .replace('/', '-') \
                    .replace('%', '-')
                # krasserm.github.io/2018/03/19/gaussian-processes#effect-of-kernel-parameters-and-noise-parameter
                restarts = 10
                # signal variance (sigma_f) times RBF length scale (l)
                kernel = ConstantKernel(10.0) * RBF(y.std())
                # observation noise (sigma_y)
                alpha = 0.1
                model = GaussianProcessRegressor(
                    kernel=kernel,
                    n_restarts_optimizer=restarts,
                    alpha=alpha,
                    normalize_y=True)
                model.fit(X, y)
                self.models[f"wl_{w}_{m_file_name}.pickle"] = model
                pbar.update(1)
def split_online_b(online_b_data, test_idx):
    primer = pd.DataFrame(columns=online_b_data.get_dataframe().columns)
    eval = pd.DataFrame(columns=online_b_data.get_dataframe().columns)
    for wl_id in online_b_data.get_workload_ids():
        curr_ds = online_b_data.get_specific_workload(wl_id)
        for idx in range(curr_ds.get_dataframe().values.shape[0]):
            if idx == test_idx:
                eval = eval.append(curr_ds.get_dataframe().iloc[idx:idx + 1],
                                   ignore_index=True)
            else:
                primer = primer.append(
                    curr_ds.get_dataframe().iloc[idx:idx + 1],
                    ignore_index=True)
    primer = Dataset(dataframe=primer)
    eval = Dataset(dataframe=eval)
    latency_gt = eval.get_column_values('latency')
    eval = eval.prune_columns(['workload id'] + eval.get_tuning_knob_headers())
    return primer, eval, latency_gt
********************************************************************
'''

ext_name = 'sstask_%d_ld_%.02f_lg_%.02f_batch_%d_niters_%d' \
           % (ss_task, lambda_d, lambda_g, batch_size, n_steps)

# output dir
out_dir = os.path.join(out_dir, db_name + '_' + model + '_' \
                                + nnet_type + '_' \
                                + loss_type + '_' \
                                + ext_name, db_name)
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# setup dataset
dataset = Dataset(name=db_name, source=data_source, \
                  batch_size=batch_size)

# setup gan model and train
msdistgan = MSDistGAN(model=model, \
                      is_train=is_train, \
                      ss_task=ss_task, \
                      loss_type=loss_type, \
                      lambda_p=lambda_p, \
                      lambda_r=lambda_r, \
                      lambda_w=lambda_w, \
                      lambda_d=lambda_d, \
                      lambda_g=lambda_g, \
                      noise_dim=noise_dim, \
                      lr=lr, \
                      beta1=beta1, \
                      beta2=beta2, \
def make_dataset(self, include_invalid_images=False):
    source_dataframe = self._dataframe.drop('image_valid', axis=1) if include_invalid_images else \
        self._dataframe[self._dataframe.image_valid].drop('image_valid', axis=1)
    dataset = Dataset(source_object=source_dataframe,
                      image_read_method=read_image_file)
    return dataset
def main():
    LOG.debug('Clearing out all of the workload models.')
    clear_wl_models()

    dataset = Dataset(file_path=DATASET_PATHS['offline_workload'])
    pruned_metrics = Dataset.load_pruned_metrics()
    pruned_dataset = dataset.prune_columns(pruned_metrics + ['workload id'] +
                                           dataset.get_tuning_knob_headers())
    df = pruned_dataset.get_dataframe()

    # pick the ith data to use as validation
    i = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
    workload_ids = pruned_dataset.get_workload_ids()
    validation_df = pd.concat(
        [df[df['workload id'] == wid].iloc[i] for wid in workload_ids])
    validation_idx = validation_df.index
    valid_dataset = Dataset(dataframe=validation_df)

    diff_idx = df.index.difference(validation_df.index)
    train_df = df.iloc[diff_idx]
    train_dataset = Dataset(dataframe=train_df)

    # LOG.info("Fitting input scaler...")
    # scaler = StandardScaler()
    # scaler.fit(train_df[dataset.get_tuning_knob_headers()].values)
    scaler = None

    LOG.info("Training workload GPRs...")
    gprs = WorkloadGPR(dataset=train_dataset, scaler=scaler)

    LOG.info("Validating GPRs...")
    train = {}
    result = {}
    for pm in pruned_metrics:
        for wid in workload_ids:
            name = f"{pm}|{wid}"
            model = gprs.get_model(wid, pm)

            # train
            # X = train_df[dataset.get_tuning_knob_headers()].values
            # X = scaler.transform(X)
            # y = train_df[pm].values
            # y_hat = model.predict(X)
            # mape = np.mean(np.abs((y - y_hat) / y)) * 100
            # train[name] = mape

            # validation
            X = validation_df[dataset.get_tuning_knob_headers()].values
            if scaler is not None:
                X = scaler.transform(X)
            y = validation_df[pm].values
            y_hat = model.predict(X)
            mape = np.mean(np.abs((y - y_hat) / y)) * 100
            result[name] = mape
            # LOG.info('%s: %s', name, mape)

    # LOG.info('Training average MAPE: %s',
    #          np.array(list(train.values())).mean())
    LOG.info('Validation average MAPE: %s',
             np.array(list(result.values())).mean())
def main():
    """Main method for the script."""
    dataset = Dataset(file_path=DATASET_PATHS[CONFIG.dataset])
    df = dataset.get_dataframe()

    # remove columns that are constant values
    metric_headers = dataset.get_metric_headers()
    constant_headers = []
    variable_headers = []
    for header in metric_headers:
        if np.unique(df[header].values).size > 1:
            variable_headers.append(header)
        else:
            constant_headers.append(header)
    metric_headers = variable_headers
    dataset = Dataset(dataframe=df.drop(constant_headers, axis=1))
    raw_metrics = dataset.get_metrics()
    metrics = raw_metrics.T

    # factor analysis
    LOG.info('Starting factor analysis with %s factors...', CONFIG.num_factors)
    start = time()
    # model = FactorAnalysis(n_components=CONFIG.num_factors)
    # factors = model.fit_transform(metrics)  # num_metrics * num_factors
    rng = np.random.RandomState(74)
    model = GaussianRandomProjection(eps=0.999, random_state=rng)
    factors = model.fit_transform(metrics)
    LOG.debug('Dimension before factor analysis: %s', metrics.shape)
    LOG.debug('Dimension after factor analysis: %s', factors.shape)
    LOG.info('Finished factor analysis in %s seconds.', round(time() - start))

    # clustering
    if CONFIG.model == 'kmeans':
        model = build_k_means(factors)
    elif CONFIG.model == 'kmedoids':
        model = build_k_medoids(factors)
    else:
        raise ValueError('Unrecognized model: %s' % CONFIG.model)

    # find cluster center
    labels = model.labels_
    # each dimension in transformed_data is the distance to the cluster
    # centers.
    transformed_data = model.transform(factors)
    leftover_metrics = []
    for i in np.unique(labels):
        # index of the points for the ith cluster
        cluster_member_idx = np.argwhere(labels == i).squeeze(1)
        cluster_members = transformed_data[cluster_member_idx]
        # find the index of the minimum-distance point to the center
        closest_member = cluster_member_idx[np.argmin(cluster_members[:, i])]
        leftover_metrics.append(metric_headers[closest_member])

    # latency needs to be in the metrics
    if 'latency' not in leftover_metrics:
        leftover_metrics += ['latency']

    with open(CONFIG.output_path, 'w') as file:
        file.writelines('\n'.join(leftover_metrics))
n_steps = 100000  # number of iterations

lambda_p = 1.0
lambda_r = 1.0

# [Important]
# lambda_w = sqrt(d/D) as in the paper; adjust if you change the network
# architecture (d: data noise dim, D: feature dim)
lambda_w = np.sqrt(noise_dim * 1.0 / feature_dim)

# output dir
out_dir = os.path.join(out_dir, model, db_name)
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# setup dataset
dataset = Dataset(name=db_name, source=data_source)

# setup gan model and train
distgan = DISTGAN(model=model, \
                  loss_type=loss_type, \
                  lambda_p=lambda_p, lambda_r=lambda_r, \
                  lambda_w=lambda_w, \
                  noise_dim=noise_dim, \
                  dataset=dataset, \
                  n_steps=n_steps, \
                  out_dir=out_dir)

if is_train == True:
    distgan.train()
else:
    distgan.generate()
    '-l',
    help='Setup tensorboard log directory')
arg_parser.add_argument('--encode', '-e', default='identity', help= \
                        'Setup encoding function. default run without encode\n'
                        'Option: identity onehot21 onehot22')
args = arg_parser.parse_args()

if args.logdir is None:
    print('Need a logdir. See python3 train.py --help')
    exit()
elif os.path.isdir(args.logdir) and not args.force:
    print('logdir already exists')
    exit()

data = Dataset(encode=args.encode, \
               trainfiles=['data/baseline/train.tfrecord'], \
               testfiles=['data/baseline/test.tfrecord'], \
               parallel_call=4 \
               )

model = MusiteDeepModel(dataset=data, logdir=args.logdir)
start = datetime.datetime.now()
model.train(epochs=500)
end = datetime.datetime.now()
print(end - start)
model.close()
if debug:
    debug = Debug(p_moduli, coeff_mod, precision, "model15", verbosity)
    if debug_plain:
        debug.test_plain_net()
    if debug_encoded:
        debug.test_encoded_net(0)
    if debug_encrypted:
        debug.test_encrypted_net(1)
else:
    ds = Dataset(verbosity=verbosity)
    (train, train_labels), (test, test_labels) = ds.load(2)

    exp = Exporter(verbosity=verbosity)
    # exp.exportBestOf(train, train_labels, test, test_labels, params,
    #                  model_name="model15", num_test=10)
    model = exp.load(model_name='model15')

    test = test[:coeff_mod]
    test_labels = test_labels[:coeff_mod]

    cn = Cryptonet(test, test_labels, model, p_moduli, coeff_mod, precision,
                   True)
    cn.evaluate()

    m = Model()
    acc = m.getAccuracy(model, test, test_labels)
# ext_name = 'C_%d_eps_%f_delta_%f_dp_%d_lr_%f_dups10_softmax_categorical_%d_cervical_cancer2' % (C, eps, delta, n_steps, lr, categorical_softmax_use)

# output dir
out_dir = os.path.join(out_dir, db_name + '_' + model + '_' \
                                + nnet_type + '_' \
                                + loss_type + '_' \
                                + ext_name, db_name)
# out_dir = os.path.join(out_dir, ext_name)
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# setup dataset
dataset = Dataset(name=db_name, source=data_source,
                  categorical_softmax_use=categorical_softmax_use,
                  batch_size=64)

# setup gan model and train
distgan = DISTGAN(model=model, \
                  is_train=is_train, \
                  loss_type=loss_type, \
                  lambda_p=lambda_p, \
                  lambda_r=lambda_r, \
                  lambda_w=lambda_w, \
                  lambda_d=lambda_d, \
                  lambda_g=lambda_g, \
                  C=C, \
                  eps=eps, \
                  delta=delta, \
                  regc=regc, \