def create_dataset(self):
    """
    Create the train, validation and test PyTorch Dataset objects.
    """
    mapper_class = fm(self.preop_patients,
                      self.id_mapping,
                      normalized=self.normalized)
    dataset = mapper_class.generate_mapping()
    with open(self.filter_ids, 'rb') as file:
        filter_ids = pickle.load(file)
    dataset_filtered = [
        entry for entry in dataset if entry['ENT'] is not None
    ]
    self.dataset_filtered = [
        entry for entry in dataset_filtered
        if entry['id'] not in filter_ids
    ]
    random.seed(4)
    random.shuffle(self.dataset_filtered)
    train_dataset = Dataset(self.dataset_filtered,
                            phase='train',
                            normalize=self.normalization)
    val_dataset = Dataset(self.dataset_filtered,
                          phase='val',
                          normalize=self.normalization)
    test_dataset = dataset_2(self.dataset_filtered,
                             phase='test',
                             normalize=self.normalization)
    return train_dataset, val_dataset, test_dataset
Example #2
def main():
    """
    Main method for the script.
    """
    def mean_absolute_percentage_error(gt_y, pred_y):
        return 100 * np.mean(
            np.abs(np.divide(
                np.array(gt_y) - np.array(pred_y), np.array(gt_y))))
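    # Worked example with illustrative values (not taken from the dataset):
    # for gt_y = [100, 200, 400] and pred_y = [110, 180, 400] the absolute
    # percentage errors are 10%, 10% and 0%, so the function returns
    # 100 * mean([0.10, 0.10, 0.0]) ≈ 6.67.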

    def run_on_b_data():
        all_mapes = []
        num_folds = online_b_data.get_specific_workload(
            online_b_data.get_workload_ids()
            [0]).get_dataframe().values.shape[0]
        for fold_num in range(num_folds):
            b_primer, b_test, b_gt = split_online_b(online_b_data, fold_num)
            b_pred = []
            for curr_wl in tqdm(b_test.get_workload_ids()):
                S = make_s_matrix(curr_wl, offline_data, b_primer,
                                  'outputs/b_s_matrix_{}.npy'.format(fold_num))
                closest_observed_wl = find_closest_observed_wl(
                    curr_wl, offline_data, b_primer, S)
                model, ss_x, ss_y = train_concat_model(curr_wl, offline_data,
                                                       b_primer,
                                                       closest_observed_wl)
                b_pred.append(
                    max(eval_model(curr_wl, model, b_test, ss_x, ss_y), 0))
            fold_mape = mean_absolute_percentage_error(b_gt, b_pred)
            print('Fold {}:\tMAPE = {:.2f}'.format(fold_num + 1, fold_mape))
            all_mapes.append(fold_mape)
            both_arrays = np.array([b_gt, b_pred]).T
            np.savetxt("outputs/y_and_y_hat.csv", both_arrays, delimiter=",")
        print()
        print('MAPE:\t{:.2f}'.format(sum(all_mapes) / len(all_mapes)))

    def run_on_test_data():
        latencies = []
        nearest_neighbors = []
        for curr_wl in tqdm(test_data.get_workload_ids()):
            S = make_s_matrix(curr_wl, offline_data, online_c_data,
                              'outputs/s_matrix_test.npy')
            closest_observed_wl = find_closest_observed_wl(
                curr_wl, offline_data, online_c_data, S)
            model, ss_x, ss_y = train_concat_model(curr_wl, offline_data,
                                                   online_c_data,
                                                   closest_observed_wl)
            latencies.append(eval_model(curr_wl, model, test_data, ss_x, ss_y))
            nearest_neighbors.append(closest_observed_wl)
        output_test_data = test_data.get_dataframe()
        output_test_data['latency prediction'] = latencies
        output_test_data['nearest neighbor'] = nearest_neighbors
        output_test_data.to_csv('outputs/test_filled.csv', index=False)

    offline_data = Dataset(file_path=DATASET_PATHS['offline_workload'])
    online_b_data = Dataset(file_path=DATASET_PATHS['online_workload_B'])
    online_c_data = Dataset(file_path=DATASET_PATHS['online_workload_C'])
    test_data = Dataset(file_path=DATASET_PATHS['test'])

    run_on_b_data()
    run_on_test_data()
Example #3
def test_init(self):
    # construction from dataframe (should not raise)
    Dataset(pd.DataFrame(self.test_data))
    # construction with a legitimate non-dataframe (should not raise)
    Dataset(self.test_data)
    # construction with missing columns
    with self.assertRaises(AttributeError):
        _ = Dataset(pd.DataFrame(self.test_data).drop(['x_min'], axis=1))
    # construction with an invalid non-dataframe
    with self.assertRaises(AttributeError):
        _ = Dataset((4, 5, 6))
    # length of a single dataset
    self.assertEqual(len(Dataset(pd.DataFrame(self.test_data))),
                     len(self.test_data))
    # add two Dataset objects and construct from a list
    object_a = Dataset(pd.DataFrame(self.test_data))
    object_b = Dataset(pd.DataFrame(self.test_data))
    self.assertEqual(len(object_a + object_b), 2 * len(self.test_data))
    self.assertEqual(len(Dataset([object_a, object_b])),
                     2 * len(self.test_data))
    # read out of bounds
    with self.assertRaises(IndexError):
        Dataset(pd.DataFrame(self.test_data)).read(3)
    # read() and generator functionality
    test_iterable = Dataset(pd.DataFrame(self.test_data))
    self.assertEqual([item for item in test_iterable],
                     [(item, item['image_id']) for item in self.test_data])
Example #4
def main():
    TEST_SOURCE_IMAGES = 0
    TEST_BATCH_IMAGES = 1

    config.load_all_images_to_ram = 0
    config.one_batch_overfit = 0
    dataset = Dataset()
    if TEST_SOURCE_IMAGES:
        for rects_image in dataset.images_data:
            rects_image.load()
            rects_image.mask = rects_image.draw_mask()
            images = [
                rects_image.image, rects_image.mask[:, :, 0],
                rects_image.mask[:, :, 1], rects_image.mask[:, :, 2]
            ]
            if not show_images(images):
                break
            rects_image.release()

    if TEST_BATCH_IMAGES:
        batch_shape = config.batch_shape
        batch_shape[0] = 1
        while True:
            images_batch, masks_batch = dataset.get_batch()
            image = images_batch[0]
            mask = masks_batch[0]
            images = [image, mask[:, :, 0], mask[:, :, 1], mask[:, :, 2]]
            # print(mask[:, :, 0].max(), mask[:, :, 1].max(), mask[:, :, 2].max(), )
            if not show_images(images):
                break
Example #5
def train(modelname: str):
    ds = Dataset()
    emails = ds.get_data()
    md = Model()
    md.train(emails)
    md.serialize(modelname)
    return {"Hello": "World"}
Example #6
def main():
    dataset = Dataset()
    fcn_model_module = loader.get_fcn_model_module()
    fcn_model = fcn_model_module.FCNModel()
    detector = FCNDetector(
        fcn_model.model, osp.join(fcn_model.weights_dir, 'best_weights.hdf5'))
    estimate_quality(detector, dataset)
Example #7
def init(config_file):
    """Initialize the Setup script and do the validations."""
    setuplog.info('Initializing Setup')

    # Configuration
    setuplog.info('Reading configuration file')
    conf = ConfigFile(config_file)

    try:
        setuplog.info('Validating configuration file')
        conf.validate()
    except Exception as e:
        setuplog.error(e)
        sys.exit()

    setuplog.info('Configuration file OK')

    # Dataset
    setuplog.info('Loading Dataset')
    dataset = Dataset(dataset_path=conf.dataset_path)

    try:
        setuplog.info('Checking if receptor and ligands match suffix')
        dataset.check_input_files(receptor_suffix=conf.receptor_suffix,
                                  ligand_suffix=conf.ligand_suffix)
    except Exception as e:
        setuplog.error(e)
        sys.exit()

    setuplog.info('Input files are OK')

    # Haddock
    setuplog.info('Initializing HADDOCK Wrapper')
    haddock = HaddockWrapper(haddock_path=conf.haddock_path, py2=conf.py2_path)

    try:
        setuplog.info('Checking if HADDOCK is executable')
        haddock.check_if_executable()
    except Exception as e:
        setuplog.error(e)
        sys.exit()

    setuplog.info('HADDOCK execution OK')

    # All checks ok!
    setuplog.info('Initialization done!')
    return conf, dataset, haddock
Example #8
def main():
    # only training GPRs for offline loads
    dataset = Dataset(file_path=DATASET_PATHS['offline_workload'])
    # load the pruned metric headers
    pruned_metrics = Dataset.load_pruned_metrics()
    # prune the dataset
    dataset = dataset.prune_columns(pruned_metrics + ['workload id'] +
                                    dataset.get_tuning_knob_headers())

    # build the GPRs
    start = time()
    gprs = WorkloadGPR(dataset=dataset)
    LOG.info(f"Finished building GPRs in {round(time() - start)} seconds.")

    # pickle 'em
    LOG.info("Pickling GPRs...")
    start = time()
    gprs.pickle_models()
    LOG.info(f"Finished pickling models in {round(time() - start)} seconds.")
Example #9
def main():
    dataset = Dataset()
    fcn_model = loader.get_fcn_model_module().FCNModel()
    trainer = Trainer()
    trainer.train(fcn_model, dataset)

    if not config.one_batch_overfit:
        detector = FCNDetector(fcn_model.model)
        detector.weights_path = osp.join(fcn_model.weights_dir, 'best_weights.hdf5')
        estimate_quality(detector, dataset)
Example #10
def main():
    preop_patients = []
    for path in Path('./data/preoperative').rglob('BMIAXNA*'):
        preop_patients.append(path)
    mapper_class = fm(preop_patients, './data/pickles_jsons/id_surv_mapping.json')
    dataset = mapper_class.generate_mapping()
    dataset_filtered = [entry for entry in dataset if entry['ENT'] is not None]
    train_dataset = Dataset(dataset_filtered, phase='train')
    val_dataset = Dataset(dataset_filtered, phase='val')
    filter_ids = []
    for data in train_dataset:
        if 'BMIAXNAT' in data:
            filter_ids.append(data)

    for data in val_dataset:
        if 'BMIAXNAT' in data:
            filter_ids.append(data)

    with open('./data/pickles_jsons/filter_ids.pkl', 'wb') as file:
        pickle.dump(filter_ids, file)
Example #11
    def _build_models_from_dataset(self, dataset: Dataset, scaler=None):
        """
        Build all of the GPR models from scratch
        """
        df = dataset.get_dataframe()
        metrics = dataset.get_metric_headers()
        workload_ids = dataset.get_workload_ids()
        knob_headers = dataset.get_tuning_knob_headers()
        total_gprs = len(workload_ids) * len(metrics)

        with tqdm(total=total_gprs) as pbar:
            for w in workload_ids:
                workloads = df[df['workload id'] == w]
                for m in metrics:
                    X = workloads[knob_headers].values

                    if scaler is not None:
                        X = scaler.transform(X)

                    y = workloads[m].values
                    m_file_name = m \
                        .replace('_', '-') \
                        .replace('/', '-') \
                        .replace('%', '-')
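                    # For example, a hypothetical metric named
                    # 'buffer_pool/hit_%' would be sanitized to
                    # 'buffer-pool-hit--' before being used in the
                    # pickle file name below.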

                    # krasserm.github.io/2018/03/19/gaussian-processes#effect-of-kernel-parameters-and-noise-parameter
                    restarts = 10
                    # sigma_f, l
                    kernel = ConstantKernel(10.0) * RBF(y.std())
                    # sigma_y
                    alpha = 0.1
                    model = GaussianProcessRegressor(
                        kernel=kernel,
                        n_restarts_optimizer=restarts,
                        alpha=alpha,
                        normalize_y=True)
                    model.fit(X, y)
                    self.models[f"wl_{w}_{m_file_name}.pickle"] = model
                    pbar.update(1)
Example #12
def split_online_b(online_b_data, test_idx):
    columns = online_b_data.get_dataframe().columns
    primer_df = pd.DataFrame(columns=columns)
    eval_df = pd.DataFrame(columns=columns)
    for wl_id in online_b_data.get_workload_ids():
        curr_df = online_b_data.get_specific_workload(wl_id).get_dataframe()
        for idx in range(curr_df.values.shape[0]):
            row = curr_df.iloc[idx:idx + 1]
            if idx == test_idx:
                eval_df = pd.concat([eval_df, row], ignore_index=True)
            else:
                primer_df = pd.concat([primer_df, row], ignore_index=True)
    primer = Dataset(dataframe=primer_df)
    eval_dataset = Dataset(dataframe=eval_df)
    latency_gt = eval_dataset.get_column_values('latency')
    eval_dataset = eval_dataset.prune_columns(
        ['workload id'] + eval_dataset.get_tuning_knob_headers())

    return primer, eval_dataset, latency_gt
Example #13
    ********************************************************************
    '''
    ext_name = 'sstask_%d_ld_%.02f_lg_%.02f_batch_%d_niters_%d' \
                   % (ss_task, lambda_d, lambda_g , batch_size, n_steps)

    #output dir
    out_dir = os.path.join(out_dir, db_name + '_' + model + '_'     \
                                                  + nnet_type + '_' \
                                                  + loss_type + '_' \
                                                  + ext_name, db_name)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # setup dataset
    dataset = Dataset(name=db_name, source=data_source, \
                                                  batch_size=batch_size)

    # setup gan model and train
    msdistgan = MSDistGAN(model=model,               \
                              is_train  = is_train,  \
                              ss_task   = ss_task,   \
                              loss_type = loss_type, \
                              lambda_p  = lambda_p,  \
                              lambda_r  = lambda_r,  \
                              lambda_w  = lambda_w,  \
                              lambda_d  = lambda_d,  \
                              lambda_g  = lambda_g,  \
                              noise_dim = noise_dim, \
                              lr    = lr,            \
                              beta1 = beta1,         \
                              beta2 = beta2,         \
Example #14
def make_dataset(self, include_invalid_images=False):
    source_dataframe = (self._dataframe.drop('image_valid', axis=1)
                        if include_invalid_images else
                        self._dataframe[self._dataframe.image_valid]
                        .drop('image_valid', axis=1))
    dataset = Dataset(source_object=source_dataframe,
                      image_read_method=read_image_file)
    return dataset
Example #15
def main():
    LOG.debug('Clearing out all of the workload models.')
    clear_wl_models()

    dataset = Dataset(file_path=DATASET_PATHS['offline_workload'])
    pruned_metrics = Dataset.load_pruned_metrics()
    pruned_dataset = dataset.prune_columns(pruned_metrics + ['workload id'] +
                                           dataset.get_tuning_knob_headers())
    df = pruned_dataset.get_dataframe()

    # pick the ith data to use as validation
    i = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
    workload_ids = pruned_dataset.get_workload_ids()
    validation_df = pd.concat(
        [df[df['workload id'] == wid].iloc[i] for wid in workload_ids])
    validation_idx = validation_df.index
    valid_dataset = Dataset(dataframe=validation_df)

    diff_idx = df.index.difference(validation_df.index)

    train_df = df.loc[diff_idx]
    train_dataset = Dataset(dataframe=train_df)

    #  LOG.info("Fitting input scaler...")
    #  scaler = StandardScaler()
    #  scaler.fit(train_df[dataset.get_tuning_knob_headers()].values)
    scaler = None

    LOG.info("Training workload GPRs...")
    gprs = WorkloadGPR(dataset=train_dataset, scaler=scaler)

    LOG.info("Validating GPRs...")
    train = {}
    result = {}
    for pm in pruned_metrics:
        for wid in workload_ids:
            name = f"{pm}|{wid}"
            model = gprs.get_model(wid, pm)

            # train
            #  X = train_df[dataset.get_tuning_knob_headers()].values
            #  X = scaler.transform(X)
            #  y = train_df[pm].values
            #  y_hat = model.predict(X)
            #  mape = np.mean(np.abs((y - y_hat) / y)) * 100
            #  train[name] = mape

            # validation
            X = validation_df[dataset.get_tuning_knob_headers()].values
            if scaler is not None:
                X = scaler.transform(X)

            y = validation_df[pm].values
            y_hat = model.predict(X)
            mape = np.mean(np.abs((y - y_hat) / y)) * 100
            result[name] = mape
            #  LOG.info('%s: %s', name, mape)

    #  LOG.info('Training average MAPE: %s',
    #  np.array(list(train.values())).mean())
    LOG.info('Validation average MAPE: %s',
             np.array(list(result.values())).mean())
Example #16
def main():
    """
    Main method for the script.
    """
    dataset = Dataset(file_path=DATASET_PATHS[CONFIG.dataset])
    df = dataset.get_dataframe()

    # remove columns that are constant values
    metric_headers = dataset.get_metric_headers()
    constant_headers = []
    variable_headers = []
    for header in metric_headers:
        if np.unique(df[header].values).size > 1:
            variable_headers.append(header)
        else:
            constant_headers.append(header)

    metric_headers = variable_headers
    dataset = Dataset(dataframe=df.drop(constant_headers, axis=1))
    raw_metrics = dataset.get_metrics()
    metrics = raw_metrics.T

    # factor analysis
    LOG.info('Starting factor analysis with %s factors...', CONFIG.num_factors)
    start = time()
    # model = FactorAnalysis(n_components=CONFIG.num_factors)
    # factors = model.fit_transform(metrics)  # num_metrics * num_factors
    rng = np.random.RandomState(74)
    model = GaussianRandomProjection(eps=0.999, random_state=rng)
    factors = model.fit_transform(metrics)
    LOG.debug('Dimension before factor analysis: %s', metrics.shape)
    LOG.debug('Dimension after factor analysis: %s', factors.shape)
    LOG.info('Finished factor analysis in %s seconds.', round(time() - start))

    # clustering
    if CONFIG.model == 'kmeans':
        model = build_k_means(factors)
    elif CONFIG.model == 'kmedoids':
        model = build_k_medoids(factors)
    else:
        raise ValueError(f'Unrecognized model: {CONFIG.model}')

    # find cluster center
    labels = model.labels_
    # each dimension in transformed_data is the distance to the cluster
    # centers.
    transformed_data = model.transform(factors)
    leftover_metrics = []
    for i in np.unique(labels):
        # index of the points for the ith cluster
        cluster_member_idx = np.argwhere(labels == i).squeeze(1)
        cluster_members = transformed_data[cluster_member_idx]
        # find the index of the minimum-distance point to the center
        closest_member = cluster_member_idx[np.argmin(cluster_members[:, i])]
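        # cluster_members[:, i] holds each member's distance to centroid i,
        # so the argmin above picks the cluster member closest to its own
        # cluster center.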
        leftover_metrics.append(metric_headers[closest_member])

    # latency needs to be in the metrics
    if 'latency' not in leftover_metrics:
        leftover_metrics += ['latency']

    with open(CONFIG.output_path, 'w') as file:
        file.writelines('\n'.join(leftover_metrics))
Example #17
    n_steps = 100000  #number of iterations

    lambda_p = 1.0
    lambda_r = 1.0

    # [Important]
    # lambda_w = sqrt(d / D) as in the paper; if you change the network
    # architecture, adjust it accordingly (d: data noise dim, D: feature dim).
    lambda_w = np.sqrt(noise_dim * 1.0 / feature_dim)
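    # For instance, with illustrative values noise_dim = 128 and
    # feature_dim = 2048 (neither value is fixed by this snippet),
    # lambda_w = sqrt(128 / 2048) = 0.25.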

    # output dir
    out_dir = os.path.join(out_dir, model, db_name)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # setup dataset
    dataset = Dataset(name=db_name, source=data_source)
    # setup gan model and train
    distgan = DISTGAN(model=model, \
                              loss_type = loss_type, \
                              lambda_p=lambda_p, lambda_r=lambda_r, \
                              lambda_w=lambda_w, \
                              noise_dim = noise_dim, \
                              dataset=dataset, \
                              n_steps = n_steps, \
                              out_dir=out_dir)
    if is_train:
        distgan.train()
    else:
        distgan.generate()
Example #18
                            '-l',
                            help='Setup tensorboard log directory')
    arg_parser.add_argument('--encode', '-e', default='identity',
                            help='Setup encoding function. Default runs without encoding.\n'
                                 'Options: identity onehot21 onehot22')

    args = arg_parser.parse_args()
    if args.logdir is None:
        print('Need a logdir. See python3 train.py --help')
        exit()
    elif os.path.isdir(args.logdir) and not args.force:
        print('logdir already exists')
        exit()

    data = Dataset(
        encode = args.encode, \
        trainfiles = ['data/baseline/train.tfrecord'], \
        testfiles = ['data/baseline/test.tfrecord'], \
        parallel_call = 4 \
    )

    model = MusiteDeepModel(dataset=data, logdir=args.logdir)

    start = datetime.datetime.now()
    model.train(epochs=500)
    end = datetime.datetime.now()
    print(end - start)

    model.close()
Example #19
if debug:
    
    debug = Debug(p_moduli, coeff_mod, precision, "model15", verbosity)
    
    if debug_plain:
        debug.test_plain_net()
    
    if debug_encoded:
        debug.test_encoded_net(0)
    
    if debug_encrypted:
        debug.test_encrypted_net(1)
        
else:

    ds = Dataset(verbosity = verbosity)
    (train, train_labels), (test, test_labels) = ds.load(2)
    
    exp = Exporter(verbosity = verbosity)
    # exp.exportBestOf(train, train_labels, test, test_labels, params, model_name="model15", num_test=10)
    
    model = exp.load(model_name='model15')
    
    test = test[:coeff_mod]
    test_labels = test_labels[:coeff_mod]
    
    cn = Cryptonet(test, test_labels, model, p_moduli, coeff_mod, precision, True)
    cn.evaluate()
    
    m = Model()
    acc = m.getAccuracy(model, test, test_labels)
Example #20
    #ext_name = 'C_%d_eps_%f_delta_%f_dp_%d_lr_%f_dups10_softmax_categorical_%d_cervical_cancer2' % (C, eps, delta, n_steps, lr, categorical_softmax_use)

    #output dir
    out_dir = os.path.join(out_dir, db_name + '_' + model + '_' \
                                          + nnet_type + '_' \
                                          + loss_type + '_' \
                                          + ext_name, db_name)

    #out_dir = os.path.join(out_dir, ext_name)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # setup dataset
    dataset = Dataset(name=db_name,
                      source=data_source,
                      categorical_softmax_use=categorical_softmax_use,
                      batch_size=64)

    # setup gan model and train
    distgan = DISTGAN(model=model, \
                              is_train = is_train, \
                              loss_type = loss_type, \
                              lambda_p=lambda_p, \
                              lambda_r=lambda_r, \
                              lambda_w=lambda_w, \
                              lambda_d=lambda_d, \
                              lambda_g=lambda_g, \
                              C   = C, \
                              eps = eps, \
                              delta = delta, \
                              regc = regc,\