Example #1
def get_dataset(seed=0, samples=50, test_split=0.5, save_dir=None, 
                us=[0], name='pendulum-gym-image-dataset.pkl', **kwargs):
    data = {}

    assert save_dir is not None
    # path = '{}/pendulum-small-angle-image-dataset.pkl'.format(save_dir)
    path = os.path.join(save_dir, name)
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        trajs_frames_force = []
        trajs_force = []
        for u in us:
            trajs_frames, trajs, tspan, _ = sample_gym(seed=seed, trials=samples, u=u, **kwargs)
            trajs_frames_force.append(trajs_frames)
            trajs_force.append(trajs)
        # make a train/test split
        split_ix = int(samples * test_split)
        tmp = np.stack(trajs_frames_force, axis=0) # (n_u, n_ts, n_trial, 50, 50)
        data['x'], data['test_x'] = tmp[:,:,:split_ix,:,:], tmp[:,:,split_ix:,:,:]
        tmp = np.stack(trajs_force, axis=0) # (n_u, n_ts, n_trial, 3)
        data['obs'], data['test_obs'] = tmp[:,:,:split_ix,:], tmp[:,:,split_ix:,:]

        data['t'] = tspan
        data['us'] = us

        to_pickle(data, path)
    return data
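Note: nearly all of these examples rely on to_pickle/from_pickle helpers that are not shown. A minimal sketch of what they might wrap, assuming they are thin wrappers around the standard pickle module and following the to_pickle(obj, path) argument order used in Example #1 (other examples pass the path first or go through a utils namespace):

import pickle

def to_pickle(obj, path):
    # Hypothetical helper: serialize obj to path using pickle.
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def from_pickle(path):
    # Hypothetical helper: load and return the object pickled at path.
    with open(path, 'rb') as f:
        return pickle.load(f)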
Example #2
def cut_dataset(model, alpha, dest, dataset_path, ds_name):
    dataset = utils.open_pickle(dataset_path)
    try: 
        dataset = dataset.sel(
            level=slice(np.min(model.tl_model.unique_pressure_lvls),np.max(model.tl_model.unique_pressure_lvls)), 
            lat=slice(model.tl_model.LAT_N, model.tl_model.LAT_S), lon=slice(model.tl_model.LON_W, model.tl_model.LON_E),
            time=slice('1999', '2019'))
    except ValueError:
        dataset = dataset.sel(
            lat=slice(model.tl_model.LAT_S, model.tl_model.LAT_N), lon=slice(model.tl_model.LON_W, model.tl_model.LON_E),
            time=slice('1999', '2019'))
    if model.tl_model.period == "NE_mon":
        dataset = dataset.sel(time=is_NE_mon(dataset['time.month']))
    elif model.tl_model.period == "SW_mon":
        dataset = dataset.sel(time=is_SW_mon(dataset['time.month']))
    elif model.tl_model.period == "inter_mon":
        dataset = dataset.sel(time=is_inter_mon(dataset['time.month']))

    if alpha != model.ALPHAs:
        gt_years = model.tl_model.years[(alpha-1)*model.PSI : alpha*model.PSI]
        train_years = np.delete(model.tl_model.years, np.arange((alpha-1) * model.PSI, alpha * model.PSI))
        test = utils.cut_year(dataset, np.min(gt_years), np.max(gt_years))
        train = utils.cut_year(dataset, np.min(train_years), np.max(train_years))
    else:
        gt_years = model.tl_model.years[(alpha-1)*model.PSI : alpha*model.PSI+model.runoff_years] 
        train_years = np.delete(model.tl_model.years, np.arange((alpha-1)*model.PSI, alpha*model.PSI+model.runoff_years)) 
        test = utils.cut_year(dataset, np.min(gt_years), np.max(gt_years))
        train = utils.cut_year(dataset, np.min(train_years), np.max(train_years))
    time.sleep(1); gc.collect()

    utils.to_pickle(f'{ds_name}_test_alpha_{alpha}_preprocessed', test, dest)
    utils.to_pickle(f'{ds_name}_train_alpha_{alpha}_preprocessed', train, dest)
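Note: utils.cut_year is not shown in this example. A minimal sketch of what it might do, assuming the dataset is an xarray object whose time coordinate supports label slicing by year string (an assumption, not the project's actual helper):

def cut_year(dataset, start_year, end_year):
    # Hypothetical: select the inclusive year range [start_year, end_year] on the time axis.
    return dataset.sel(time=slice(str(start_year), str(end_year)))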
Example #3
def get_dataset(seed=0, samples=50, test_split=0.5, save_dir=None, 
                us=[0], name='acrobot-gym-image-dataset-rgb-0.pkl', **kwargs):
    data = {}

    assert save_dir is not None
    path = save_dir + '/' + name
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        trajs_frames_force = []
        trajs_force = []
        for u in us:
            trajs_frames, trajs, tspan, _ = sample_gym(seed=seed, trials=samples, u=u, **kwargs)
            trajs_frames = (np.moveaxis(trajs_frames, -1, -3) / 255.0)
            trajs_frames_force.append(trajs_frames)
            trajs_force.append(trajs)
        # make a train/test split
        split_ix = int(samples * test_split)
        tmp = np.stack(trajs_frames_force, axis=0) # (n_u, n_ts, n_trial, 3, 64, 64)
        data['x'], data['test_x'] = tmp[:,:,:split_ix], tmp[:,:,split_ix:]
        tmp = np.stack(trajs_force, axis=0) # (n_u, n_ts, n_trial, 3)
        data['obs'], data['test_obs'] = tmp[:,:,:split_ix,:], tmp[:,:,split_ix:,:]

        data['t'] = tspan
        data['us'] = us

        to_pickle(data, path)
    return data
Example #4
def save_preloaded_raw_rf_data(model):
    rf_fpaths = get_files(model.raw_rf_dir)

    ds_RAINFALL = xr.open_mfdataset(rf_fpaths)

    utils.to_pickle('ds_RAINFALL', ds_RAINFALL, model.raw_rf_dir)
    return ds_RAINFALL
Example #5
def main():
    # Run the main application
    model_idx = train_ffnn_models()
    mse, mae, mape = run_ensemble(model_idx + 1)

    results = {"mse": mse, "mae": mae, "mape": mape}
    print(results)

    utils.to_pickle(Config.output_path + "results", results)
Example #6
def main(input_fname, output_fname):
    guide_to_seqnames = from_pickle(input_fname)

    fingerprinted_guides = [
        (fingerprint(guide), guide) for guide in guide_to_seqnames.keys()
    ]
    fingerprinted_guides.sort(key=lambda a: a[0])
    sorted_guides = [guide for _, guide in fingerprinted_guides]

    to_pickle(output_fname, sorted_guides)
Example #7
    def train_SOM(self, alpha=None):
        d_hp_dir_path = str(utils.models_dir / self.dir_hp_str)
        self.d_hp_dir_path = d_hp_dir_path
        os.makedirs(d_hp_dir_path, exist_ok=True)
        if not utils.find(f'*extent_{self.dir_str}.png', self.d_hp_dir_path):
            visualization.get_domain_geometry(self, self.d_hp_dir_path)
            
        models_dir_path = str(utils.models_dir / self.dir_hp_str / self.period) + f'_{self.month_names_joined}'
        os.makedirs(models_dir_path, exist_ok=True)
        self.models_dir_path = models_dir_path
        # utils.update_cfgfile('Paths', 'models_dir_path', self.models_dir_path)

        if alpha:
            destination = self.alpha_model_dir
            arr_path = self.alpha_standardized_stacked_arr_path
            prefix = f'alpha_{alpha}_'
            prompt = f'< alpha-{alpha} >'
        else:
            destination = self.models_dir_path
            arr_path = self.standardized_stacked_arr_path
            prefix = ''
            prompt = ''

        print(f'Destination: "{destination}", arr_path: "{arr_path}", prefix: "{prefix}"')

        if utils.find(f'*{prefix}som_model.pkl', destination):
            print(f'{utils.time_now()} - SOM model trained before, skipping...')
            self.som_model_path = utils.find(f'*{prefix}som_model.pkl', destination)[0]
        else:
            print(f'{utils.time_now()} - {prompt} No SOM model trained for {self.domain}, {self.period}, for {self.hyperparameters}, doing so now...')

            standardized_stacked_arr = utils.open_pickle(arr_path)

            sominitstarttime = timer(); print(f'{utils.time_now()} - Initializing MiniSom... ')
            som = MiniSom(self.gridsize, self.gridsize, # square
                        standardized_stacked_arr.shape[1],
                        sigma=self.sigma, learning_rate=self.learning_rate,
                        neighborhood_function='gaussian', random_seed=self.random_seed)
            """
            Note: initializing PCA for weights is faster (~1/2 hour), but for serialized arrays > 300mb, 
            chances are this will kill the RAM and halt the entire process. 
            """
##            try:
##                som.pca_weights_init(standardized_stacked_arr)
##            except MemoryError as e:
##                print(f'Memory error has occured: \n{e}')
            print(f"Initialization took {utils.time_since(sominitstarttime)}.\n")

            trainingstarttime = timer(); print(f"{utils.time_now()} - Beginning training.")
            getattr(som, self.training_mode)(standardized_stacked_arr, self.iterations, verbose=True)
            q_error = np.round(som.quantization_error(standardized_stacked_arr), 2)
            print(f"Training complete. Q error is {q_error}, time taken for training is {utils.time_since(trainingstarttime)}s\n")

            self.som_model_path = utils.to_pickle(f'{self.RUN_datetime}_{prefix}som_model', som, destination)
Example #8
def get_dataset(experiment_name, save_dir, **kwargs):
    '''Returns a PDE dataset.'''
    path = '{}/{}-dataset.pkl'.format(save_dir, experiment_name)

    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except:
        print(
            "Had a problem loading data from {}. Rebuilding dataset...".format(
                path))
        data = make_dataset(experiment_name, **kwargs)
        to_pickle(data, path)
        os.makedirs('{}/data/'.format(save_dir), exist_ok=True)
        import matplotlib as mpl
        mpl.use('Agg')
        import matplotlib.pyplot as plt
        u_all = np.concatenate([data['u'], data['test_u']], axis=0)
        energy_all = np.concatenate([data['energy'], data['test_energy']],
                                    axis=0)
        mass_all = u_all.sum(-1).squeeze(-1)
        for idx in range(len(u_all)):
            u = u_all[idx]
            energy = energy_all[idx]
            mass = mass_all[idx]
            fig, (ax1, ax2, ax3) = plt.subplots(3,
                                                1,
                                                sharex=True,
                                                figsize=(6., 6.),
                                                facecolor='white')
            t = data['t_eval']
            M = u.shape[-1]
            y = np.arange(M) / M
            T, Y = np.meshgrid(t, y)
            if experiment_name.startswith('ch'):
                ax1.pcolormesh(T,
                               Y,
                               u.squeeze(1).T,
                               cmap='seismic',
                               vmin=-1,
                               vmax=1)
            else:
                ax1.pcolormesh(T, Y, u.squeeze(1).T, cmap='seismic')
            ax1.set_aspect('auto')
            ax1.set_yticks((0 - .5 / M, 1 - .5 / M))
            ax1.set_yticklabels((0, 1))
            ax2.plot(t, energy)
            ax3.plot(t, mass)
            ax3.set_xticks((t[0], t[-1]))
            ax3.set_xticklabels((t[0], t[-1]))
            fig.savefig('{}/data/data_{}_{:02d}.png'.format(
                save_dir, experiment_name, idx))
            plt.close()
    return data
Example #9
def _preprocess_data():
    # Load store sale with state
    all_stores = pd.read_pickle(Config.store_data_path)

    # Add more columns of days of week and days of month
    trn_sls_dte = pd.to_datetime(all_stores.trn_sls_dte)
    all_stores["dayofweek"] = trn_sls_dte.dt.dayofweek
    all_stores["dayofmonth"] = trn_sls_dte.dt.day

    # Load weather data
    weather = pd.read_csv(Config.weather_data_path)

    # Join store sales and weather data
    store_weather = pd.merge(all_stores, weather,
                             left_on=["store_id", "trn_sls_dte"],
                             right_on=["location", "date"])
    store_weather.drop(['date', 'location', 'max_temp', 'min_temp'],
                       axis=1, inplace=True)

    # Sort the dataframe with date_time
    store_weather.sort_values(["store_id", "trn_sls_dte"], inplace=True)

    # Remove store with smaller than 180 days
    store_ids = store_weather.store_id.unique()
    for sid in store_ids:
        c_store = store_weather[store_weather.store_id == sid]
        if c_store.shape[0] < 180:
            store_weather = store_weather[store_weather.store_id != sid]

    # Shift the response variables to the past 1 day
    store_weather["p_total_revenue"] = store_weather.total_revenue.shift(1)
    store_weather["p_total_volume"] = store_weather.total_volume.shift(1)

    # Drop the first day of each store
    store_size = store_weather.groupby(["store_id"]).size()
    row_drop_idx = np.cumsum(store_size) - store_size
    store_weather.drop(store_weather.index[row_drop_idx], axis=0, inplace=True)

    # Backup store_ids
    store_weather["store_id_bk"] = store_weather.store_id

    # Factorize categorical features
    cat_cols = ["dayofweek", "dayofmonth", "state", "isholiday", "store_id"]
    for col in cat_cols:
        store_weather[col] = pd.factorize(store_weather[col])[0]

    # Drop the time column
    store_weather.drop(["trn_sls_dte"], axis=1, inplace=True)

    # Save the preprocessed dataframe to pickle object
    utils.to_pickle(Config.save_dir + "store_weather.pkl", store_weather)

    return store_weather
Example #10
def save_preloaded_raw_input_data(model):
    input_fpaths = get_files(model.raw_input_dir)
    CHOSEN_VARS_ds = []
    for var in model.CHOSEN_VARS:
        CHOSEN_VARS_ds.append([rf"{_}" for _ in input_fpaths if f"{var}" in _])

    ds_CHOSEN_VARS_renamed = xr.open_mfdataset(CHOSEN_VARS_ds, chunks={'time':4}).rename({
        'latitude':'lat', 'longitude':'lon', 'r':'rhum', 'u':'uwnd', 'v':'vwnd'
    })

    utils.to_pickle('ds_CHOSEN_VARS_renamed', ds_CHOSEN_VARS_renamed, model.raw_input_dir)
    return ds_CHOSEN_VARS_renamed
Example #11
def _train_test_split():
    # Build the store_weather dataframe
    store_weather_filename = Config.save_dir + "store_weather.pkl"
    if os.path.exists(store_weather_filename):
        store_weather = utils.from_pickle(store_weather_filename)
    else:
        store_weather = _preprocess_data()

    # Split train test for each store
    train = pd.DataFrame({})
    test = pd.DataFrame({})
    store_ids = store_weather.store_id_bk.unique()
    for sid in store_ids:
        c_store = store_weather[store_weather.store_id_bk == sid]
        s_train = c_store[:-Config.test_size]
        s_test = c_store[-Config.test_size:]
        train = train.append(s_train).reset_index().drop(["index"], axis=1)
        test = test.append(s_test).reset_index().drop(["index"], axis=1)

    # Scale numeric columns
    num_cols = ["p_total_revenue", "p_total_volume", "mean_temp",
                "total_precipitation", "total_snow"]
    scaler = MaxAbsScaler().fit(train.loc[:, num_cols])
    train.loc[:, num_cols] = scaler.transform(train.loc[:, num_cols])
    test.loc[:, num_cols] = scaler.transform(test.loc[:, num_cols])

    # Scale 2 output columns
    revenue_scale = MaxAbsScaler().fit(train.loc[:, ["total_revenue"]])
    volume_scale = MaxAbsScaler().fit(train.loc[:, ["total_volume"]])
    train.loc[:, ["total_revenue"]] = revenue_scale.transform(
        train.loc[:, ["total_revenue"]])
    test.loc[:, ["total_revenue"]] = revenue_scale.transform(
        test.loc[:, ["total_revenue"]])
    train.loc[:, ["total_volume"]] = volume_scale.transform(
        train.loc[:, ["total_volume"]])
    test.loc[:, ["total_volume"]] = volume_scale.transform(
        test.loc[:, ["total_volume"]])

    # Save the train/test dataframes to pickle objects
    utils.to_pickle(Config.save_dir + "train_set.pkl", train)
    utils.to_pickle(Config.save_dir + "test_set.pkl", test)

    # Save the 2 scaler for later use
    utils.to_pickle(Config.save_dir + "revenue_scale", revenue_scale)
    utils.to_pickle(Config.save_dir + "volume_scale", volume_scale)

    # Save store_ids
    utils.to_pickle(Config.save_dir + "store_id.pkl", store_ids)

    return train, test
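A short sketch of how the pickled artifacts above might be consumed downstream, assuming a utils.from_pickle counterpart (as used earlier in this example) and that model predictions come back in scaled units:

# Hypothetical downstream use: undo the output scaling on model predictions.
train, test = _train_test_split()
revenue_scale = utils.from_pickle(Config.save_dir + "revenue_scale")
# scaled_preds is assumed to come from a trained model, with shape (n_samples, 1):
# preds = revenue_scale.inverse_transform(scaled_preds)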
Example #12
def get_dataset(experiment_name, save_dir, **kwargs):
    '''Returns an orbital dataset. Also constructs
    the dataset if no saved version is available.'''

    path = '{}/{}-orbits-dataset.pkl'.format(save_dir, experiment_name)

    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        data = make_orbits_dataset(**kwargs)
        to_pickle(data, path)

    return data
Example #13
    def train_kmeans(self, alpha=None):
        if alpha:
            optimal_k = self.tl_model.optimal_k
            print(f'>> self.alpha_model_dir: {self.alpha_model_dir}')
            print(f'>> optimal_k: {optimal_k}')
            found = [i for i in Path(self.alpha_model_dir).glob(f'k-{optimal_k}_*')]
            if found: 
                self.alpha_cluster_dir = found[0]
            else: self.alpha_cluster_dir = str(Path(self.alpha_model_dir) / f"k-{optimal_k}_NOT-singled-out-as-potential-cluster-for-this-split")
            os.makedirs(self.alpha_cluster_dir, exist_ok=True)
            print(f'>> self.alpha_cluster_dir: {self.alpha_cluster_dir}')
            destination = self.alpha_cluster_dir
            prefix = f'alpha_{alpha}_'
        else:
            optimal_k = self.optimal_k
            destination = self.cluster_dir
            prefix = ''

        print(f'optimal_k: "{optimal_k}", destination: "{destination}", prefix: "{prefix}"')
            
        for phrase in ('kmeans_model', 'labels_ar', 'labels_to_coords', 'label_markers', 'target_ds_withClusterLabels', 'dates_to_ClusterLabels', 'RFprec_to_ClusterLabels_dataset'):
            if utils.find(f'*{phrase}*.pkl', destination): 
                print(f'>>>>>>>>> "self.{phrase}_path" initialized.')
                exec(f'self.{phrase}_path = utils.find(f\'*{phrase}*.pkl\', r"{destination}")[0]')
            else:
                print(f'{utils.time_now()} - No KMeans model trained for {self.domain}, {self.period}, for {self.hyperparameters}, doing so now...')
                som_weights_to_nodes = utils.open_pickle(self.som_weights_to_nodes_path)
                samples, features = som_weights_to_nodes.shape
                km = KMeans(n_clusters=optimal_k).fit(som_weights_to_nodes)
                print(f"n{utils.time_now()} - K-means estimator fitted, sample size is {samples} and number of features is {features}.")

                self.kmeans_model_path = utils.to_pickle(f'{self.RUN_datetime}_{prefix}kmeans_model', km, destination)
                self.serialize_kmeans_products(km, alpha)
                break
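Design note: the exec(...) call above assigns a dynamically named attribute from a string. A hedged alternative sketch using setattr, which performs the same assignment without string evaluation (assuming, as above, that utils.find returns a list of matching paths):

found = utils.find(f'*{phrase}*.pkl', destination)
if found:
    print(f'>>>>>>>>> "self.{phrase}_path" initialized.')
    setattr(self, f'{phrase}_path', found[0])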
Example #14
def preprocess_time_series(model, dest, nfold_ALPHA=None, desired_res=0.75):   
    """
    Preparing datasets for use in training algorithms
    - dropping missing values
    - ensuring both target & input datasets have same dates 
    - coarsening spatial resolution of rainfall(target) dataset to desired resolution
    - pickling these "preprocessed" datasets
    """
    target_ds = utils.open_pickle(model.input_ds_serialized_path)
    rf_target_ds = utils.open_pickle(model.rf_ds_serialized_path)

    # removing NA rows, extraneous dates, & coarsening dates accordingly
    print(f'{utils.time_now()} - Preprocessing data now.')
    
    try:
        rf_target_ds['time'] =  rf_target_ds.indexes['time'].to_datetimeindex() #converting CFTimeIndex -> DateTime Index 
    except AttributeError:
        print('AttributeError: \'DatetimeIndex\' object has no attribute \'to_datetimeindex\', continuing regardless...')
        pass

    earliest_rf_reading, latest_rf_reading =  rf_target_ds.isel(time=0).time.values,  rf_target_ds.isel(time=-1).time.values
    earliest_target_ds_reading, latest_target_ds_reading = target_ds.isel(time=0).time.values, target_ds.isel(time=-1).time.values
    earliest_date = earliest_target_ds_reading if earliest_target_ds_reading > earliest_rf_reading else earliest_rf_reading
    latest_date = latest_target_ds_reading if latest_target_ds_reading < latest_rf_reading else latest_rf_reading

    rf_ds_preprocessed =  rf_target_ds.sel(time=slice(earliest_date, latest_date))
    target_ds = target_ds.sel(time=slice(earliest_date, latest_date))

    more_time_gaps = [i for i in target_ds.time.data if i not in rf_ds_preprocessed.time.data]
    more_time_gaps = more_time_gaps+[i for i in rf_ds_preprocessed.time.data if i not in target_ds.time.data]
    valid_dates = [date for date in target_ds.time.data if date not in more_time_gaps]
    target_ds = target_ds.sel(time = valid_dates)
    coarsen_magnitude = int(desired_res/np.ediff1d(target_ds.isel(lon=slice(0,2)).lon.data)[0])
    print(f'Coarsen magnitude set at: {coarsen_magnitude} toward desired spatial resolution of {desired_res}')
    target_ds_preprocessed = target_ds.coarsen(lat=coarsen_magnitude, lon=coarsen_magnitude, boundary='trim').mean()
        
    target_ds_preprocessed_path = utils.to_pickle('target_ds_preprocessed', target_ds_preprocessed, dest)
    rf_ds_preprocessed_path = utils.to_pickle('rf_ds_preprocessed', rf_ds_preprocessed, dest)

    target_ds_preprocessed = utils.remove_expver(target_ds_preprocessed)

    if nfold_ALPHA:
        for alpha in range(nfold_ALPHA):
            pass

    return target_ds_preprocessed_path, rf_ds_preprocessed_path
Example #15
    def get_dataset(self, experiment_name, save_dir):
        '''Returns the trajectory dataset. Also constructs
           the dataset if no saved version is available.'''

        path = '{}/{}-orbits-dataset_{}_EnsemblesPerEnergy_{}_OrbitLen_{}_Resolution_{}_energyPoints{}.pkl'.format(
            save_dir, experiment_name, self.integrator, self.ensembles,
            self.tspan[1], self.time_points, self.energyPoints)
        #path = "../Henon-Heiles-orbits-dataset_RK45_EnsemblesPerEnergy_20_OrbitLen_5000_Resolution_50000_energyPoints20.pkl"
        #path = "../Henon-Heiles-orbits-dataset_RK45_EnsemblesPerEnergy_20_OrbitLen_1000_Resolution_10000_energyPoints20.pkl"
        try:
            data = from_pickle(path)
            print("Successfully loaded data from {}".format(path))
        except:
            print("Had a problem loading data from {}. Rebuilding dataset...".
                  format(path))
            data = self.make_orbits_dataset()
            to_pickle(data, path)

        return data
Example #16
def select_category_value_agg(base,
                              df,
                              level,
                              category_col,
                              value,
                              method,
                              path='../features/1_first_valid',
                              ignore_list=[],
                              prefix='',
                              null_val='XNA'):
    '''
    Explain:
        Likelihood encoding.
        Restrict rows to each value of the categorical column, aggregate at the
        level granularity, and create features from the result. Rather than
        aggregating while ignoring the categorical column, aggregate per category
        value so that each category's statistics are carried as separate columns
        at the level granularity.
    Args:
    Return:
    '''

    df = df[[level, category_col, value]]
    # If the categorical column contains nulls, aggregate them under the XNA label
    df[category_col].fillna(null_val, inplace=True)

    # Iterate over the distinct values of the categorical column
    for cat_val in df[category_col].drop_duplicates().values:

        # Restrict to the current category value
        df_cat = df.query(f"{category_col} == '{cat_val}'")

        if df_cat[value].dtype != 'object':

            logger.info(
                f"\ncat: {category_col}\ncat_val: {cat_val}\nval: {value}\nmethod: {method}"
            )

            result = base_aggregation(df_cat, level, value, method)
            result = base.merge(result, on=level, how='left')

            feature_list = [col for col in result.columns if col.count('@')]
            for feature in feature_list:
                utils.to_pickle(path=f'{path}/{prefix}{feature}.fp',
                                obj=result[feature].values)
Example #17
def main(input_fnames, output_fname):
    fnames = []
    for input_sequence in input_fnames:
        fnames.extend(glob(input_sequence))

    name_to_seqs = defaultdict(list)
    for fname in fnames:
        for seq_record in SeqIO.parse(fname, "fasta"):
            sequence = str(seq_record.seq).replace("-", "")
            name_to_seqs[seq_record.name].append(sequence)

    print("Creating database")
    guide_to_seqname = defaultdict(set)
    for seqname, sequences in name_to_seqs.items():
        for sequence in sequences:
            for i in range(len(sequence) - 22):
                guide = sequence[i:i + 22]
                guide_to_seqname[guide].add(seqname)

    to_pickle(output_fname, guide_to_seqname)
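A brief usage sketch for the pickled guide database built above, assuming the matching from_pickle helper: each key is a 22-nucleotide guide and each value is the set of sequence names that contain it.

guide_to_seqname = from_pickle("guides.pkl")   # hypothetical path
guide = "ACGT" * 5 + "AC"                      # any 22-mer from the input FASTA
hit_names = guide_to_seqname.get(guide, set())
print(f"{guide} occurs in {len(hit_names)} sequence(s)")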
Example #18
    def assign_test_clusters_to_datasets(self):
        target_ds_preprocessed = utils.open_pickle(utils.find('*target_ds_preprocessed.pkl', self.test_prepared_data_dir)[0])
        rf_ds_preprocessed = utils.open_pickle(utils.find('*rf_ds_preprocessed.pkl', self.test_prepared_data_dir)[0])
        standardized_stacked_arr = utils.open_pickle(utils.find('*standardized_stacked_arr.pkl', self.test_prepared_data_dir)[0])
        
        self.n_datapoints = target_ds_preprocessed.time.shape[0] # length of xr_dataset
        self.lat_size = target_ds_preprocessed.lat.shape[0]
        self.lon_size = target_ds_preprocessed.lon.shape[0]
        self.months = np.unique(target_ds_preprocessed['time.month'].values) # month numbers
        self.month_names = [calendar.month_name[m][:3] for m in np.unique(target_ds_preprocessed['time.month'])]
        self.month_names_joined = '_'.join(self.month_names).upper() # to print months properly
        self.years = np.unique(target_ds_preprocessed['time.year'].values) # unique years
        self.X, self.Y = target_ds_preprocessed.lon, target_ds_preprocessed.lat

        km = utils.open_pickle(self.kmeans_model_path)
        predicted_clusters = km.predict(standardized_stacked_arr.astype(float))
        target_ds_withClusterLabels = target_ds_preprocessed.assign_coords(cluster=("time", predicted_clusters))
        dates_to_ClusterLabels = target_ds_withClusterLabels.cluster.reset_coords()
        RFprec_to_ClusterLabels_dataset = xr.merge([rf_ds_preprocessed, dates_to_ClusterLabels])
        utils.to_pickle('target_ds_withClusterLabels', target_ds_withClusterLabels, self.test_prepared_data_dir)
        utils.to_pickle('RFprec_to_ClusterLabels_dataset', RFprec_to_ClusterLabels_dataset, self.test_prepared_data_dir)
Example #19
    def serialize_kmeans_products(self, km, alpha):
        if alpha:
            arr_path = self.alpha_standardized_stacked_arr_path
            uniq_markers = self.tl_model.uniq_markers
            destination = self.alpha_cluster_dir
        else:
            arr_path = self.standardized_stacked_arr_path
            uniq_markers = self.uniq_markers
            destination = self.cluster_dir

        print(f'arr_path: "{arr_path}", uniq_markers: "{uniq_markers}", destination: "{destination}"')

        standardized_stacked_arr = utils.open_pickle(arr_path)
        target_ds = utils.open_pickle(self.target_ds_preprocessed_path)
        rf_ds_preprocessed = utils.open_pickle(self.rf_ds_preprocessed_path)

        labels_ar = km.labels_
        labels_to_coords = np.zeros([len(labels_ar), 2])
        for i, var in enumerate(labels_ar): labels_to_coords[i] = i % self.gridsize, i // self.gridsize

        try:
            label_markers = np.array([uniq_markers[var] for i, var in enumerate(labels_ar)])
        except IndexError: # more than 12 clusters
            label_markers = np.array([(uniq_markers*3)[var] for i, var in enumerate(labels_ar)])
            
        target_ds_withClusterLabels = target_ds.assign_coords(cluster=("time", km.predict(standardized_stacked_arr.astype(float))))
        dates_to_ClusterLabels = target_ds_withClusterLabels.cluster.reset_coords()
        RFprec_to_ClusterLabels_dataset = xr.merge([rf_ds_preprocessed, dates_to_ClusterLabels])

        self.labels_ar_path = utils.to_pickle(f'{self.RUN_datetime}_labels_ar', labels_ar, destination)
        self.labels_to_coords_path = utils.to_pickle(f'{self.RUN_datetime}_labels_to_coords', labels_to_coords, destination)
        self.label_markers_path = utils.to_pickle(f'{self.RUN_datetime}_label_markers', label_markers, destination)
        self.target_ds_withClusterLabels_path = utils.to_pickle(f'{self.RUN_datetime}_target_ds_withClusterLabels', target_ds_withClusterLabels, destination)
        self.dates_to_ClusterLabels_path = utils.to_pickle(f'{self.RUN_datetime}_dates_to_ClusterLabels', dates_to_ClusterLabels, destination)
        self.RFprec_to_ClusterLabels_dataset_path = utils.to_pickle(f'{self.RUN_datetime}_RFprec_to_ClusterLabels_dataset', RFprec_to_ClusterLabels_dataset, destination)
Example #20
def NFoldcrossvalidation_eval(alpha_level_model):
    """
    Sequence of evaluation procedure
    1. prepare_nfold_datasets(): split into years according to PSI & PSI_overlap
    2. check_evaluated(): skip if scores were already generated for that split/n^th-fold
    3. else, for this alpha level:
    - detect_serialized_datasets(): is there such data
    - detect_prepared_datasets(): is there such cleaned datasets
    - train_SOM() & detect_som_products(): has SOM model & products already been generated
    - generate_k(): using internal val. to create folders of clustering quality
    - train_kmeans(): using optimal_k to generate kmeans model
    - print_outputs(): print outputs to see differences across the ALPHAs
    - evaluation_procedure(): generating AUC, Brier, etc. scores for this alpha split
    4. compile_scores(): generating an overall score for this clustering attempt (domain, period, hpparam)
    """
    finished_evaluation = alpha_level_model.check_evaluated()
    if finished_evaluation:
        print(
            f'Evaluation has already been finished & you can find the evaluation results @:\n{finished_evaluation}'
        )
        return

    print(f'Commencing evaluation!')
    alpha_level_model.prepare_nfold_datasets()

    for alpha in range(1, alpha_level_model.ALPHAs + 1):
        evaluated = alpha_level_model.check_evaluated(alpha)
        if evaluated:
            continue

        alpha_level_model.prepare_alphafold_dataset(alpha)
        alpha_level_model.train_SOM(alpha)
        alpha_level_model.detect_som_products(alpha)
        alpha_level_model.generate_k(alpha)
        alpha_level_model.train_kmeans(alpha)
        #alpha_level_model.print_outputs(alpha)
        alpha_level_model.evaluation_procedure(alpha)
    utils.to_pickle('alpha_level_model', alpha_level_model,
                    alpha_level_model.alpha_general_dir)
    alpha_level_model.compile_scores()
Example #21
def get_dataset(experiment_name, save_dir, **kwargs):
  '''Returns a dataset built on top of OpenAI Gym observations. Also constructs
  the dataset if no saved version is available.'''
  
  if experiment_name == "pendulum":
    env_name = "Pendulum-v0"
  elif experiment_name == "acrobot":
    env_name = "Acrobot-v1"
  else:
    assert experiment_name in ['pendulum', 'acrobot']
    
  path = '{}/{}-pixels-dataset.pkl'.format(save_dir, experiment_name)

  try:
      data = from_pickle(path)
      print("Successfully loaded data from {}".format(path))
  except:
      print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
      data = make_gym_dataset(**kwargs)
      to_pickle(data, path)

  return data
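A minimal usage sketch for this loader. The extra keyword arguments are forwarded to make_gym_dataset, whose signature is not shown, so only the documented positional arguments are assumed here:

# Hypothetical call: load (or rebuild and cache) the pendulum pixel dataset.
data = get_dataset("pendulum", save_dir="./data")
print(sorted(data.keys()))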
Example #22
    def detect_som_products(self, alpha=None):
        if alpha:
            destination = self.alpha_model_dir
            arr_path = self.alpha_standardized_stacked_arr_path
            prefix = f'alpha_{alpha}_'
            prompt = f'< alpha-{alpha} >'
        else:
            destination = self.models_dir_path
            arr_path = self.standardized_stacked_arr_path
            prefix = ''
            prompt = ''

        print(f'Destination: "{destination}", arr_path: "{arr_path}", prefix: "{prefix}", prompt:"{prompt}"')

        for phrase in ('winner_coordinates', 'dmap', 'ar', 'som_weights_to_nodes'):
            if utils.find(f'*{prefix}{phrase}.pkl', destination): 
                p = utils.find(f'*{prefix}{phrase}.pkl', destination)
                print(f'{utils.time_now()} - {prefix}{phrase} is found @: \n{p[0]}')
                exec(f'self.{phrase}_path = {p}[0]')
            else:
                print(f'{utils.time_now()} - {prompt} Some SOM products are missing, generating all products now...')
                som = utils.open_pickle(self.som_model_path)
                standardized_stacked_arr = utils.open_pickle(arr_path)
                winner_coordinates = np.array([som.winner(x) for x in standardized_stacked_arr]) 
                dmap = som.distance_map()
                ar = som.activation_response(standardized_stacked_arr) 
                som_weights = som.get_weights() # weights for training via k-means
                som_weights_to_nodes = np.array(
                    [som_weights[c,r] for r in range(self.gridsize) for c in range(self.gridsize)]) #kmeans clustering
                
                self.winner_coordinates_path = utils.to_pickle(f'{prefix}winner_coordinates', winner_coordinates, destination)
                self.dmap_path = utils.to_pickle(f'{prefix}dmap', dmap, destination)
                self.ar_path = utils.to_pickle(f'{prefix}ar', ar, destination)
                self.som_weights_to_nodes_path = utils.to_pickle(f'{prefix}som_weights_to_nodes', som_weights_to_nodes, destination)

                break

        print('SOM products serialized.')
Example #23
def prepare_dataset(model, dest):
    """
    - xr.open_mfdataset(): loading
    - restricting to certain variables + "levels" of variables
    - combining the variable xarrays into one
    - restricting to 1999 through 2019 only
    - slicing domain dimensions to the required specs (i.e. model.LON_S, model.LON_N, etc...)
    - slicing to the chosen period only
    - pickling the datasets (both input & rainfall) & returning them
    """
    # searching for raw data pickles
    preloaded_input_pickles = utils.find('*.pkl', model.raw_input_dir)
    if preloaded_input_pickles:
        print('Preloaded raw INPUT data pickles found...')
        ds_CHOSEN_VARS_renamed = utils.open_pickle(utils.find('*.pkl', model.raw_input_dir)[0])
    else: 
        print('Creating pickles of raw input data...')
        ds_CHOSEN_VARS_renamed = save_preloaded_raw_input_data(model)
    
    preloaded_input_pickles = utils.find('*.pkl', model.raw_rf_dir)
    if preloaded_input_pickles:
        print('Preloaded raw rainfall data pickles found...')
        ds_RAINFALL = utils.open_pickle(utils.find('*.pkl', model.raw_rf_dir)[0])
    else: 
        print('Creating pickles of raw rainfall data...')
        ds_RAINFALL = save_preloaded_raw_rf_data(model)

    print("Proceeding to do preliminary data cleaning...")
    ds_sliced = ds_CHOSEN_VARS_renamed.sel(
        level=slice(np.min(model.unique_pressure_lvls),np.max(model.unique_pressure_lvls)), 
        lat=slice(model.LAT_N,model.LAT_S), lon=slice(model.LON_W,model.LON_E),
        time=slice('1999', '2019'))
    ds_sliced_rhum = ds_sliced.rhum
    ds_sliced_rhum_no925 = ds_sliced_rhum.drop_sel({"level":925})
    ds_sliced_uwnd_only = ds_sliced.uwnd
    ds_sliced_vwnd_only = ds_sliced.vwnd
    ds_combined_sliced = xr.merge([ds_sliced_rhum_no925, ds_sliced_uwnd_only, ds_sliced_vwnd_only], compat='override')

    rf_ds_sliced = ds_RAINFALL.sel(lat=slice(model.LAT_S, model.LAT_N), lon=slice(model.LON_W,model.LON_E))
    print('Pickling domain- & feature-constrained input & RF datasets...')
    if model.period == "NE_mon":
        input_ds = ds_combined_sliced.sel(time=is_NE_mon(ds_combined_sliced['time.month']))
        rf_ds = rf_ds_sliced.sel(time=is_NE_mon(rf_ds_sliced['time.month']))
        input_ds_serialized_path = utils.to_pickle('raw_input_ds_NE_mon_serialized', input_ds, dest)
        rf_ds_serialized_path = utils.to_pickle('raw_rf_ds_NE_mon_serialized', rf_ds, dest)
        return input_ds_serialized_path, rf_ds_serialized_path
    elif model.period == "SW_mon":
        input_ds = ds_combined_sliced.sel(time=is_SW_mon(ds_combined_sliced['time.month']))
        rf_ds = rf_ds_sliced.sel(time=is_SW_mon(rf_ds_sliced['time.month']))
        input_ds_serialized_path = utils.to_pickle('raw_input_ds_SW_mon_serialized', input_ds, dest)
        rf_ds_serialized_path = utils.to_pickle('raw_rf_ds_SW_mon_serialized', rf_ds, dest)
        return input_ds_serialized_path, rf_ds_serialized_path
    elif model.period == "inter_mon":
        input_ds = ds_combined_sliced.sel(time=is_inter_mon(ds_combined_sliced['time.month']))
        rf_ds = rf_ds_sliced.sel(time=is_inter_mon(rf_ds_sliced['time.month']))
        input_ds_serialized_path = utils.to_pickle('raw_input_ds_inter_mon_serialized', input_ds, dest)
        rf_ds_serialized_path = utils.to_pickle('raw_rf_ds_inter_mon_serialized', rf_ds, dest)
        return input_ds_serialized_path, rf_ds_serialized_path
Example #24
def get_dataset(experiment_name, save_dir, u, **kwargs):
    '''Returns a dataset built on top of OpenAI Gym observations. Also constructs
    the dataset if no saved version is available.'''

    if experiment_name == "pendulum":
        env_name = "Pendulum-v0"
    elif experiment_name == "acrobot":
        env_name = "Acrobot-v1"
    elif experiment_name == "cartpole":
        env_name = 'My_FA_CartPole-v0'
    else:
        assert experiment_name in ['pendulum', 'acrobot', 'cartpole']

    path = '{}/{}-pixels-dataset.pkl'.format(save_dir, experiment_name)

    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except:
        print(
            "Had a problem loading data from {}. Rebuilding dataset...".format(
                path))

        data = {}
        for u_ in u:
            data_ = make_gym_dataset(u=u_, **kwargs)
            for k, v in data_.items():
                if k in ['meta']:
                    continue

                new = data_[k]
                old = data.get(k, np.array([]).reshape(0, new.shape[1]))
                data[k] = np.vstack((old, data_[k]))

        to_pickle(data, path)

    return data
Example #25
def flatten_and_standardize_dataset(model, dest):

    target_ds_preprocessed = utils.open_pickle(model.target_ds_preprocessed_path)
    target_ds_preprocessed = utils.remove_expver(target_ds_preprocessed)
    
    # reshaping
    reshapestarttime = timer(); print(f"{utils.time_now()} - Reshaping data now...")
    print(f"\n{utils.time_now()} - reshaping rhum dataarrays now, total levels to loop: {model.rhum_pressure_levels}.")

    reshaped_unnorma_darrays = {}
    reshaped_unnorma_darrays['rhum'], reshaped_unnorma_darrays['uwnd'], reshaped_unnorma_darrays['vwnd'] = {}, {}, {}

    for level in model.rhum_pressure_levels:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['rhum'][level] = np.reshape(
            target_ds_preprocessed.rhum.sel(level=level).values, (model.n_datapoints, model.lat_size*model.lon_size ))

    print(f"\n{utils.time_now()} - reshaping uwnd/vwnd dataarrays now, total levels to loop: {model.uwnd_vwnd_pressure_lvls}.")

    for level in model.uwnd_vwnd_pressure_lvls:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['uwnd'][level] = np.reshape(
            target_ds_preprocessed.uwnd.sel(level=level).values, (model.n_datapoints, model.lat_size*model.lon_size ))
        reshaped_unnorma_darrays['vwnd'][level] = np.reshape(
            target_ds_preprocessed.vwnd.sel(level=level).values, (model.n_datapoints, model.lat_size*model.lon_size ))

    reshapetime = timer()-reshapestarttime; reshapetime = str(datetime.timedelta(seconds=reshapetime)).split(".")[0]; print(f'Time taken: {reshapetime}s.\n')

    # stacking unstandardized dataarrays
    stackingstarttime = timer(); print("Stacking unstandardized dataarrays now...")
    stacked_unstandardized_ds = np.hstack([reshaped_unnorma_darrays[var][lvl] for var in reshaped_unnorma_darrays for lvl in reshaped_unnorma_darrays[var]])

    stackingtime = timer()-stackingstarttime; stackingtime = str(datetime.timedelta(seconds=stackingtime)).split(".")[0]; print(f'Time taken: {stackingtime}s.\n')

    # standardizing the stacked dataarrays
    standardizestarttime = timer(); print("standardizing stacked dataarrays now...")
    print(f'"stacked_unstandardized_ds.shape" is {stacked_unstandardized_ds.shape}')
    transformer = RobustScaler(quantile_range=(25, 75))
    standardized_stacked_arr = transformer.fit_transform(stacked_unstandardized_ds) # som & kmeans training
    transformer.get_params()
    standardizetime = timer()-standardizestarttime; standardizetime = str(datetime.timedelta(seconds=standardizetime)).split(".")[0]; print(f'That took {standardizetime}s to complete.\n')

    standardized_stacked_arr_path = utils.to_pickle('standardized_stacked_arr', standardized_stacked_arr, dest)

    return standardized_stacked_arr_path
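A self-contained illustration of the standardization step above, using scikit-learn's RobustScaler with the same quantile_range=(25, 75) on a small toy array (the real input is the stacked, reshaped dataarrays):

import numpy as np
from sklearn.preprocessing import RobustScaler

toy = np.array([[1.0, 200.0],
                [2.0, 220.0],
                [3.0, 240.0],
                [100.0, 260.0]])   # one outlier in the first column
scaler = RobustScaler(quantile_range=(25, 75))
standardized = scaler.fit_transform(toy)
print(standardized)   # each column centered on its median and scaled by its IQR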
Example #26
    def serialize(self, obj, **kw):
        """
        Function for serializing object => string.
        This can be overwritten for custom 
        uses.

        The default is to do nothing ('serializer'=None)
        If the connection is intialized with 'serializer' set to 
        'json.gz', 'json', 'gz', or 'zip', we'll do the 
        transformations.
        """
        serializer = kw.get('serializer',  self._serializer)

        if serializer == "json.gz":
            return utils.to_gz(utils.to_json(obj))
        
        elif serializer == "json":
            return utils.to_json(obj)

        elif serializer == "gz":
            assert(isinstance(obj, basestring))
            return utils.to_gz(obj)

        elif serializer == "zip":
            assert(isinstance(obj, basestring))
            return utils.to_zip(obj)

        elif serializer == "pickle":
            return utils.to_pickle(obj)

        elif serializer is not None:

            raise NotImplementedError(
                'Only json, gz, json.gz, zip, and pickle '
                'are supported as serializers.')

        return obj
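The json.gz branch above chains two utils helpers that are not shown. A minimal sketch of what they might wrap, assuming the standard-library json and gzip modules (the project's actual helpers may differ):

import gzip
import json

def to_json(obj):
    # Hypothetical: serialize a Python object to a JSON string.
    return json.dumps(obj)

def to_gz(s):
    # Hypothetical: gzip-compress a string, returning bytes.
    return gzip.compress(s.encode("utf-8"))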
Example #27
    train_loss = torch.cat(train_loss, dim=1)
    train_loss_per_traj = torch.sum(train_loss, dim=(0,2))

    test_loss = torch.cat(test_loss, dim=1)
    test_loss_per_traj = torch.sum(test_loss, dim=(0,2))

    print('Final trajectory train loss {:.4e} +/- {:.4e}\nFinal trajectory test loss {:.4e} +/- {:.4e}'
    .format(train_loss_per_traj.mean().item(), train_loss_per_traj.std().item(),
            test_loss_per_traj.mean().item(), test_loss_per_traj.std().item()))

    stats['traj_train_loss'] = train_loss_per_traj.detach().cpu().numpy()
    stats['traj_test_loss'] = test_loss_per_traj.detach().cpu().numpy()

    return model, stats


if __name__ == "__main__":
    args = get_args()
    model, stats = train(args)

    # save 
    os.makedirs(args.save_dir, exist_ok=True)
    label = '-baseline_ode' if args.baseline else '-hnn_ode'
    struct = '-struct' if args.structure else ''
    rad = '-rad' if args.rad else ''
    path = '{}/{}{}{}-{}-p{}{}.tar'.format(args.save_dir, args.name, label, struct, args.solver, args.num_points, rad)
    torch.save(model.state_dict(), path)
    path = '{}/{}{}{}-{}-p{}-stats{}.pkl'.format(args.save_dir, args.name, label, struct, args.solver, args.num_points, rad)
    to_pickle(stats, path)
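A hedged sketch of reloading the saved artifacts for later analysis, assuming a from_pickle counterpart is importable alongside to_pickle in this script:

# Hypothetical reload of the artifacts saved above.
stats = from_pickle(path)   # `path` still points at the stats .pkl here
# To reuse the weights, rebuild the model with the same args, then:
# model.load_state_dict(torch.load(weights_path))   # weights_path: the .tar path saved above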
Example #28
        else:
            x_stack.append(x[:, i:])
    x_stack = np.stack(x_stack, axis=1) # n_u, n_p, ts+1-n_p, bs, 32, 32
    x_stack = np.reshape(x_stack, 
                (x.shape[0], num_points, -1, *x.shape[3:])) # n_u, n_p, (ts+1-n_p)*bs, 32, 32
    t_eval = t[0:num_points]
    return x_stack, t_eval

if __name__ == "__main__":
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    # load data
    us = [[0.0, 0.0], [0.0, 1.0], [0.0, -1.0], [0.0, 2.0], [0.0, -2.0],
            [1.0, 0.0], [-1.0, 0.0], [2.0, 0.0], [-2.0, 0.0]]
    # us = [[0.0, 0.0]]
    ts = 20 ; ss = 512
    data = get_dataset(seed=0, timesteps=ts,
                save_dir=THIS_DIR, us=us, samples=ss, test_split=0.50,
                name='cartpole-gym-image-dataset-rgb-u9.pkl')

    train_data = {}
    train_data['x'] = data['x']
    train_data['obs'] = data['obs']
    train_data['t'] = data['t']
    train_data['us'] = data['us']
    to_pickle(train_data, THIS_DIR + '/' + 'cartpole-gym-image-dataset-rgb-u9-train.pkl')
    test_data = {}
    test_data['x'] = data['test_x']
    test_data['obs'] = data['test_obs']
    test_data['t'] = data['t']
    test_data['us'] = data['us']
    to_pickle(test_data, THIS_DIR + '/' + 'cartpole-gym-image-dataset-rgb-u9-test.pkl')
Example #29
            x_stack.append(x[:, i:])
    x_stack = np.stack(x_stack, axis=1) # n_u, n_p, ts+1-n_p, bs, 32, 32
    x_stack = np.reshape(x_stack, 
                (x.shape[0], num_points, -1, *x.shape[3:])) # n_u, n_p, (ts+1-n_p)*bs, 32, 32
    t_eval = t[0:num_points]
    return x_stack, t_eval

if __name__ == "__main__":
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    np.random.seed(0)
    # load data
    us = [[0.0, 0.0], [0.0, 1.0], [0.0, -1.0], [0.0, 2.0], [0.0, -2.0],
            [1.0, 0.0], [-1.0, 0.0], [2.0, 0.0], [-2.0, 0.0]]
    # us = [[0.0, 0.0]]
    ts = 20 ; ss = 512
    data = get_dataset(seed=0, timesteps=ts,
                save_dir=THIS_DIR, us=us, samples=ss, test_split=0.50,
                name='acrobot-gym-image-dataset-rgb-u9.pkl')

    train_data = {}
    train_data['x'] = data['x']
    train_data['obs'] = data['obs']
    train_data['t'] = data['t']
    train_data['us'] = data['us']
    to_pickle(train_data, THIS_DIR + '/' + 'acrobot-gym-image-dataset-rgb-u9-train.pkl')
    test_data = {}
    test_data['x'] = data['test_x']
    test_data['obs'] = data['test_obs']
    test_data['t'] = data['t']
    test_data['us'] = data['us']
    to_pickle(test_data, THIS_DIR + '/' + 'acrobot-gym-image-dataset-rgb-u9-test.pkl')
Example #30
    def generate_k(self, alpha=None):
        """
        - detection of metrices to infer "k", i.e. optimal_k value
        - creation of metrices pickles 
        - creation of folders in models_dir to indicate potential k values/cluster combinations
        """
        metrics_dir = str(utils.metrics_dir / self.dir_hp_str / self.period) + f'_{self.month_names_joined}'
        os.makedirs(metrics_dir, exist_ok=True)
        self.metrics_dir_path = metrics_dir

        if alpha:
            self.alpha_metrics_dir_path = str(Path(self.tl_model.metrics_dir_path) / f'alpha_{alpha}')
            metric_destination = self.alpha_metrics_dir_path
            os.makedirs(metric_destination, exist_ok=True)
            model_destination = self.alpha_model_dir
            prefix = f'alpha_{alpha}_'
            prompt = f'< alpha-{alpha} >'
        else:
            metric_destination = self.metrics_dir_path
            model_destination = self.models_dir_path
            prefix = ''
            prompt = ''

        print(f'metric_destination: "{metric_destination}", model_destination: "{model_destination}", prefix: "{prefix}", prompt:"{prompt}"')
        
        for phrase in ('sil_peaks', 'ch_max', 'dbi_min', 'reasonable_sil', 'ch_dbi_tally', 'n_expected_clusters', 'dbs_err_dict'):
            if utils.find(f'*{prefix}{phrase}*.pkl', metric_destination): pass
            else:
                print(f'{utils.time_now()} - {prompt} Not all metrics have been found in {metric_destination}, generating them now...')
                # print all metrices if even 1 not found
                som_weights_to_nodes = utils.open_pickle(self.som_weights_to_nodes_path)

                ch_scores, dbi_scores = validation.print_elbow_CH_DBI_plot(self, som_weights_to_nodes, metric_destination)
                yellowbrick_expected_k = validation.print_yellowbrickkelbow(self, som_weights_to_nodes, metric_destination)
                silhouette_avgs, reasonable_silhoutte_scores_mt50 = validation.print_silhoutte_plots(self, som_weights_to_nodes, metric_destination)
                dbstop10 = validation.print_dbs_plots(self, som_weights_to_nodes, metric_destination)
                
                eps_ls, dbs_k_ls, dbs_noisepts_ls, dbs_labels = [], [], [], []
                for i in dbstop10:
                    eps_ls.append(i[0])
                    dbs_k_ls.append(i[1])
                    dbs_noisepts_ls.append(i[2])
                    dbs_labels.append(i[3])

                sil_peaks, ch_max, dbi_min, reasonable_sil, ch_dbi_tally, n_expected_clusters, dbs_err_dict = validation.get_cluster_determination_vars(
                    silhouette_avgs, ch_scores, dbi_scores, reasonable_silhoutte_scores_mt50, dbs_k_ls, dbs_noisepts_ls, yellowbrick_expected_k)

                for cluster_num in n_expected_clusters:
                    if alpha: save_dir = fr"{self.alpha_model_dir}/k-{cluster_num}"
                    else: save_dir = fr"{self.models_dir_path}/k-{cluster_num}"
                    
                    if cluster_num == ch_max: save_dir += '_CHhighestPeak'
                    if cluster_num == dbi_min: save_dir += '_lowestDBItrough'
                    if cluster_num in sil_peaks: save_dir += '_SilhouetteAVG-peak'
                    if cluster_num == reasonable_sil: save_dir += '_mostReasonable-basedon-Silhouetteplot'
                    if cluster_num in ch_dbi_tally: save_dir += '_CHpeak-and-DBItrough'
                    if cluster_num == yellowbrick_expected_k: save_dir += '_Yellowbrickexpected-K'
                    if cluster_num in dbs_err_dict: save_dir += f'_DBSCANclusterErrorValsExpected-{dbs_err_dict[cluster_num]}'

                    os.makedirs(save_dir, exist_ok=True)
                    print(f'save_dir: {save_dir}')

                self.ch_max_path = utils.to_pickle(f"{prefix}ch_max", ch_max, metric_destination)
                self.dbi_min_path = utils.to_pickle(f"{prefix}dbi_min", dbi_min, metric_destination)
                self.sil_peaks_path = utils.to_pickle(f"{prefix}sil_peaks", sil_peaks, metric_destination)
                self.reasonable_sil_path = utils.to_pickle(f"{prefix}reasonable_sil", reasonable_sil, metric_destination)
                self.ch_dbi_tally_path = utils.to_pickle(f"{prefix}ch_dbi_tally", ch_dbi_tally, metric_destination)
                self.yellowbrick_expected_k_path = utils.to_pickle(f"{prefix}yellowbrick_expected_k", yellowbrick_expected_k, metric_destination)
                self.dbs_err_dict_path = utils.to_pickle(f"{prefix}dbs_err_dict", dbs_err_dict, metric_destination)
                self.n_expected_clusters_path = utils.to_pickle(f"{prefix}n_expected_clusters", n_expected_clusters, metric_destination)

                break

        print(f'{utils.time_now()} - Internal validation of clusters has been run, please view the metrics folder @:\n{metric_destination} to determine the optimal cluster number.\n'\
            f'\nYou can view the separate folders constructed for each discovered cluster combination. See @: \n{model_destination}.')
Example #31
                                                  '/results') else None
    label = ''
    label = label + '-{}-{}'.format(args.model, args.solver)
    label = label + '-friction' if args.friction else label
    label = label + '-seed{}'.format(args.seed)
    name = args.name
    result_path = '{}/results/dg-{}{}'.format(args.save_dir, name, label)
    path_tar = '{}.tar'.format(result_path)
    path_pkl = '{}.pkl'.format(result_path)
    path_txt = '{}.txt'.format(result_path)

    if os.path.exists(path_txt):
        if args.noretry:
            exit()
        else:
            os.remove(path_txt)

    model, stats = train(args)
    torch.save(model.state_dict(), path_tar)
    to_pickle(stats, path_pkl)
    with open(path_txt, 'w') as of:
        print(
            '#final_train_loss\tfinal_test_loss\tenergy_mse_mean\tstate_mse_mean',
            file=of)
        print(stats['final_train_loss'],
              stats['final_test_loss'],
              stats['energy_mse_mean'],
              stats['state_mse_mean'],
              sep='\t',
              file=of)