Example No. 1
def flatten_and_standardize_dataset(model, dest):

    target_ds_preprocessed = utils.open_pickle(model.target_ds_preprocessed_path)
    target_ds_preprocessed = utils.remove_expver(target_ds_preprocessed)
    
    # reshaping
    reshapestarttime = timer()
    print(f"{utils.time_now()} - Reshaping data now...")
    print(f"\n{utils.time_now()} - reshaping rhum dataarrays now, levels to loop over: {model.rhum_pressure_levels}.")

    reshaped_unnorma_darrays = {}
    reshaped_unnorma_darrays['rhum'], reshaped_unnorma_darrays['uwnd'], reshaped_unnorma_darrays['vwnd'] = {}, {}, {}

    for level in model.rhum_pressure_levels:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['rhum'][level] = np.reshape(
            target_ds_preprocessed.rhum.sel(level=level).values, (model.n_datapoints, model.lat_size*model.lon_size ))

    print(f"\n{utils.time_now()} - reshaping uwnd/vwnd dataarrays now, total levels to loop: {model.uwnd_vwnd_pressure_lvls}.")

    for level in model.uwnd_vwnd_pressure_lvls:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['uwnd'][level] = np.reshape(
            target_ds_preprocessed.uwnd.sel(level=level).values, (model.n_datapoints, model.lat_size*model.lon_size ))
        reshaped_unnorma_darrays['vwnd'][level] = np.reshape(
            target_ds_preprocessed.vwnd.sel(level=level).values, (model.n_datapoints, model.lat_size*model.lon_size ))

    reshapetime = timer() - reshapestarttime
    reshapetime = str(datetime.timedelta(seconds=reshapetime)).split(".")[0]
    print(f'Time taken: {reshapetime}.\n')

    # stacking unstandardized dataarrays
    stackingstarttime = timer()
    print("Stacking unstandardized dataarrays now...")
    stacked_unstandardized_ds = np.hstack([reshaped_unnorma_darrays[var][lvl] for var in reshaped_unnorma_darrays for lvl in reshaped_unnorma_darrays[var]])

    stackingtime = timer() - stackingstarttime
    stackingtime = str(datetime.timedelta(seconds=stackingtime)).split(".")[0]
    print(f'Time taken: {stackingtime}.\n')

    # standardizing the stacked dataarrays
    standardizestarttime = timer()
    print("standardizing stacked dataarrays now...")
    print(f'"stacked_unstandardized_ds.shape" is {stacked_unstandardized_ds.shape}')
    transformer = RobustScaler(quantile_range=(25, 75))
    standardized_stacked_arr = transformer.fit_transform(stacked_unstandardized_ds) # som & kmeans training
    transformer.get_params() # returns only the constructor hyperparameters; the fitted medians/IQRs are in transformer.center_ / transformer.scale_
    standardizetime = timer() - standardizestarttime
    standardizetime = str(datetime.timedelta(seconds=standardizetime)).split(".")[0]
    print(f'That took {standardizetime} to complete.\n')

    standardized_stacked_arr_path = utils.to_pickle('standardized_stacked_arr', standardized_stacked_arr, dest)

    return standardized_stacked_arr_path
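
Note: in the function above, the value returned by transformer.get_params() is discarded. For a scikit-learn RobustScaler, get_params() only reports the constructor hyperparameters; the statistics learned during fit live in the center_ and scale_ attributes. A minimal, self-contained sketch (not part of the project above) illustrating the difference:

import numpy as np
from sklearn.preprocessing import RobustScaler

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0], [100.0, 40.0]])

scaler = RobustScaler(quantile_range=(25, 75))
X_scaled = scaler.fit_transform(X)

# hyperparameters passed to the constructor (what get_params() returns)
print(scaler.get_params())   # e.g. {'quantile_range': (25, 75), 'with_centering': True, ...}

# statistics estimated from the data during fit()
print(scaler.center_)        # per-feature median
print(scaler.scale_)         # per-feature interquartile range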
Example No. 2
class RobustScaler(FeatureTransformAlgorithm):
    r"""Implementation of the robust scaler.
    
    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT
    
    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler

    See Also:
        * :class:`niaaml.preprocessing.feature_transform.FeatureTransformAlgorithm`
    """
    Name = 'Robust Scaler'

    def __init__(self, **kwargs):
        r"""Initialize RobustScaler.
        """
        self._params = dict(with_centering=ParameterDefinition([True, False]),
                            with_scaling=ParameterDefinition([True, False]))
        self.__robust_scaler = RS()

    def fit(self, x, **kwargs):
        r"""Fit implemented transformation algorithm.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to fit transformation algorithm.
        """
        self.__robust_scaler.fit(x)

    def transform(self, x, **kwargs):
        r"""Transforms the given x data.

        Arguments:
            x (pandas.core.frame.DataFrame): Data to transform.

        Returns:
            pandas.core.frame.DataFrame: Transformed data.
        """

        return self.__robust_scaler.transform(x)

    def to_string(self):
        r"""User friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        return FeatureTransformAlgorithm.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(self.__robust_scaler.get_params()))
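
Note: to_string() above reports the wrapped sklearn scaler's get_params() output via _parameters_to_string(). A rough usage sketch of the wrapper, assuming it is importable from niaaml.preprocessing.feature_transform (the exact import path may differ by version):

import pandas as pd
from niaaml.preprocessing.feature_transform import RobustScaler  # assumed import path

df = pd.DataFrame({'a': [1.0, 2.0, 3.0, 100.0], 'b': [10.0, 20.0, 30.0, 40.0]})

scaler = RobustScaler()
scaler.fit(df)                      # fits the wrapped sklearn RobustScaler
transformed = scaler.transform(df)  # applies the fitted scaling
print(scaler.to_string())           # includes the wrapped scaler's parameters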
Example No. 3
def scale_data(trainX, testX):
    """
        Scale data 2D
         :param trainX: (array)
        :param testX: (array)
        :return:
                trainX: (array)
                testX: (array)
        """
    # remove overlap
    cut = int(trainX.shape[1] / 2)
    longX = trainX[:, -cut:, :]
    # flatten windows
    longX = longX.reshape((longX.shape[0] * longX.shape[1], longX.shape[2]))
    # flatten train and test
    flatTrainX = trainX.reshape(
        (trainX.shape[0] * trainX.shape[1], trainX.shape[2]))
    flatTestX = testX.reshape(
        (testX.shape[0] * testX.shape[1], testX.shape[2]))
    # standardize
    s = RobustScaler()
    # fit on training data
    s.fit(longX)
    # print("MEAN:")
    # print(s.mean_)
    # print("------------------------------------------")
    # print("VAR:")
    # print(s.var_)
    # print("------------------------------------------")
    # print("STD:")
    # print(s.scale_)

    print(s.get_params(True))  # constructor hyperparameters, not the fitted statistics
    # apply to training and test data
    longX = s.transform(longX)
    flatTrainX = s.transform(flatTrainX)
    flatTestX = s.transform(flatTestX)
    # reshape
    flatTrainX = flatTrainX.reshape(trainX.shape)
    flatTestX = flatTestX.reshape(testX.shape)
    return flatTrainX, flatTestX
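
The pattern above (fit the scaler on a 2D view of the training windows, then scale the flattened train/test arrays and restore their original 3D shapes) reduces to a short sketch; the shapes are illustrative, and for simplicity this sketch fits on all training rows rather than only the non-overlapping half of each window:

import numpy as np
from sklearn.preprocessing import RobustScaler

# illustrative windowed data: (samples, timesteps, features)
trainX = np.random.rand(100, 128, 9)
testX = np.random.rand(40, 128, 9)

scaler = RobustScaler()
scaler.fit(trainX.reshape(-1, trainX.shape[2]))  # fit on a 2D view of the training data

# transform flattened arrays, then restore the original window shape
trainX_scaled = scaler.transform(trainX.reshape(-1, trainX.shape[2])).reshape(trainX.shape)
testX_scaled = scaler.transform(testX.reshape(-1, testX.shape[2])).reshape(testX.shape)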
Example No. 4
def classify(datapath, v, normalize=True):
	# datapath: directory name of the datasets; (v)erbose: True or False;
	# normalize=True scales the training data
	# Grab both wine datasets in one dataset
	concat_data = get_data(datapath)
	# Bin the quality scores into 5 classes
	recode = {3: 0, 4: 0, 5: 1, 6: 2, 7: 3, 8: 4, 9: 4}
	concat_data['quality_c'] = bag_data(recode,concat_data,'quality')

	# Split up dataset 70/30 training,testing
	y_wine = concat_data['quality_c']
	X_wine = concat_data.drop(['quality_c','quality'], axis=1)
	X_train, X_test, y_train, y_test = train_test_split(X_wine, y_wine, test_size=0.3, random_state=420)

	X_train_c , X_test_c = X_train.copy(), X_test.copy() #save test and train X sets for classification

	if normalize:
		# Scale training examples by removing the median and dividing by the
		# interquartile range (more robust to outliers than mean/unit-variance scaling)
		sclr = RobustScaler()
		X_train = sclr.fit_transform(X_train)
		# get_params() only returns constructor hyperparameters, so the set_params()
		# round-trip below is a no-op; the test set is scaled with the statistics
		# already fitted on the training set above
		scl_params = sclr.get_params()
		sclr = sclr.set_params(**scl_params)
		X_test = sclr.transform(X_test)

	# Set parameters by cross validation
	#==========================================================================================
	# REGRESSION PROBLEM
	#==========================================================================================
	# Multivariate Linear Regression
	# note: the `normalize` argument was deprecated and later removed from LinearRegression (scikit-learn 1.2)
	clf = LinearRegression(fit_intercept=True, normalize=True, copy_X=True)
	clf.fit(X_train, y_train)
	# Make Predictions for both sets
	pred_train = clf.predict(X_train)
	pred_test = clf.predict(X_test)
	print('='*100+"\nLinear Regression:\n")
	print_metrics(clf,X_train,y_train,X_test,y_test,pred_train,pred_test,verbose=v,problem_type=1)
	#==========================================================================================
	# Support Vector Machine(kernel=rbf), Regression
	clf = svm.SVR(C=3,kernel='rbf')
	clf.fit(X_train, y_train)
	# Make Predictions for both sets
	pred_train = clf.predict(X_train)
	pred_test = clf.predict(X_test)
	print('='*100+"\nSVR :\n")
	print_metrics(clf,X_train,y_train,X_test,y_test,pred_train,pred_test,verbose=v,problem_type=1)
	#==========================================================================================
	# NN Regression, default params
	# Grid Search
	h_max = 2 #specify maximum number of hidden layers
	hidden_layer_sizes = build_grid(h_max)
	tuned_param = {'hidden_layer_sizes': hidden_layer_sizes}
	clf = GridSearchCV(neural_network.MLPRegressor(),tuned_param,cv=3) 
	clf.fit(X_train,y_train)
	# Make Predictions for both sets
	pred_train = clf.predict(X_train)
	pred_test = clf.predict(X_test)
	print('='*100+"\nNNs :\n")
	print_metrics(clf,X_train,y_train,X_test,y_test,pred_train,pred_test,verbose=v,problem_type=1)
	print("Best params:", clf.best_params_)
	#==========================================================================================
	# CLASSIFICATION PROBLEM
	#==========================================================================================
	# Restore normalized examples back to original
	X_train, X_test = X_train_c, X_test_c
	# Support Vector Machine(Kernel=rbf), Classification
	clf = svm.SVC(C=3,kernel='rbf',random_state=0)
	clf.fit(X_train, y_train)
	# Make Predictions for both sets
	pred_train = clf.predict(X_train)
	pred_test = clf.predict(X_test)
	print('='*100+"\nSVC :\n")
	print_metrics(clf,X_train,y_train,X_test,y_test,pred_train,pred_test,verbose=v)
	#==========================================================================================
	# Support Vector Machine(Kernel=rbf), One vs Rest Classification
	clf = OneVsRestClassifier(estimator=svm.SVC(C=3,kernel='rbf', random_state=1))
	clf.fit(X_train, y_train)
	# Make Predictions for both sets
	pred_train = clf.predict(X_train)
	pred_test = clf.predict(X_test)
	print('='*100+"\nSVC(OneVsRest):\n")
	print_metrics(clf,X_train,y_train,X_test,y_test,pred_train,pred_test,verbose=v)
	#==========================================================================================
	# NN Classification
	# Grid Search
	h_max = 2 #specify maximum number of hidden layers
	hidden_layer_sizes = build_grid(h_max)
	tuned_param = {'hidden_layer_sizes': hidden_layer_sizes}
	clf = GridSearchCV(neural_network.MLPClassifier(),tuned_param,cv=3)
	clf.fit(X_train,y_train)
	# Make Predictions for both sets
	pred_train = clf.predict(X_train)
	pred_test = clf.predict(X_test)
	print('='*100+"\nNNs :\n")
	print_metrics(clf,X_train,y_train,X_test,y_test,pred_train,pred_test,verbose=v)
	print("Best params:", clf.best_params_)
Example No. 5
    # Can't use pipelines, unfortunately
    '''
    pipeline = Pipeline([
        ('scaler', RobustScaler()),
        ('NN', baseline_model()),
    ])
    '''

    if not load_weights:

        epochs = 50  # 50 -> 200
        batch_size = 5120
        validation_split = 0.2

        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        params = scaler.get_params()  # hyperparameters only; the fitted scaler object itself is persisted by store_model below

        model = baseline_model(input_dim, out_dim)

        history = model.fit(
            X_train_scaled,
            y_train,
            epochs = epochs,
            batch_size = batch_size,
            verbose = 0,
            class_weight = class_weight,
            sample_weight = df_train['weight'].values,
        )

        store_model(model, scaler, version=version)
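
Note that params = scaler.get_params() above only captures the constructor hyperparameters; what actually needs to be persisted for inference is the fitted scaler object, so the same median/IQR statistics are reapplied to new data. store_model is not shown here, but a minimal sketch of that idea using joblib might look like:

import joblib
import numpy as np
from sklearn.preprocessing import RobustScaler

X_train = np.random.rand(1000, 30)      # stand-in for the real training features

scaler = RobustScaler().fit(X_train)
joblib.dump(scaler, 'scaler.joblib')    # persists center_/scale_ together with the object

# later, at inference time
scaler = joblib.load('scaler.joblib')
X_new_scaled = scaler.transform(np.random.rand(5, 30))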
Example No. 6
df_x.head()

# Load y data
if on_colab:
    data_dir = 'SAATCHI_MICRO_DATASET_PRICE_VIEWSLIKES.tsv'
else:
    data_dir = r'F:\temp\thesisdata\SAATCHI_MICRO_DATASET_PRICE_VIEWSLIKES.tsv'

df_y = pd.read_csv(data_dir, sep='\t')
df_y.set_index('FILENAME', inplace=True)

# scale y data
scaler_price = RobustScaler().fit(df_y[['PRICE']].values)
scaler_rating = RobustScaler().fit(df_y[['LIKES_VIEWS_RATIO']].values)

# get_params() reports the scalers' hyperparameters (not the fitted median/IQR, which live in center_/scale_)
scaler_params_price = scaler_price.get_params(deep=True)
scaler_params_rating = scaler_rating.get_params(deep=True)

scaled_price = scaler_price.transform(df_y[['PRICE']].values)
scaled_rating = scaler_rating.transform(df_y[['LIKES_VIEWS_RATIO']].values)
df_y['PRICE'] = scaled_price
df_y['LIKES_VIEWS_RATIO'] = scaled_rating
df_y.head()

# Join x and y into a single dataframe
df = df_y.join(df_x)
df.head()


class SaatchiDataset(Dataset):
    training_set = df[:13000]
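
Because PRICE and LIKES_VIEWS_RATIO are scaled above, model predictions come out in the scaled space; the fitted scalers' inverse_transform maps them back to the original units. A short sketch, assuming preds holds scaled price predictions:

import numpy as np

preds = np.array([-0.2, 0.0, 1.5])  # illustrative model output in the scaled space

# inverse_transform expects 2D input and undoes the median/IQR scaling
prices = scaler_price.inverse_transform(preds.reshape(-1, 1)).ravel()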
Example No. 7
def prep_for_testing_random_dates(model):
    indp_vars_raw_data_paths = utils.find('*nc', model.test_indp_vars_raw_data_dir)
    RF_raw_data_paths = utils.find('*nc4', model.test_RF_raw_data_dir)
    CHOSEN_VARS_ds = [rf"{path}" for var in model.CHOSEN_VARS for path in indp_vars_raw_data_paths if f"{var}" in path ]
    ds_CHOSEN_VARS_renamed = xr.open_mfdataset(CHOSEN_VARS_ds, chunks={'time':4}).rename({
            'latitude':'lat', 'longitude':'lon', 'r':'rhum', 'u':'uwnd', 'v':'vwnd'
        })
    ds_CHOSEN_VARS_renamed = utils.remove_expver(ds_CHOSEN_VARS_renamed)
    ds_sliced = ds_CHOSEN_VARS_renamed.sel(
            level=slice(np.min(model.unique_pressure_lvls),np.max(model.unique_pressure_lvls)), 
            lat=slice(model.LAT_N,model.LAT_S), lon=slice(model.LON_W,model.LON_E))

    ds_sliced_rhum = ds_sliced.rhum
    ds_sliced_rhum_no925 = ds_sliced_rhum.drop_sel({"level":925})
    ds_sliced_uwnd_only = ds_sliced.uwnd
    ds_sliced_vwnd_only = ds_sliced.vwnd
    ds_combined_sliced = xr.merge([ds_sliced_rhum_no925, ds_sliced_uwnd_only, ds_sliced_vwnd_only], compat='override')
    ds_RAINFALL = xr.open_mfdataset(RF_raw_data_paths).sel(
        lat=slice(model.LAT_S, model.LAT_N), lon=slice(model.LON_W,model.LON_E))
    ds_RAINFALL['time'] = ds_RAINFALL.indexes['time'].to_datetimeindex()
    valid_datetimes = [i for i in ds_combined_sliced.time.data if i in ds_RAINFALL.time.data]

    target_ds_preprocessed = ds_combined_sliced.sel(time=valid_datetimes)
    desired_res = .75
    coarsen_magnitude = int(desired_res/np.ediff1d(target_ds_preprocessed.isel(lon=slice(0,2)).lon.data)[0])
    print(f'Coarsen magnitude set at: {coarsen_magnitude} toward desired spatial resolution of {desired_res}')
    target_ds_preprocessed = target_ds_preprocessed.coarsen(lat=coarsen_magnitude, lon=coarsen_magnitude, boundary='trim').mean()
    rf_ds_preprocessed = ds_RAINFALL.sel(time=valid_datetimes)
    utils.to_pickle('target_ds_preprocessed', target_ds_preprocessed, model.test_prepared_data_dir)
    utils.to_pickle('rf_ds_preprocessed', rf_ds_preprocessed, model.test_prepared_data_dir)

    reshaped_unnorma_darrays = {}
    reshaped_unnorma_darrays['rhum'], reshaped_unnorma_darrays['uwnd'], reshaped_unnorma_darrays['vwnd'] = {}, {}, {}

    n_datapoints, lat_size, lon_size = target_ds_preprocessed.time.size, target_ds_preprocessed.lat.size, target_ds_preprocessed.lon.size
    
    for level in model.rhum_pressure_levels:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['rhum'][level] = np.reshape(
            target_ds_preprocessed.rhum.sel(level=level).values, (n_datapoints, lat_size*lon_size ))

    print(f"\n{utils.time_now()} - reshaping uwnd/vwnd dataarrays now, total levels to loop: {model.uwnd_vwnd_pressure_lvls}.")

    for level in model.uwnd_vwnd_pressure_lvls:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['uwnd'][level] = np.reshape(
            target_ds_preprocessed.uwnd.sel(level=level).values, (n_datapoints, lat_size*lon_size ))
        reshaped_unnorma_darrays['vwnd'][level] = np.reshape(
            target_ds_preprocessed.vwnd.sel(level=level).values, (n_datapoints, lat_size*lon_size ))

    # stacking unstandardized dataarrays
    print("Stacking unstandardized dataarrays now...")
    stacked_unstandardized_ds = np.hstack([reshaped_unnorma_darrays[var][lvl] for var in reshaped_unnorma_darrays for lvl in reshaped_unnorma_darrays[var]])

    # standardizing the stacked dataarrays
    print("standardizing stacked dataarrays now...")
    print(f'"stacked_unstandardized_ds.shape" is {stacked_unstandardized_ds.shape}')
    transformer = RobustScaler(quantile_range=(25, 75))
    standardized_stacked_arr = transformer.fit_transform(stacked_unstandardized_ds) # som & kmeans training
    transformer.get_params() # returns only the constructor hyperparameters; the fitted medians/IQRs are in transformer.center_ / transformer.scale_

    utils.to_pickle('standardized_stacked_arr', standardized_stacked_arr, model.test_prepared_data_dir)