예제 #1
0
# Load every variable of the data file into memory as arrays.
# NOTE(review): the handle `f` is never closed in the visible code; confirm
# that nothing later still reads from it before wrapping this in `with`.
f = h5py.File(data_path + filename, 'r')
N2, CT, SA, eps, z = (f[key][:] for key in ('N2', 'CT', 'SA', 'eps', 'z'))
Np = eps.shape[0]  # size of the leading axis of eps

# *** raw data pdfs ***
#raw_pdf_plot( N2, SA, CT, eps )


# *** remove outliers ***
# All five arrays go through the filter together and are rebound to its output.
N2, SA, CT, eps, z = remove_outliers(N2, SA, CT, eps, z)


# *** pdfs and stats ***
#pdf_plot( N2, SA, CT, eps )
#[eps_mu,eps_sig,eps_sk,eps_fl,eps_min,eps_max] = pdf( eps )


# *** 2D contour plots ***
#contour_plots( N2, SA, CT, eps )


# *** 3D plots ***
print(type(CT))  # diagnostic: show the container type of CT
zdata = np.log10(eps)  # eps is plotted on a base-10 log scale
ydata = CT
예제 #2
0
import functions as f

# Load the dataset used by the rest of the script.
dataset_path = './dataset/'
dataset = pd.read_csv(dataset_path + 'not_standardized_dataset_drop.csv')

plot = False  # switch: draw the before/after histograms and correlation plot

if plot:
    # Left panel of a 1x2 grid: target histogram on the raw data.
    plt.subplot(1, 2, 1)
    plt.hist(dataset['median_house_value'], bins='auto')
    plt.title('with outliers')

dataset = f.remove_outliers(dataset)  # from here on, outliers are removed

if plot:
    # Right panel: same histogram after outlier removal
    # (original note: no saturation of data in this one).
    plt.subplot(1, 2, 2)
    plt.hist(dataset['median_house_value'], bins='auto')
    plt.title('without outliers')
    plt.draw()

    f.correlation_plot(dataset)

# ridge regression with nested cross validation
y = dataset['median_house_value'].to_numpy()  # labels vector
X = dataset.drop(columns=['median_house_value']).to_numpy()  # data matrix

# add ones column to X
예제 #3
0
# open the dataset
dataset_path = './dataset/'
# CSV file names evaluated one after another; each is appended to
# dataset_path when it is read in the loop below
datasets = [
    'standardized_dataset.csv', 'not_standardized_dataset.csv',
    'standardized_dataset_drop.csv', 'not_standardized_dataset_drop.csv'
]

# selects the validation branch inside the loop; with 'nested' the
# 'k-folds' branch below is not taken
cv_type = 'nested'

for n, name in enumerate(datasets):
    dataset = pd.read_csv(dataset_path + name)
    scores = []  # list of scores w/ and w/o outliers
    for i in range(2):  # the first loop is with outliers
        if i == 1:  # the second one is without them
            dataset = f.remove_outliers(dataset)  # outliers removal

        # split the frame into predictors (every column except the target)
        # and the target column itself
        X = dataset.drop(
            columns=['median_house_value']).to_numpy()  # data matrix
        y = dataset['median_house_value'].to_numpy()  # labels vector

        # add ones column to X
        # (the column of ones is placed first; presumably the intercept
        # term of the regression -- confirm against the CV helpers)
        ones = np.ones((X.shape[0], 1))
        X = np.hstack((ones, X))

        if cv_type == 'k-folds':
            # k-folds cross validation
            print('K-FOLDS CROSS VALIDATION')
            k = 10  # number of folds

            scores.append(f.k_foldsCV(X, y, k,
예제 #4
0
        weather = join_meta_data(weather, meta_data, chosen_building)

    # include the building ID for joining dataframes later
    # (one copy of chosen_building per row of weather)
    weather['building_id'] = [chosen_building] * weather.shape[0]
    # move the current index back into columns, then index the frame by the
    # (timestamp, building_id) pair
    # NOTE(review): presumably the index being reset is the timestamp -- confirm
    weather = weather.reset_index().set_index(keys = ["timestamp", "building_id"])
    dataframe_list.append(weather)

# stack all frames collected in dataframe_list into a single frame
weather_dataframe = pd.concat(dataframe_list)

print("\nBUILDING TRAINING DATA")
print("Reading dataset...")
data = read_timeseries_data(f"{raw_folder}train.csv")
print("Processing dataset...")

# process outliers of the whole dataset
# (remove_outliers also returns the upper and lower limits, in that order)
data, q_high, q_low = remove_outliers(data, data_retention=0.999)
print(f"Outlier limits: {q_low}, {q_high}")

# start a fresh list for the frames built in the loop below
dataframe_list = []

# building ids are taken to be 0 .. number of rows in meta_data - 1
for chosen_building in range(0, meta_data.shape[0]):
    # progress message every 50 buildings
    if chosen_building % 50 == 0:
        print(f"We're on building #{chosen_building}...")

    # site the current building belongs to (first matching meta_data row)
    chosen_site = meta_data.loc[meta_data.building_id == chosen_building, "site_id"].values[0]
    # removing sites as identified in data exploration
    # Compare by value, not identity: `is` against an int literal is never
    # true for the NumPy scalar that `.values[0]` normally returns, so the
    # previous `chosen_site is 7` test let site 7 through.
    if chosen_site in (7, 9):
        continue

    # work on a copy of this building's rows so `data` itself is not modified
    building = data.loc[data.building_id == chosen_building].copy()
    building = electricity_conversion(building)
예제 #5
0
count = 0  # number of files processed so far; also passed to fn.get_hydro
plot_filenames = []

# Regular files (no sub-directories) found directly inside data_path.
# NOTE(review): listdir returns entries in arbitrary order -- confirm whether
# the files are expected to be visited in time order.
onlyfiles = [entry for entry in listdir(data_path) if isfile(join(data_path, entry))]
Nfiles = len(onlyfiles)  # number of files (time steps)

# NOTE(review): the directory count above is overwritten straight away, so
# the loop always indexes files Noffset .. Noffset+19 and needs at least
# Noffset + 20 files to be present.
Nfiles = 20
Noffset = 10
Nfiles0 = Nfiles

for j in range(Noffset, Noffset + Nfiles):
    my_file = data_path + '/' + onlyfiles[j]
    print('file =', my_file)

    # Read one file, then pass its arrays through the cleaning steps in order.
    N2j, SAj, CTj, epsj, zj = fn.get_hydro(my_file, count)
    N2j, SAj, CTj, epsj, zj = fn.nanrid(N2j, SAj, CTj, epsj, zj)
    N2j, SAj, CTj, epsj, zj = fn.remove_outliers(N2j, SAj, CTj, epsj, zj)
    N2j, SAj, CTj, epsj, zj = fn.throw_points_in_z(N2j, SAj, CTj, epsj, zj, -2500.)

    # Append this file's samples to the running arrays (defined before this
    # section).
    N2 = np.concatenate((N2, N2j), axis=0)
    SA = np.concatenate((SA, SAj), axis=0)
    CT = np.concatenate((CT, CTj), axis=0)
    eps = np.concatenate((eps, epsj), axis=0)
    z = np.concatenate((z, zj), axis=0)

    count += 1
    #plot_filenames = np.append(plot_filenames,my_file)

#fn.pdf_plot( N2, SA, CT, eps, z )
NSAMPLE = np.shape(N2)[0]  # total number of accumulated samples


# =============================================================================
# plot training data
예제 #6
0
    # Every column except 'quality'.
    feature_cols = [
        'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
        'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
        'pH', 'sulphates', 'alcohol',
    ]
    # Full column list: the features above followed by 'quality'.
    columns = feature_cols + ['quality']

    # 2.2. Preparing data
    # Two cleaned variants of the raw data, keyed by how outliers are handled.
    df_wines = {
        "remove_outliers": remove_outliers(data),
        "replace_outliers": change_outliers_by_median(data),
    }

    # Box plots: raw data first, then each cleaned variant.
    _box_plot_inputs = (
        (data, "Box plots by column with raw data"),
        (df_wines["remove_outliers"],
         "Box plots when we remove the outliers"),
        (df_wines["replace_outliers"],
         "Box plots when we replace the outliers by the median"),
    )
    for _frame, _caption in _box_plot_inputs:
        plot_dataframe_columns(_frame, _caption)

    # Scatter matrix for each cleaned variant, in the same order.
    for _variant in ("remove_outliers", "replace_outliers"):
        plot_scatter_matrix(df_wines[_variant])

    # Correlation tables: 1 = outliers removed, 2 = outliers replaced.
    correlation1 = correlation_table(df_wines["remove_outliers"])
    correlation2 = correlation_table(df_wines["replace_outliers"])