def var_corr_significant(path):
    """Report, per correlation-matrix CSV under *path*, every variable pair
    whose absolute correlation exceeds 0.5.

    Args:
        path: directory of correlation CSVs (trailing '/'); each file is a
            square correlation matrix with labels in column 0.

    Side effects:
        Writes a text report to result/corr_log_<dirname>_0.5.txt.
    """
    filelist = preprocess.get_filelist(path)
    filelist = np.sort(filelist)
    corr_criterion = 0.5
    # Context manager guarantees the log file is flushed and closed
    # (the original leaked the handle).
    with open(
            "result/corr_log_{0}_{1}.txt".format(
                path.split('/')[-2], corr_criterion), "w") as log_file:
        print(
            "********************** file correlation large than {} ***************"
            .format(corr_criterion),
            file=log_file)
        for file in filelist:
            print(
                "\n\n*************************** file: {} **********************".
                format(file),
                file=log_file)
            data = pd.read_csv(path + file, index_col=0)
            names = data.columns
            # Skip the first column, as in the original report layout.
            for col_name in names[1:]:
                series = data[col_name]
                # Renamed from 'filter' to avoid shadowing the builtin.
                mask = series.abs() > corr_criterion
                print("--------------- {} ------------------".format(col_name),
                      file=log_file)
                # Correlation matrices are square, so boolean-indexing the
                # column labels with a row mask lines up correctly.
                for index, value in zip(names[mask], series[mask]):
                    print("{0}\t\t{1}".format(index, value), file=log_file)
def select_spercific():
    """For each ZX_WD_i_j sensor, log its correlation with the other sensors
    in the same row (ZX_WD_i_*) and same column (ZX_WD_*_j).

    Reads correlation CSVs from result/corr_interpolation_10s_int/ and
    writes the report to result/corr_zw.txt.
    """
    path = "result/corr_interpolation_10s_int/"
    filelist = preprocess.get_filelist(path)
    filelist = np.sort(filelist)
    # Context manager closes the report file (the original leaked it).
    with open("result/corr_zw.txt", "w") as file_log:
        for file in filelist:
            print(
                "**************** result of file: {} **************".format(file),
                file=file_log)
            data = pd.read_csv(path + file, index_col=0)
            for i in np.arange(1, 7):
                for j in np.arange(1, 7):
                    name = 'ZX_WD_{0}_{1}'.format(i, j)
                    print("-------- {} --------------".format(name),
                          file=file_log)
                    # Same-row neighbours: ZX_WD_i_1 .. ZX_WD_i_6.
                    names_corr = [
                        'ZX_WD_{0}_{1}'.format(i, k) for k in np.arange(1, 7)
                    ]
                    # Same-column neighbours, excluding the sensor itself.
                    names_corr.extend('ZX_WD_{0}_{1}'.format(k, j)
                                      for k in np.arange(1, 7) if k != i)
                    for index, value in zip(names_corr,
                                            data.loc[names_corr, name]):
                        print("{0}\t\t{1}".format(index, value), file=file_log)
            print(
                "***********************************************************************\n\n",
                file=file_log)
def cal_correlation(path_in, path_out):
    """Compute the pairwise correlation matrix of every CSV in *path_in*
    and write it under the same file name into *path_out*.

    Args:
        path_in: source directory (trailing '/'); CSVs indexed by column 0.
        path_out: destination directory (trailing '/').
    """
    for fname in preprocess.get_filelist(path_in):
        print(fname)
        frame = pd.read_csv(path_in + fname, index_col=0)
        frame.corr().to_csv(path_out + fname)
def pca_dataset_plot(path):
    """Plot every PCA dataset CSV under *path*: the 'other' variables as a
    subplot grid, then each named sensor group as one line chart.

    Args:
        path: directory of PCA-transformed CSVs (trailing '/').
    """

    def pd_plot(frame):
        # Fixed: plot the argument instead of the enclosing-scope variable
        # 'data_plot' (the parameter was silently ignored before).
        ax = frame.plot(figsize=(13, 2))
        ax.legend(loc='best', prop={'size': 6})

    filelist = preprocess.get_filelist(path)
    for file in filelist:
        print("---------------{}----------".format(file))
        data = pd.read_csv(path + file)
        # Other variables: scatter-style subplot grid, two columns.
        data_plot = data[static_variable.name_pca_other]
        # np.int was removed in NumPy 1.24 — use the builtin int.
        axs = data_plot.plot(subplots=True,
                             layout=(int(data_plot.shape[1] / 2), 2),
                             figsize=(15, 8),
                             style='o')
        # HW1 / HW2 sensor groups.
        pd_plot(data[static_variable.name_HW1])
        pd_plot(data[static_variable.name_HW2])
        # ZW groups 1-6 (previously six copy-pasted calls).
        for i in range(1, 7):
            pd_plot(data[getattr(static_variable,
                                 'name_pca_ZWNo1_{}'.format(i))])
def cal_rolling_path(path_in, path_out, func, win):
    """Run cal_rolling_stats over every file in *path_in*, writing each
    result to the same file name under *path_out*.

    Args:
        path_in: source directory (trailing '/').
        path_out: destination directory (trailing '/').
        func: rolling-statistics function forwarded to cal_rolling_stats.
        win: rolling window size forwarded to cal_rolling_stats.
    """
    for fname in preprocess.get_filelist(path_in):
        print("caculate for file {}".format(fname))
        cal_rolling_stats(file_in=path_in + fname,
                          file_out=path_out + fname,
                          func=func,
                          win=win)
def pca_transformed_main():
    """PCA-transform the interpolated datasets and save them under data/pca1/.

    For each input CSV: copies the 'other' variables through unchanged, then
    PCA-transforms three families of 6-column sensor groups (ZX_HW1/2,
    ZX_WD rows as 'No1_', ZX_WD columns as 'No2_'), logging each group's
    explained-variance ratio to data/pca1/log.txt.
    """
    path_in = "data/interpolation_10s_int/"
    path_out = "data/pca1/"

    def _apply_pca(data, data_new, names, prefix, log):
        # PCA-transform the named columns and store each component under
        # prefix+name in data_new, logging the explained-variance ratio.
        ratio, value_transformed = pca_transformed(data[names])
        print(ratio, file=log)
        for k in np.arange(len(names)):
            data_new[prefix + names[k]] = value_transformed[:, k]

    # Context manager closes the log file (the original leaked the handle).
    with open(path_out + "log.txt", "w") as file_log:
        filelist = preprocess.get_filelist(path_in)
        for file in filelist:
            print("------------- start to process {} ------------".format(file),
                  file=file_log)
            data = pd.read_csv(path_in + file)
            data_new = pd.DataFrame()
            # Pass non-sensor variables through untouched.
            for name in name_Others:
                data_new[name] = data[name]
            # ZX_HW1 and ZX_HW2 groups.
            print("ZX_HW:", file=file_log)
            for i in [1, 2]:
                names = ["ZX_HW{0}_{1}".format(i, j) for j in np.arange(1, 7)]
                _apply_pca(data, data_new, names, "", file_log)
            # ZX_WD by row: ZX_WD_i_* -> prefix No1_.
            print("ZX_WD_No1: ", file=file_log)
            for i in np.arange(1, 7):
                names = ["ZX_WD_{0}_{1}".format(i, j) for j in np.arange(1, 7)]
                _apply_pca(data, data_new, names, "No1_", file_log)
            # ZX_WD by column: ZX_WD_*_i -> prefix No2_.
            print("ZX_WD_No2: ", file=file_log)
            for i in np.arange(1, 7):
                names = ["ZX_WD_{0}_{1}".format(j, i) for j in np.arange(1, 7)]
                _apply_pca(data, data_new, names, "No2_", file_log)
            data_new.to_csv(path_out + file, index=False)
            print("---------------- end ----------------------------",
                  file=file_log)
def plot_ZH_HW_box_all(path):
    """Draw one box-plot figure per CSV under *path*, covering the twelve
    ZX_HW sensor columns.

    Args:
        path: directory of data CSVs (trailing '/').
    """
    filelist = preprocess.get_filelist(path)
    name = [
        'ZX_HW1_1', 'ZX_HW1_2', 'ZX_HW1_3', 'ZX_HW1_4', 'ZX_HW1_5',
        'ZX_HW1_6', 'ZX_HW2_1', 'ZX_HW2_2', 'ZX_HW2_3', 'ZX_HW2_4',
        'ZX_HW2_5', 'ZX_HW2_6'
    ]
    for file in filelist:
        plt.figure(figsize=(13, 2))
        file_path = path + file
        data = pd.read_csv(file_path)
        # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is
        # the supported equivalent.
        plt.boxplot(data[name].to_numpy())
        plt.xticks(np.arange(1, 1 + len(name)), name)
        plt.title(file)
def plot_ZH_HW_stats_all(path, func):
    """Plot func(column) across the twelve ZX_HW channels, one line per
    CSV file under *path*, all on a single shared figure.

    Args:
        path: directory of data CSVs (trailing '/').
        func: aggregation applied column-wise (e.g. np.std).
    """
    plt.figure(figsize=(15, 5))
    channels = [
        'ZX_HW{0}_{1}'.format(sensor, ch)
        for sensor in (1, 2) for ch in range(1, 7)
    ]
    for fname in np.sort(preprocess.get_filelist(path)):
        stats = pd.read_csv(path + fname)[channels].apply(func)
        plt.plot(stats.values, 'o-', label=fname)
        plt.xticks(np.arange(stats.count()), stats.index)
    plt.legend(loc='best', prop={'size': 8})
    plt.title("std of ZH_HW")
# Dataset sub-directory names.
train_data_dir = 'train'
# validation directory path
val_data_dir = 'val'

# Dirs: <data_path>/<split>/{clean,noisy}
train_data_clean_dir = os.path.join(data_path, train_data_dir, 'clean')
train_data_noisy_dir = os.path.join(data_path, train_data_dir, 'noisy')
val_data_clean_dir = os.path.join(data_path, val_data_dir, 'clean')
val_data_noisy_dir = os.path.join(data_path, val_data_dir, 'noisy')

# Preprocessing data: each sample is (converted file, label),
# where label 0 = clean and 1 = noisy.
print('Start preprocessing data (may take a few minutes)...')
train_data_clean = [(preprocess.load_and_convert(f), 0)
                    for f in preprocess.get_filelist(train_data_clean_dir)]
train_data_noisy = [(preprocess.load_and_convert(f), 1)
                    for f in preprocess.get_filelist(train_data_noisy_dir)]
val_data_clean = [(preprocess.load_and_convert(f), 0)
                  for f in preprocess.get_filelist(val_data_clean_dir)]
val_data_noisy = [(preprocess.load_and_convert(f), 1)
                  for f in preprocess.get_filelist(val_data_noisy_dir)]
print(
    f'Train data: clean - {len(train_data_clean)}, noisy - {len(train_data_noisy)}\n'
    f'Val data: clean - {len(val_data_clean)} , noisy - {len(val_data_noisy)}')
# Make train and val datasets