def main(): # datasets-related info task_path_list = glob.glob(os.path.join(datasets_path, 'raw/*')) task_name_list = [task_path.split('/')[-1] for task_path in task_path_list] # load raw datasets datasets_raw = [] for task_path in task_path_list: task_csv_path = os.path.join(task_path, 'csv') print('Loading data from: ' + task_csv_path) demo_path_list = glob.glob(os.path.join( task_csv_path, '201*')) # the prefix of dataset file demo_temp = [] for demo_path in demo_path_list: data_csv = pd.read_csv( os.path.join(demo_path, 'multiModal_states.csv')) # the file name of csv demo_temp.append({ 'stamp': (data_csv.values[:, 2].astype(int) - data_csv.values[0, 2]) * 1e-9, 'left_hand': np.hstack([ data_csv.values[:, 207:210].astype( float), # human left hand position data_csv.values[:, 7:15].astype(float), # emg ]), 'left_joints': data_csv.values[:, 317:324].astype(float) # robot ee actually }) datasets_raw.append(demo_temp) # filter the datasets: gaussian_filter1d datasets_filtered = [] for task_idx, task_data in enumerate(datasets_raw): print('Filtering data of task: ' + task_name_list[task_idx]) demo_norm_temp = [] for demo_data in task_data: time_stamp = demo_data['stamp'] # filter the datasets left_hand_filtered = gaussian_filter1d(demo_data['left_hand'].T, sigma=sigma).T left_joints_filtered = gaussian_filter1d( demo_data['left_joints'].T, sigma=sigma).T # append them to list demo_norm_temp.append({ 'alpha': time_stamp[-1], 'left_hand': left_hand_filtered, 'left_joints': left_joints_filtered }) datasets_filtered.append(demo_norm_temp) # resample the datasets datasets_norm = [] for task_idx, task_data in enumerate(datasets_raw): print('Resampling data of task: ' + task_name_list[task_idx]) demo_norm_temp = [] for demo_data in task_data: time_stamp = demo_data['stamp'] grid = np.linspace(0, time_stamp[-1], len_norm) # filter the datasets left_hand_filtered = gaussian_filter1d(demo_data['left_hand'].T, sigma=sigma).T left_joints_filtered = gaussian_filter1d( demo_data['left_joints'].T, sigma=sigma).T # normalize the datasets left_hand_norm = griddata(time_stamp, left_hand_filtered, grid, method='linear') left_joints_norm = griddata(time_stamp, left_joints_filtered, grid, method='linear') # append them to list demo_norm_temp.append({ 'alpha': time_stamp[-1], 'left_hand': left_hand_norm, 'left_joints': left_joints_norm }) datasets_norm.append(demo_norm_temp) # preprocessing for the norm data datasets4train = [] for task_idx, demo_list in enumerate(data_index): data = [datasets_norm[task_idx][i] for i in demo_list] datasets4train.append(data) y_full = np.array([]).reshape(0, num_joints) for task_idx, task_data in enumerate(datasets4train): print('Preprocessing data for task: ' + task_name_list[task_idx]) for demo_data in task_data: h = np.hstack([demo_data['left_hand'], demo_data['left_joints']]) y_full = np.vstack([y_full, h]) min_max_scaler = preprocessing.MinMaxScaler() datasets_norm_full = min_max_scaler.fit_transform(y_full) # construct a data structure to train the model datasets_norm_preproc = [] for task_idx in range(len(datasets4train)): datasets_temp = [] for demo_idx in range(num_demo): temp = datasets_norm_full[ (task_idx * num_demo + demo_idx) * len_norm:(task_idx * num_demo + demo_idx) * len_norm + len_norm, :] datasets_temp.append({ 'left_hand': temp[:, 0:11], 'left_joints': temp[:, 11:18], 'alpha': datasets4train[task_idx][demo_idx]['alpha'] }) datasets_norm_preproc.append(datasets_temp) # save all the datasets print('Saving the datasets as pkl ...') joblib.dump(task_name_list, 
os.path.join(datasets_path, 'pkl/task_name_list.pkl')) joblib.dump(datasets_raw, os.path.join(datasets_path, 'pkl/datasets_raw.pkl')) joblib.dump(datasets_filtered, os.path.join(datasets_path, 'pkl/datasets_filtered.pkl')) joblib.dump(datasets_norm, os.path.join(datasets_path, 'pkl/datasets_norm.pkl')) joblib.dump(datasets_norm_preproc, os.path.join(datasets_path, 'pkl/datasets_norm_preproc.pkl')) joblib.dump(min_max_scaler, os.path.join(datasets_path, 'pkl/min_max_scaler.pkl')) # the finished reminder print( 'Loaded, filtered, normalized, preprocessed and saved the datasets successfully!!!' )
                         header=0)  # put each unscaled dataset in a dataframe
array2 = dataframe2.values
array3 = dataframe3.values
# set the x-values for the training set and validation set
X1 = array2[:, 0:5]
X2 = array3[:, 0:5]
# set the y-values for the training set and validation set
Y1 = array2[:, -1]
Y2 = array3[:, -1]
# fit a MinMaxScaler on the training features and reuse it for the
# validation features so both sets share the same scale
scaler = preprocessing.MinMaxScaler().fit(X1)
rescaledX1 = scaler.transform(X1)
rescaledX2 = scaler.transform(X2)
# merge the y values with their respective scaled X values
Z1 = numpy.append(rescaledX1, Y1[:, None], axis=1)
Z2 = numpy.append(rescaledX2, Y2[:, None], axis=1)
# save the scaled datasets to the desired location
numpy.savetxt("C:/Users/Kanverse/Documents/train1_scaled.csv", Z1, delimiter=",")
numpy.savetxt("C:/Users/Kanverse/Documents/validation1_scaled.csv", Z2, delimiter=",")
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

dataset = load_iris()  # load the data
x_data, y_data = dataset.data, dataset.target.reshape(-1, 1)
print(x_data.shape)
print(y_data.shape)

# split into training and test sets
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x_data, y_data, random_state=0, test_size=0.25)

# rescale the features to speed up convergence; fit on the training set
# and apply the same transform to the test set
scaler = preprocessing.MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

model = KNeighborsClassifier()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

# print a score for the model
print(r2_score(y_test, y_predict))
for it in range(n_kernels):
    name = methods_name[it]
    print(name)
    f1 = np.loadtxt('D:/Study/Bioinformatics/补实验/AFP/feature_matrix/' + name_ds +
                    '/' + name + '/train_' + name + '.csv',
                    delimiter=',', skiprows=1)
    f3 = np.loadtxt('D:/Study/Bioinformatics/补实验/AFP/feature_matrix/' + name_ds +
                    '/' + name + '/test_' + name + '.csv',
                    delimiter=',', skiprows=1)
    X_train = get_feature(f1)
    X_test = get_feature(f3)
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    gram_train = metrics.pairwise.rbf_kernel(X_train, X_train, gamma=G_list[it])
    gram_test = metrics.pairwise.rbf_kernel(X_test, X_train, gamma=G_list[it])
    kernel_train_list.append(gram_train)
    kernel_test_list.append(gram_test)

for i in range(n_kernels):
    gram_train += kernel_train_list[i] * weight_v[i]
    gram_test += kernel_test_list[i] * weight_v[i]
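# A minimal sketch (not part of the snippet above) of the weighted combination
# of the per-method RBF Gram matrices: the summed matrices are initialised to
# zero first, so the accumulation does not silently start from the last kernel
# computed inside the loop above. kernel_train_list, kernel_test_list,
# weight_v and n_kernels are assumed to be defined as in that snippet.
import numpy as np

gram_train = np.zeros_like(kernel_train_list[0])
gram_test = np.zeros_like(kernel_test_list[0])
for i in range(n_kernels):
    gram_train += kernel_train_list[i] * weight_v[i]
    gram_test += kernel_test_list[i] * weight_v[i]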
import numpy as np
import time
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from Test3.S_Dbw import S_Dbw

# Experiment 3, part (3), step 1: cluster with KMeans
mobilePath = "../实验数据/移动客户数据表.tsv"
np.set_printoptions(precision=2, suppress=True)

min_max_scaler = preprocessing.MinMaxScaler()
x_feature = min_max_scaler.fit_transform(
    np.genfromtxt(mobilePath, skip_header=1, delimiter='\t')[:, 4:])

# rank features by variance and keep the 20 most variable columns
selector = VarianceThreshold(0)
selector.fit(x_feature)
arr = np.argsort(-selector.variances_)
row_tag = np.genfromtxt(mobilePath, max_rows=1, dtype=str, delimiter='\t',
                        usecols=arr[:20])
x_feature = min_max_scaler.fit_transform(
    np.genfromtxt(mobilePath, skip_header=1, delimiter='\t', usecols=arr[:20]))

time_start = time.time()
clf = KMeans(n_clusters=10)
clf.fit(x_feature)
print('Clustering quality (SSE):', clf.inertia_)
time_end = time.time()
print('Clustering time {:.2f}'.format(time_end - time_start), 's')
class TrainingInstance: scaler = preprocessing.MinMaxScaler() def __init__(self, label, emg, acc, gyr, ori, emgts=None, accts=None, gyrts=None, orits=None): self.m_label = label # raw data self.emg = emg self.acc = acc self.gyr = gyr self.ori = ori # time stamps self.emgts = emgts self.accts = accts self.gyrts = gyrts self.orits = orits self.sr_emg = 200 self.sr_other = 50 # splitted flag self.splitted = False self.consolidated = False self.consolidatedFeatures = False def separateRawData(self): if self.emg is not None: self.emgList = np.array( [np.array(self.emg[:, 0]), np.array(self.emg[:, 1]), np.array(self.emg[:, 2]), np.array(self.emg[:, 3]), np.array(self.emg[:, 4]), np.array(self.emg[:, 5]), np.array(self.emg[:, 6]), np.array(self.emg[:, 7])]) if self.acc is not None: self.accList = np.array( [np.array(self.acc[:, 0]), np.array(self.acc[:, 1]), np.array(self.acc[:, 2])]) if self.gyr is not None: self.gyrList = np.array( [np.array(self.gyr[:, 0]), np.array(self.gyr[:, 1]), np.array(self.gyr[:, 2])]) if self.ori is not None: self.oriList = np.array( [np.array(self.ori[:, 0]), np.array(self.ori[:, 1]), np.array(self.ori[:, 2]), np.array(self.ori[:, 3])]) self.splitted = True # scale data def scaleData(self, scaler): if self.splitted == True: norm_emgs = [] norm_accs = [] norm_gyrs = [] norm_oris = [] for x in self.emgList: x = x.reshape(-1, 1) x = scaler.fit_transform(x) reshaped = x.reshape(x.shape[0]) norm_emgs.append(reshaped) for a, b in zip(self.accList, self.gyrList): a = a.reshape(-1, 1) a = scaler.fit_transform(a) reshaped_a = a.reshape(a.shape[0]) norm_accs.append(reshaped_a) b = b.reshape(-1, 1) b = scaler.fit_transform(b) reshaped_b = b.reshape(a.shape[0]) norm_gyrs.append(reshaped_b) for x in self.oriList: x = x.reshape(-1, 1) x = scaler.fit_transform(x) reshaped = x.reshape(x.shape[0]) norm_oris.append(reshaped) self.emgList = np.array(norm_emgs) self.accList = np.array(norm_accs) self.gyrList = np.array(norm_gyrs) self.oriList = np.array(norm_oris) return self # normalize data to common length def normalizeData(self, max_len_emg, max_len_others): if self.splitted == True: norm_emgs = [] norm_accs = [] norm_gyrs = [] norm_oris = [] for x in self.emgList: if (x.shape[0] == max_len_emg): norm_emgs.append(x) continue if (x.shape[0] < max_len_emg): half = (float(max_len_emg - x.shape[0])) / 2 back = ceil(half) front = floor(half) norm_emgs.append(util.padVector(x, front, back, True)) for a, b in zip(self.accList, self.gyrList): if (a.shape == max_len_others): norm_accs.append(a) norm_gyrs.append(b) continue if (a.shape[0] < max_len_others): half_a = (float(max_len_others - a.shape[0])) / 2 back_a = ceil(half_a) front_a = floor(half_a) half_b = (float(max_len_others - b.shape[0])) / 2 back_b = ceil(half_b) front_b = floor(half_b) norm_accs.append(util.padVector(a, front_a, back_a)) norm_gyrs.append(util.padVector(b, front_b, back_b)) for x in self.oriList: if (x.shape[0] == max_len_others): norm_oris.append(x) continue if (x.shape[0] < max_len_others): half = (float(max_len_others - x.shape[0])) / 2 back = ceil(half) front = floor(half) norm_oris.append(util.padVector(x, front, back)) ''' # Four axes, returned as a 2-d array f, axarr = plt.subplots(2, 2) axarr[0, 0].plot(np.arange(len(self.emgList[0])),self.emgList[0]) axarr[0, 0].set_title('Raw EMG') axarr[0, 1].plot(np.arange(len(norm_emgs[0])),norm_emgs[0]) axarr[0, 1].set_title('Normalized Emg') axarr[1, 0].plot(np.arange(len(self.accList[1])),self.accList[1]) axarr[1, 0].set_title('Raw ACC X') axarr[1, 
1].plot(np.arange(len(norm_accs[1])),norm_accs[1]) axarr[1, 1].set_title('Normalized ACC X') plt.show() ''' self.emgList = np.array(norm_emgs) self.accList = np.array(norm_accs) self.gyrList = np.array(norm_gyrs) self.oriList = np.array(norm_oris) return self def resampleData(self, sr, avg_len, emg=True, imu=True): ''' Method for resampling the all the signals and bringing them to the same sampling rate :param sr: (int): sampling rate :return: self with all resampled data ''' if self.splitted == True: # Calculate the new length of vectors given the new sampling # frequency/rate sample_len_emg = int((sr * self.emgList[0].size) / self.sr_emg) sample_len_emg_others = int( (sr * self.accList[0].size) / self.sr_other) self.sr_emg = sr self.sr_other = sr ''' self.emgList_r = self.emgList self.accList_r = self.accList self.gyrList_r = self.gyrList self.oriList_r = self.oriList ''' # resampling the normalized data self.emgList = np.array( [signal.resample(x, sample_len_emg) for x in self.emgList]) self.accList = np.array( [signal.resample(x, sample_len_emg_others) for x in self.accList]) self.gyrList = np.array( [signal.resample(x, sample_len_emg_others) for x in self.gyrList]) self.oriList = np.array( [signal.resample(x, sample_len_emg_others) for x in self.oriList]) self.consolidateData(avg_len, emg, imu) return self def extractFeatures(self, window=True, scaler=None, rms=False, f_mfcc=False, emg=True, imu=True): ''' This method extracts features from the training instance and consolidates into one meature matrix according to the parameters provided :param window: (Boolean) : To get overlapping windowed features :param scaler: (Scaler Object as in scikit-learn) : Scalar object to scale the features :param rms: (Boolean) : To extract features from the Root Mean Square of the signals in all dimensions :param f_mfcc: (Boolean) : To extract MFCC features :param emg: (Boolean) : To extract features from EMG signals :param imu: (Boolean) : To extract features from IMU signals :return: self ''' # print(self.m_label) if self.splitted == True: # For RMS if rms: all_emg = zip(self.emgList[0], self.emgList[1], self.emgList[2], self.emgList[3], self.emgList[4], self.emgList[5], self.emgList[6], self.emgList[7]) all_acc = zip(self.accList[0], self.accList[1], self.accList[2]) all_gyr = zip(self.gyrList[0], self.gyrList[1], self.gyrList[2]) all_ori = zip(self.oriList[0], self.oriList[1], self.oriList[2], self.oriList[3]) rms_emg = [] rms_acc = [] rms_gyr = [] rms_ori = [] # calculating RMS for all the signals for _0, _1, _2, _3, _4, _5, _6, _7 in all_emg: vec = [_0, _1, _2, _3, _4, _5, _6, _7] rms_val = sqrt(sum(n * n for n in vec) / len(vec)) rms_emg.append(rms_val) for _0, _1, _2 in all_acc: vec = [_0, _1, _2] rms_val = sqrt(sum(n * n for n in vec) / len(vec)) rms_acc.append(rms_val) for _0, _1, _2 in all_gyr: vec = [_0, _1, _2] rms_val = sqrt(sum(n * n for n in vec) / len(vec)) rms_gyr.append(rms_val) for _0, _1, _2, _3 in all_ori: vec = [_0, _1, _2, _3] rms_val = sqrt(sum(n * n for n in vec) / len(vec)) rms_ori.append(rms_val) # Extracting features for all the signals self.emgRmsFeatures = fe.getFeatures(rms_emg, self.sr_emg, window, f_mfcc) self.accRmsFeatures = fe.getFeatures(rms_acc, self.sr_other, window, f_mfcc) self.gyrRmsFeatures = fe.getFeatures(rms_gyr, self.sr_other, window, f_mfcc) self.oriRmsFeatures = fe.getFeatures(rms_ori, self.sr_other, window, f_mfcc) # for extracting features from raw data else: self.emgFeatures = np.array( [fe.getFeatures(x, self.sr_emg, window, f_mfcc) for x in 
self.emgList]) self.accFeatures = np.array( [fe.getFeatures(x, self.sr_other, window, f_mfcc) for x in self.accList]) self.gyrFeatures = np.array( [fe.getFeatures(x, self.sr_other, window, f_mfcc) for x in self.gyrList]) self.oriFeatures = np.array( [fe.getFeatures(x, self.sr_other, window, f_mfcc) for x in self.oriList]) self.consolidateFeatures(scaler, rms, emg, imu) return self def consolidateFeatures(self, scaler=None, rms=False, emg=True, imu=True): ''' Method to consolidate the features of all the sensor data in all dimensions to a single feature matrix :param scaler: (Scaler Object) : A scaler object to scale the features :param rms: (Boolean) : Flag for consolidating RMS features :param emg: (Boolean) : Flas to consider features from EMG signals :param imu: (Boolean) : Flag to consider IMU Signals :return: consolidated_feature_Matrix (ndarray) : with columns as features and rows as overlapping window frames. If window was false then it just has one row. ''' if self.splitted == True: con_emg_feat = None con_acc_feat = None con_gyr_feat = None con_ori_feat = None consolidatedFeatureMatrix = None if rms: if emg: con_emg_feat = self.emgRmsFeatures if imu: con_acc_feat = self.accRmsFeatures con_gyr_feat = self.gyrRmsFeatures con_ori_feat = self.oriRmsFeatures else: if emg: n_emg_rows = self.emgFeatures[0].shape[0] n_emg_columns = self.emgFeatures[0].shape[1] new_n_emg_columns = self.emgFeatures.shape[ 0] * n_emg_columns if imu: n_acc_rows = self.accFeatures[0].shape[0] n_acc_columns = self.accFeatures[0].shape[1] new_n_acc_columns = self.accFeatures.shape[ 0] * n_acc_columns n_gyr_rows = self.gyrFeatures[0].shape[0] n_gyr_columns = self.gyrFeatures[0].shape[1] new_n_gyr_columns = self.gyrFeatures.shape[ 0] * n_gyr_columns n_ori_rows = self.oriFeatures[0].shape[0] n_ori_columns = self.oriFeatures[0].shape[1] new_n_ori_columns = self.oriFeatures.shape[ 0] * n_ori_columns if emg: con_emg_feat = np.reshape(self.emgFeatures, (n_emg_rows, new_n_emg_columns)) if imu: con_acc_feat = np.reshape(self.accFeatures, (n_acc_rows, new_n_acc_columns)) con_gyr_feat = np.reshape(self.gyrFeatures, (n_gyr_rows, new_n_gyr_columns)) con_ori_feat = np.reshape(self.oriFeatures, (n_ori_rows, new_n_ori_columns)) if emg and imu: consolidatedFeatureMatrix = np.concatenate( (con_emg_feat, con_acc_feat), axis=1) consolidatedFeatureMatrix = np.concatenate( (consolidatedFeatureMatrix, con_gyr_feat), axis=1) consolidatedFeatureMatrix = np.concatenate( (consolidatedFeatureMatrix, con_ori_feat), axis=1) elif emg and (not imu): consolidatedFeatureMatrix = con_emg_feat elif (not emg) and imu: consolidatedFeatureMatrix = con_acc_feat consolidatedFeatureMatrix = np.concatenate( (consolidatedFeatureMatrix, con_gyr_feat), axis=1) consolidatedFeatureMatrix = np.concatenate( (consolidatedFeatureMatrix, con_ori_feat), axis=1) else: return None ''' consolidatedFeatureMatrix = np.concatenate((con_emg_feat, con_acc_feat), axis=1) consolidatedFeatureMatrix = np.concatenate(( consolidatedFeatureMatrix, con_gyr_feat), axis=1) consolidatedFeatureMatrix = np.concatenate(( consolidatedFeatureMatrix, con_ori_feat), axis=1) ''' self.consolidatedFeatureMatrix = consolidatedFeatureMatrix self.consolidatedFeatures = True if scaler is not None: consolidatedFeatureMatrix = scaler.fit_transform( consolidatedFeatureMatrix) return consolidatedFeatureMatrix else: return None def consolidateData(self, avg_len, emg, imu): consolidatedDataMatrix = None if self.splitted == True: if emg and imu: emg_r = np.array( [signal.resample(x, avg_len) for x in 
self.emgList_r]) acc_r = np.array( [signal.resample(x, avg_len) for x in self.accList_r]) gyr_r = np.array( [signal.resample(x, avg_len) for x in self.gyrList_r]) ori_r = np.array( [signal.resample(x, avg_len) for x in self.oriList_r]) consolidatedDataMatrix = np.concatenate( (emg_r, acc_r, gyr_r, ori_r), axis=0) elif emg and (not imu): consolidatedDataMatrix = self.emgList elif (not emg) and imu: consolidatedDataMatrix = np.concatenate( (self.accList, self.gyrList, self.oriList), axis=0) else: emg_r = np.array( [signal.resample(x, avg_len) for x in self.emgList_r]) acc_r = np.array( [signal.resample(x, avg_len) for x in self.accList_r]) gyr_r = np.array( [signal.resample(x, avg_len) for x in self.gyrList_r]) ori_r = np.array( [signal.resample(x, avg_len) for x in self.oriList_r]) consolidatedDataMatrix = np.concatenate( (emg_r, acc_r, gyr_r, ori_r), axis=0) self.consolidatedDataMatrix = consolidatedDataMatrix.transpose() self.consolidated = True return consolidatedDataMatrix else: return None def getConsolidatedFeatureMatrix(self): if self.consolidatedFeatures: return self.consolidatedFeatureMatrix def getConsolidatedDataMatrix(self): if self.consolidated: return self.consolidatedDataMatrix def getRawData(self): return self.emg, self.acc, self.gyr, self.ori def getData(self): if self.splitted is True: return self.emg, self.acc, self.gyr, self.ori, self.emgList, \ self.accList, self.gyrList, self.oriList else: return self.emg, self.acc, self.gyr, self.ori def getIndevidualFeatures(self, meanNormalized=False): emg_0_feat = None emg_1_feat = None emg_2_feat = None emg_3_feat = None emg_4_feat = None emg_5_feat = None emg_6_feat = None emg_7_feat = None acc_x_feat = None acc_y_feat = None acc_z_feat = None gyr_x_feat = None gyr_y_feat = None gyr_z_feat = None ori_x_feat = None ori_y_feat = None ori_z_feat = None ori_w_feat = None if self.splitted and self.consolidatedFeatures: if meanNormalized: for i, feat in enumerate(self.emgFeatures): if i is 0: emg_0_feat = self.scaler.fit_transform(feat) emg_0_feat = np.insert(emg_0_feat, len(emg_0_feat[0]), self.m_label) elif i is 1: emg_1_feat = self.scaler.fit_transform(feat) emg_1_feat = np.insert(emg_1_feat, len(emg_1_feat[0]), self.m_label) elif i is 2: emg_2_feat = self.scaler.fit_transform(feat) emg_2_feat = np.insert(emg_2_feat, len(emg_2_feat[0]), self.m_label) elif i is 3: emg_3_feat = self.scaler.fit_transform(feat) emg_3_feat = np.insert(emg_3_feat, len(emg_3_feat[0]), self.m_label) elif i is 4: emg_4_feat = self.scaler.fit_transform(feat) emg_4_feat = np.insert(emg_4_feat, len(emg_4_feat[0]), self.m_label) elif i is 5: emg_5_feat = self.scaler.fit_transform(feat) emg_5_feat = np.insert(emg_5_feat, len(emg_5_feat[0]), self.m_label) elif i is 6: emg_6_feat = self.scaler.fit_transform(feat) emg_6_feat = np.insert(emg_6_feat, len(emg_6_feat[0]), self.m_label) elif i is 7: emg_7_feat = self.scaler.fit_transform(feat) emg_7_feat = np.insert(emg_7_feat, len(emg_7_feat[0]), self.m_label) for i, feat in enumerate(self.accFeatures): if i is 0: acc_x_feat = self.scaler.fit_transform(feat) acc_x_feat = np.insert(acc_x_feat, len(acc_x_feat[0]), self.m_label) elif i is 1: acc_y_feat = self.scaler.fit_transform(feat) acc_y_feat = np.insert(acc_y_feat, len(acc_y_feat[0]), self.m_label) elif i is 2: acc_z_feat = self.scaler.fit_transform(feat) acc_z_feat = np.insert(acc_z_feat, len(acc_z_feat[0]), self.m_label) for i, feat in enumerate(self.gyrFeatures): if i is 0: gyr_x_feat = self.scaler.fit_transform(feat) gyr_x_feat = np.insert(gyr_x_feat, 
len(gyr_x_feat[0]), self.m_label) elif i is 1: gyr_y_feat = self.scaler.fit_transform(feat) gyr_y_feat = np.insert(gyr_y_feat, len(gyr_y_feat[0]), self.m_label) elif i is 2: gyr_z_feat = self.scaler.fit_transform(feat) gyr_z_feat = np.insert(gyr_z_feat, len(gyr_z_feat[0]), self.m_label) for i, feat in enumerate(self.oriFeatures): if i is 0: ori_x_feat = self.scaler.fit_transform(feat) ori_x_feat = np.insert(ori_x_feat, len(ori_x_feat[0]), self.m_label) elif i is 1: ori_y_feat = self.scaler.fit_transform(feat) ori_y_feat = np.insert(ori_y_feat, len(ori_y_feat[0]), self.m_label) elif i is 2: ori_z_feat = self.scaler.fit_transform(feat) ori_z_feat = np.insert(ori_z_feat, len(ori_z_feat[0]), self.m_label) elif i is 3: ori_w_feat = self.scaler.fit_transform(feat) ori_w_feat = np.insert(ori_w_feat, len(ori_w_feat[0]), self.m_label) else: for i, feat in enumerate(self.emgFeatures): if i is 0: emg_0_feat = feat emg_0_feat = np.insert(emg_0_feat, len(emg_0_feat[0]), self.m_label) elif i is 1: emg_1_feat = feat emg_1_feat = np.insert(emg_1_feat, len(emg_1_feat[0]), self.m_label) elif i is 2: emg_2_feat = feat emg_2_feat = np.insert(emg_2_feat, len(emg_2_feat[0]), self.m_label) elif i is 3: emg_3_feat = feat emg_3_feat = np.insert(emg_3_feat, len(emg_3_feat[0]), self.m_label) elif i is 4: emg_4_feat = feat emg_4_feat = np.insert(emg_4_feat, len(emg_4_feat[0]), self.m_label) elif i is 5: emg_5_feat = feat emg_5_feat = np.insert(emg_5_feat, len(emg_5_feat[0]), self.m_label) elif i is 6: emg_6_feat = feat emg_6_feat = np.insert(emg_6_feat, len(emg_6_feat[0]), self.m_label) elif i is 7: emg_7_feat = feat emg_7_feat = np.insert(emg_7_feat, len(emg_7_feat[0]), self.m_label) for i, feat in enumerate(self.accFeatures): if i is 0: acc_x_feat = feat acc_x_feat = np.insert(acc_x_feat, len(acc_x_feat[0]), self.m_label) elif i is 1: acc_y_feat = feat acc_y_feat = np.insert(acc_y_feat, len(acc_y_feat[0]), self.m_label) elif i is 2: acc_z_feat = feat acc_z_feat = np.insert(acc_z_feat, len(acc_z_feat[0]), self.m_label) for i, feat in enumerate(self.gyrFeatures): if i is 0: gyr_x_feat = feat gyr_x_feat = np.insert(gyr_x_feat, len(gyr_x_feat[0]), self.m_label) elif i is 1: gyr_y_feat = feat gyr_y_feat = np.insert(gyr_y_feat, len(gyr_y_feat[0]), self.m_label) elif i is 2: gyr_z_feat = feat gyr_z_feat = np.insert(gyr_z_feat, len(gyr_z_feat[0]), self.m_label) for i, feat in enumerate(self.oriFeatures): if i is 0: ori_x_feat = feat ori_x_feat = np.insert(ori_x_feat, len(ori_x_feat[0]), self.m_label) elif i is 1: ori_y_feat = feat ori_y_feat = np.insert(ori_y_feat, len(ori_y_feat[0]), self.m_label) elif i is 2: ori_z_feat = feat ori_z_feat = np.insert(ori_z_feat, len(ori_z_feat[0]), self.m_label) elif i is 3: ori_w_feat = feat ori_w_feat = np.insert(ori_w_feat, len(ori_w_feat[0]), self.m_label) return emg_0_feat, emg_1_feat, emg_2_feat, emg_3_feat, emg_4_feat, emg_5_feat, emg_6_feat, emg_7_feat, acc_x_feat, acc_y_feat, acc_z_feat, gyr_x_feat, gyr_y_feat, gyr_z_feat, ori_x_feat, ori_y_feat, ori_z_feat, ori_w_feat else: return None
df_test = pd.read_csv(r'test_pub.csv')
df = pd.read_csv(r'train.csv')

df_onehot = pd.get_dummies(df)
keys = df_onehot.keys()
data_keys = [k for k in keys if '?' not in k and k[-3:] != "50K"]
data_train = df_onehot[data_keys]
target_train = df_onehot["Salary_ >50K"]

df_onehot1 = pd.get_dummies(df_test)
# add all-zero columns for keys missing from the test set
for k in data_keys:
    if k not in df_onehot1.keys():
        df_onehot1[k] = 0
data_test = df_onehot1[data_keys]

sc = prep.MinMaxScaler()
data_train_s = sc.fit_transform(data_train)
data_test_s = sc.transform(data_test)

lr = LogisticRegression()
lr.fit(data_train_s, target_train)
# predict the probability of the positive class
pred_test_prob = lr.predict_proba(data_test_s)[:, 1]
df_test["Predicted"] = pred_test_prob
df_test[["ID", "Predicted"]].to_csv("LogisticReg_v0.csv", index=False)
age_mean = df['age'].mean()                # mean age, used for missing values
df['age'] = df['age'].fillna(age_mean)     # fill the null ages
fare_mean = df['fare'].mean()              # mean fare, used for missing values
df['fare'] = df['fare'].fillna(fare_mean)  # fill the null fares
df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
x_OneHot_df = pd.get_dummies(data=df, columns=["embarked"])  # one-hot encode 'embarked'

ndarray = x_OneHot_df.values  # convert the dataframe to an array
Label = ndarray[:, 0]         # all rows, column 0 (the label)
Features = ndarray[:, 1:]     # all rows, columns 1 to the last (the features)

# preprocessing
from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))  # scale features into the range [0, 1]
scaledFeatures = minmax_scale.fit_transform(Features)  # fit and transform the features

msk = numpy.random.rand(len(all_df)) < 0.8  # 80/20 split mask
train_df = all_df[msk]   # train 80%
test_df = all_df[~msk]   # test 20%
'''
def PreprocessData(raw_df):
    df = raw_df.drop(['name'], axis=1)
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    x_OneHot_df = pd.get_dummies(data=df, columns=["embarked"])
def min_max_PandasNorm(df):
    x = df.values  # returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    x_norm = min_max_scaler.fit_transform(x)
    return pd.DataFrame(x_norm)
def get_training_data(X_columns, categorical, floating, integer, pos_data,
                      neg_data, sample_size=0.1, seed=9527):
    # synthesize a training dataset from positive and negative data;
    # the positive dataset is smaller
    pos_clear = pos_data[pos_data['WeatherCond'] == 'Clear']
    pos_other = pos_data[pos_data['WeatherCond'] != 'Clear']
    # 16000 was chosen temporarily because about 16000 crashes are associated with rainy weather,
    # and this is about the same order of magnitude as the other weather conditions
    pos_clear_sub = pos_clear.sample(16000, random_state=seed)
    pos_data = pd.concat([pos_clear_sub, pos_other])
    sample_size = int(min(neg_data.shape[0], pos_data.shape[0]) * sample_size)
    pos_data['Crash'] = [1 for i in range(len(pos_data))]
    neg_data['Crash'] = [0 for i in range(len(neg_data))]
    columns = X_columns + ['Crash']
    data_df_pos = pos_data.sample(sample_size, random_state=seed)
    data_df_neg = neg_data.sample(sample_size, random_state=seed)
    data_df = pd.concat([data_df_pos[columns], data_df_neg[columns]])
    data_df[categorical] = data_df[categorical].astype(str)
    data_df[floating] = data_df[floating].astype('float64')
    data_df[integer] = data_df[integer].astype('int64')
    data_df_catagorical = data_df.select_dtypes(exclude=['float64', 'int64'])
    data_df_numerical = data_df.select_dtypes(include=['float64', 'int64'])

    # one-hot encode the categoricals before training a model
    ohe = preprocessing.OneHotEncoder(sparse=False)
    d = defaultdict(preprocessing.LabelEncoder)
    data_df_labelenc = data_df_catagorical.apply(
        lambda x: d[x.name].fit_transform(x))
    # print data_df_catagorical
    # print data_df_labelenc.values
    x_ohe = ohe.fit_transform(data_df_labelenc.values)
    x_preprocessed = np.concatenate(
        (data_df_numerical.values[:, 0:data_df_numerical.shape[1] - 1], x_ohe),
        axis=1)

    # TODO: don't scale before splitting into training and test sets. Add a parameter
    # that switches the return between (x, y) and (x_train, y_train, x_test, y_test).
    # TODO: changed to a MinMax scaler to avoid distorting coordinate data with a normal distribution
    # sscaler = preprocessing.StandardScaler()
    sscaler = preprocessing.MinMaxScaler()
    sscaler.fit(x_preprocessed[:, 0:data_df_numerical.shape[1] - 1])
    x_preprocessed[:, 0:data_df_numerical.shape[1] - 1] = sscaler.transform(
        x_preprocessed[:, 0:data_df_numerical.shape[1] - 1])
    y_preprocessed = data_df_numerical.values[:, -1]
    #or_x_preprocessed = sscaler.inverse_transform(x_preprocessed[:, 0:data_df_numerical.shape[1] - 1])
    #or_x_preprocessed = pd.DataFrame(or_x_preprocessed)
    #n_c = data_df_numerical.columns.values.tolist()
    #or_x_preprocessed.columns = n_c
    return x_preprocessed, y_preprocessed, ohe, d, sscaler
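# A minimal sketch of the pattern the TODO above describes: split first, then
# fit the MinMaxScaler on the training portion only and reuse it for the test
# portion, so no information from the test set leaks into the scaling.
# x_preprocessed and y_preprocessed are assumed to be the first two values
# returned by get_training_data() above; the split parameters are illustrative.
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

x_tr, x_te, y_tr, y_te = train_test_split(
    x_preprocessed, y_preprocessed, test_size=0.2, random_state=9527)
scaler = preprocessing.MinMaxScaler().fit(x_tr)  # statistics from training data only
x_tr = scaler.transform(x_tr)
x_te = scaler.transform(x_te)                    # same scale applied to the test data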
def read(filename):
    spectrogram = pd.read_csv(filename, sep=',')
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(spectrogram)
    arr2D = x_scaled
    return arr2D
def get_data_for_training_preprocessed(G, X_columns=None, Y_column=None, X_cat=None, X_int=None, X_float=None, multiple_x='newest', multiple_y='newest', verbose=False): samples = [] X_c = X_columns for edge in G.edges_iter(data=True): data = edge[2] if Y_column in data: midpoint = ((edge[0][0] + edge[1][0]) / 2, (edge[0][1] + edge[1][1]) / 2) sample = [midpoint[0], midpoint[1]] for column in X_columns: try: if isinstance(data[column], list): if multiple_x == 'newest': sample.append(data[column][-1]) if multiple_x == 'sum': sample.append(sum(data[column])) if multiple_x == 'average': sample.append( sum(data[column]) / len(data[column])) else: sample.append(data[column]) except: sample.append(None) if isinstance(data[Y_column], list): if multiple_y == 'newest': sample.append(data[Y_column][-1]) if multiple_y == 'sum': sample.append(sum(data[Y_column])) if multiple_y == 'average': sample.append(sum(data[Y_column]) / len(data[Y_column])) else: sample.append(data[Y_column]) samples.append(sample) if verbose: print 'done creating model training data with ' + str( len(samples)) + " samples" data_df = pd.DataFrame(samples) col = ['X', 'Y'] col = col + X_c col.append('attribute') data_df.columns = col #data_df.to_csv('C:/Users/husiy/PyProgram/OPEN DATA NATION/Chicago_Test/test710.csv', index=False) cl = data_df.columns.get_values() # print cl det = [] # dl=xrange(len(cl)) for c in cl: # print sum(pd.notnull(data_df.iloc[:,c])) if sum(pd.notnull(data_df[c])) <= 0.8 * data_df.shape[0]: det.append(c) # dl.append(c-2) # for il in sorted(dl, reverse=True): # del X_c[il] # print det data_df = data_df.drop(det, 1) data_df = data_df.dropna() for dc in det: X_c.remove(dc) # print X_c # print 'dropna', data_df.shape # print data_df.head() # print data_df.dtypes # print data_df.head() if X_cat != None: data_df[X_cat] = data_df[X_cat].astype(str) if X_int != None: data_df[X_int] = data_df[X_int].astype('int64') if X_float != None: data_df[X_float] = data_df[X_float].astype('float64') print data_df.dtypes data_df_catagorical = data_df.select_dtypes(exclude=['float64', 'int64']) data_df_numerical = data_df.select_dtypes(include=['float64', 'int64']) # TODO one hot encode the catagoricals and start training a model if len(data_df_catagorical.columns) != 0: # TODO one hot encode the catagoricals and start training a model ohe = preprocessing.OneHotEncoder(sparse=False) d = defaultdict(preprocessing.LabelEncoder) data_df_labelenc = data_df_catagorical.apply( lambda x: d[x.name].fit_transform(x)) # print data_df_catagorical # print data_df_labelenc.values x_ohe = ohe.fit_transform(data_df_labelenc.values) x_preprocessed = np.concatenate( (data_df_numerical.values[:, 0:data_df_numerical.shape[1] - 1], x_ohe), axis=1) else: x_preprocessed = data_df_numerical.values[:, 0:data_df_numerical. shape[1] - 1] sscaler = preprocessing.MinMaxScaler() sscaler.fit(x_preprocessed[:, 0:data_df_numerical.shape[1] - 1]) x_preprocessed[:, 0:data_df_numerical.shape[1] - 1] = sscaler.transform( x_preprocessed[:, 0:data_df_numerical.shape[1] - 1]) y_preprocessed = data_df_numerical.values[:, -1] or_x_preprocessed = sscaler.inverse_transform( x_preprocessed[:, 0:data_df_numerical.shape[1] - 1]) or_x_preprocessed = pd.DataFrame(or_x_preprocessed) n_c = data_df_numerical.columns.values.tolist() n_c.remove('attribute') or_x_preprocessed.columns = n_c # Saving transformations for later use try: OHE = ohe LabelEncoder = d except: pass return x_preprocessed, y_preprocessed, ohe, d, sscaler
def calibration_main(locator, config): # INITIALIZE TIMER t0 = time.clock() # Local variables building_name = config.single_calibration.building building_load = config.single_calibration.load iteration_pymc3 = config.single_calibration.iterations with open(locator.get_calibration_problem(building_name, building_load), 'r') as input_file: problem = pickle.load(input_file) emulator = joblib.load(locator.get_calibration_gaussian_emulator(building_name, building_load)) distributions = problem['probabiltiy_vars'] variables = problem['variables'] # Create function to call predictions (mu) @as_op(itypes=[tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar], otypes=[tt.dvector]) def predict_y(var1, var2, var3, var4, var5, var6): input_sample = np.array([var1, var2, var3, var4, var5, var6]).reshape(1, -1) prediction = emulator.predict(input_sample) return prediction # Create function to call predictions (sigma) @as_op(itypes=[tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar], otypes=[tt.dvector]) def predict_sigma(var1, var2, var3, var4, var5, var6): input_sample = np.array([var1, var2, var3, var4, var5, var6]).reshape(1, -1) _, sigma = emulator.predict(input_sample, return_std=True) return sigma with pymc3.Model() as basic_model: # DECLARE PRIORS for i, variable in enumerate(variables): arguments = np.array([distributions.loc[variable, 'min'], distributions.loc[variable, 'max'], distributions.loc[variable, 'mu']]).reshape(-1, 1) min_max_scaler = preprocessing.MinMaxScaler(copy=True, feature_range=(0, 1)) arguments_norm = min_max_scaler.fit_transform(arguments) globals()['var' + str(i + 1)] = pymc3.Triangular('var' + str(i + 1), lower=arguments_norm[0][0], upper=arguments_norm[1][0], c=arguments_norm[2][0]) # DECLARE OBJECTIVE FUNCTION mu = pymc3.Deterministic('mu', predict_y(var1, var2, var3, var4, var5, var6)) sigma = pymc3.HalfNormal('sigma', 0.15) # sigma = pm.Deterministic('sigma', predict_sigma(var1, var2, var3, var4, var5, var6)) y_obs = pymc3.Normal('y_obs', mu=mu, sd=sigma, observed=0) # RUN MODEL, SAVE TO DISC AND PLOT RESULTS with basic_model: # Running step = pymc3.Metropolis() trace = pymc3.sample(iteration_pymc3, tune=1000, njobs=1, step=step) # Saving df_trace = pymc3.trace_to_dataframe(trace) #CREATE GRAPHS AND SAVE TO DISC df_trace.to_csv(locator.get_calibration_posteriors(building_name, building_load)) pymc3.traceplot(trace) columns = ["var1", "var2", "var3", "var4", "var5", "var6"] seaborn.pairplot(df_trace[columns]) if config.single_calibration.show_plots: plt.show() #SAVING POSTERIORS IN PROBLEM problem['posterior_norm'] = df_trace.as_matrix(columns=columns) pickle.dump(problem, open(locator.get_calibration_problem(building_name, building_load), 'w')) return
# In[16]:

adata.raw = adata
adata = adata[:, adata.var.highly_variable]
data = adata.X

# In[17]:

mmscaler = preprocessing.MinMaxScaler()

# In[18]:

data = mmscaler.fit_transform(data)

# In[19]:

Xtarget_train, Xtarget_valid = train_test_split(data,
                                                test_size=valid_size,
                                                random_state=42)

# In[20]:
url = 'C:\\Users\\Lenovo\\Desktop\\Sani\\andicatot\\data.txt'
#########################
names = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'OpenInt']
dataset = pd.read_csv(url, names=names)
dataset = dataset.drop(0, axis=0)
dataset = dataset.drop('Date', axis=1)
dataset = dataset.drop('OpenInt', axis=1)

# convert the remaining columns to floats
for i in range(1, 6):
    for k in range(1, 3202):
        dataset[names[i]][k] = float(dataset[names[i]][k])

# len = 3201
# learn = [0:2731]
# test = [2731:3201]
data_normaliser = preprocessing.MinMaxScaler()
dataset = data_normaliser.fit_transform(dataset)

deltap = []
for i in range(1, 3201):
    deltap.append(dataset[i - 1][0] - dataset[i][0])
deltat = deltap[:2731]
for i in range(2731, 3200):
    deltat.append(deltap[i - 1] - deltap[i])

plt.plot(deltap, color='red')
plt.plot(deltat, color='green')
style = plt.gcf()
style.set_size_inches(12, 10)
plt.show()
data.features = data[["text"]] df = pd.DataFrame(data.features) data.features = data["text"].apply(lambda x: remove_puncs(x)) data.features = sent_tokenize(str(data.features)) data.features = word_tokenize(str(data.features)) #data.features=[word for word in data.features if word.isalpha()] #data.features=nltk.word_tokenize(data.features) #data.features=data.features.apply(lambda x: ' '.join([word for word in x if word not in stopwords.words()])) #df['text']=pd.to_numeric(df["text"],errors="coerce") #data.features=data["text"].apply(lambda x:remove_stopwords(x)) data.target = data.Label #print(dtypes) #data.features = SimpleImputer(missing_values=np.nan, strategy='mean') print(data.features) data.features = pd.get_dummies(data["text"]) data.features = preprocessing.MinMaxScaler().fit_transform(data.features) feature_train, feature_test, target_train, target_test = train_test_split( data.features, data.target, test_size=0.25) model = KNeighborsClassifier(n_neighbors=52) fittedModel = model.fit(feature_train, target_train) predictions = fittedModel.predict(feature_test) predTrain = fittedModel.predict(feature_train) print("Test:-", accuracy_score(target_test, predictions)) print("Training:-", accuracy_score(target_train, predTrain)) print(feature_train)
    training_data.append(get_tuple(results, i, n))
    training_labels.append(results["y"].values[i])


def map_range(x, in_min, in_max, out_min, out_max):
    return (x - in_min) * (out_max - out_min) / (in_max - in_min) + out_min


training_data_sigmoid = training_data.copy()
for data_point in training_data:
    for x in data_point:
        x = x  # no-op: the input values are left unchanged here

training_labels_sigmoid = training_labels.copy()
min_max_scaler = preprocessing.MinMaxScaler()
training_labels_sigmoid = min_max_scaler.fit_transform(
    results[['y']].values.astype(float))

preceptron_sigmoid.train(training_data_sigmoid, training_labels_sigmoid,
                         epochs=10)
print("Trained Weights (sigmoid):", preceptron_sigmoid._weights)

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(1, 3))
plt.plot(
    results["x"].values,
    min_max_scaler.fit_transform(
        np.array([
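# A small illustrative check (not part of the snippet above; the sample data
# is made up): MinMaxScaler(feature_range=(out_min, out_max)) performs the
# same per-column mapping as the hand-rolled map_range() helper defined above.
import numpy as np
from sklearn import preprocessing

vals = np.array([[2.0], [5.0], [11.0]])
scaled = preprocessing.MinMaxScaler(feature_range=(1, 3)).fit_transform(vals)
manual = np.array([[map_range(v, vals.min(), vals.max(), 1, 3)]
                   for v in vals.ravel()])
assert np.allclose(scaled, manual)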
plt.rcParams['font.size'] = 15
plt.rcParams['font.family'] = 'Times New Roman'
from math import sqrt
from sklearn.metrics import mean_squared_error

np.random.seed(1337)  # for reproducibility
import warnings
warnings.filterwarnings('ignore')

data_dim = 4
timesteps = 6
out_dim = 6

dataset = pd.read_csv('multistep_feature.csv', header=None)
min_max_scaler_input = preprocessing.MinMaxScaler()   # scaler for the inputs
min_max_scaler_output = preprocessing.MinMaxScaler()  # scaler for the outputs
data_input = dataset.iloc[:, :24].values    # input data
data_output = dataset.iloc[:, 24:].values   # output data

trainlen = int(len(data_input) * 0.8)       # number of training samples
testlen = int(len(data_input) - trainlen)   # number of test samples
train_output = data_output[:trainlen]       # training outputs
test_output = data_output[trainlen:]        # test outputs

data_input = min_max_scaler_input.fit_transform(data_input)     # scale the inputs
data_output = min_max_scaler_output.fit_transform(data_output)  # scale the outputs
x_train = data_input[:trainlen].reshape(trainlen, timesteps, data_dim)  # training inputs
def normalization(data):
    data_np = data.values  # returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(data_np)
    X_train_nor = pd.DataFrame(x_scaled)
    return X_train_nor
def set_police_norm(self):
    crime = Crime()
    crime_police = crime.set_police_norm()
    police = pd.pivot_table(crime_police, index='구별', aggfunc=np.sum)
    print(f'{police.head()}')
    # arrest rate (%) per crime type
    police['살인검거율'] = (police['살인 검거'] / police['살인 발생']) * 100
    police['강간검거율'] = (police['강간 검거'] / police['강간 발생']) * 100
    police['강도검거율'] = (police['강도 검거'] / police['강도 발생']) * 100
    police['절도검거율'] = (police['절도 검거'] / police['절도 발생']) * 100
    police['폭력검거율'] = (police['폭력 검거'] / police['폭력 발생']) * 100
    police = police.drop(columns=['살인 검거', '강간 검거', '강도 검거', '절도 검거', '폭력 검거'])
    crime_rate_columns = ['살인검거율', '강간검거율', '강도검거율', '절도검거율', '폭력검거율']
    for i in crime_rate_columns:
        # rates above 100 come from a period mismatch in the data, so cap them at 100
        police.loc[police[i] > 100, i] = 100
    police.rename(columns={
        '살인 발생': '살인',
        '강간 발생': '강간',
        '강도 발생': '강도',
        '절도 발생': '절도',
        '폭력 발생': '폭력',
    }, inplace=True)
    crime_columns = ['살인', '강간', '강도', '절도', '폭력']
    x = police[crime_rate_columns].values
    min_max_scalar = preprocessing.MinMaxScaler()
    """
    Scaling applies a linear transformation that maps the values of the whole
    dataset onto a common range.
    """
    x_scaled = min_max_scalar.fit_transform(x.astype(float))
    """
    Normalization: when working with large amounts of data it is often
    necessary, for various reasons, to normalize it, i.e. to match the ranges
    of the data or make the distributions similar (mean normalization,
    median normalization, ...).
    """
    police_norm = pd.DataFrame(x_scaled, columns=crime_columns, index=police.index)
    police_norm[crime_rate_columns] = police[crime_rate_columns]
    cctv = Cctv()
    cctv_pop = cctv.get_cctv_pop()
    print(f'cctv_pop : {cctv_pop.head()}')
    police_norm['범죄'] = np.sum(police_norm[crime_rate_columns], axis=1)
    police_norm['검거'] = np.sum(police_norm[crime_columns], axis=1)
    print(f'police_norm columns :: {police_norm.columns}')
    reader = self.reader
    reader.context = os.path.join(baseurl, 'saved_data')
    reader.fname = 'police_norm.csv'
    police_norm.to_csv(reader.new_file(), sep=',', encoding='utf-8')
def clean_regression_data(input_path, output_path): """preparing preloaded data for regression and visualizaiton Warning: This function directly calls data_clean.csv from data folder Do not remove this file! Input: input_path - local path for data_clean.csv output_path - local path for outputing Output: Cleaned data frame ready for regression analysis and model building """ df = pd.read_csv(input_path, encoding="latin1") # drop unnecessary columns df = df.drop([ "Unnamed: 0", "imdb_id", "Title", "X.x", "X.y", "Country", "Actors", "Director", "Year", "Production" ], axis=1) # drop_missing values mis_val_col = ["Genre", "IMDB.Votes", "Runtime", "IMDB.Rating", "Language"] for col in mis_val_col: df = df.drop(df[df[col].isnull()].index) # budget df["budget"] = df["budget"].map(lambda x: math.log10(x)) # revenue df["revenue"] = df["revenue"].map(lambda x: math.log10(x)) # genre df = pd.concat([df, df['Genre'].str.get_dummies(sep=', ')], axis=1) df['Thriller'] = df[['Thriller', 'Horror']].sum(axis=1) df['Fantasy'] = df[['Fantasy', 'Sci-Fi']].sum(axis=1) df['Other_genre'] = df[[ 'Music', 'History', 'Sport', 'War', 'Western', 'Musical', 'Documentary', 'News' ]].sum(axis=1) df.drop([ 'Music', 'History', 'Sport', 'War', 'Western', 'Musical', 'Documentary', 'News', 'Horror', 'Sci-Fi' ], axis=1, inplace=True) genre_lst = list(df)[19:32] for x in genre_lst: df.loc[df['%s' % x] > 1, '%s' % x] = 1 df = df.drop("Genre", axis=1) # IMDB.Votes df['IMDB.Votes'] = df['IMDB.Votes'].replace(',', '', regex=True) df['IMDB.Votes'] = df['IMDB.Votes'].astype(int) df["IMDB.Votes"] = df["IMDB.Votes"].map(lambda x: math.log10(x)) # language df['Language'] = df.Language.str.count(',') + 1 # rated df["Rated"] = df["Rated"].replace(np.nan, "UNRATED")\ .replace("NOT RATED", "UNRATED") df = df.drop(df[(df["Rated"] == "TV-MA") | (df["Rated"] == "TV-PG") | (df["Rated"] == "TV-14")].index) df = pd.concat([df, df['Rated'].str.get_dummies(sep=', ')], axis=1) # released # index of released date col index = df.columns.get_loc("Released") # change date data to timestamp release_dates = pd.to_datetime(df["Released"]) # released date is weekend of not weekend_list = [] for each in release_dates: day_ofweek = each.dayofweek if day_ofweek >= 4 and day_ofweek <= 6: tag = 1 else: tag = 0 weekend_list.append(tag) # released date is on dump months undumpmonth_list = [] for each in release_dates: month = each.month if month == 12 or month == 1 or month == 2 or month == 8 or month == 9: tag = 0 else: tag = 1 undumpmonth_list.append(tag) df.insert(loc=index + 1, column="released_on_weekend", value=weekend_list) df.insert(loc=index + 2, column="released_not_on_dump_month", value=undumpmonth_list) df.drop("Released", axis=1) # runtime df["Runtime"] = df["Runtime"].map(lambda x: int(x.strip("min"))) # normalization x1 = df[[ 'IMDB.Rating', 'IMDB.Votes', 'Language', 'Runtime', 'budget', 'actor_popularity', 'director_popularity' ]] x2 = df[[ 'released_on_weekend', 'released_not_on_dump_month', 'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Mystery', 'Romance', 'Thriller', 'Other_genre', 'G', 'NC-17', 'PG', 'PG-13', 'R', 'UNRATED' ]] y = df['revenue'].reset_index().drop("index", axis=1) normalizer = preprocessing.MinMaxScaler() x1 = normalizer.fit_transform(x1) x1 = pd.DataFrame(x1, columns=[ 'IMDB.Rating', 'IMDB.Votes', 'Language', 'Runtime', 'budget', 'actor_popularity', 'director_popularity' ]) x2 = x2.reset_index().drop("index", axis=1) X = pd.concat([x1, x2], axis=1) df_for_model = 
pd.concat([X, y], axis=1) df_for_model.to_csv(output_path, encoding="latin1") return df_for_model
Split into training and test set
*********************************************************************************************************************
'''
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    boston.data, boston.target, test_size=0.2, random_state=0)

print '''
*********************************************************************************************************************
Standardize / Normalize
*********************************************************************************************************************
'''
# fit the scaler on the training set and apply the same transform to the test set
#scaler = preprocessing.StandardScaler().fit(X_train)
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
#print scaler.mean_
#print scaler.scale_
print scaler.data_min_
print scaler.data_max_
print X_train[:5, :]
print X_test[:5, :]

print '''
*********************************************************************************************************************
Linear regression
from django.db.models import F
from binarystars.models import InterpolatedBinaryStars
import numpy as np
from random import randint
from sklearn.cluster import KMeans, DBSCAN
from sklearn import preprocessing
import binarystars.cluster.clusteredstar as cstar

MAX_ROWS = 1001  # might have to change this to be a calculation like what is done in interpolate.py
LOWER_SEED_BOUND = 1
# 2^31 .. just using a number that is high to try and get a good amount of
UPPER_SEED_BOUND = 2147483648

DATA_PROCESSORS = {
    "minmax": preprocessing.MinMaxScaler(),
    "abs": preprocessing.MaxAbsScaler(),
    "standard": preprocessing.StandardScaler()
}


def preprocess_data(data: np.ndarray, standardizer: str) -> np.ndarray:
    return DATA_PROCESSORS[standardizer].fit_transform(data)


def get_stars(n_clusters: int = None, n_samples: int = None, eps: float = None,
              standardizer: str = None, cluster_type: str = None,
              attributes: dict = None, time_steps: int = 1,
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - look_ahead - 1):
        a = dataset[i:(i + look_back), :]
        dataX.append(a)
        dataY.append(dataset[i + look_back + look_ahead, :])
    return np.array(dataX), np.array(dataY)


sds = pickle.load(open("./GitHub_misc/sds", "rb"))
series = pickle.load(open("./GitHub_misc/series", "rb"))
N, H, W = sds.shape
gblur_size = 5
look_back = 15
look_ahead = 8
mmscaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
modelname = '360net'
model3 = models.load_model(
    './GitHub_misc/model3_{}_128_w16_h9_4000'.format(modelname))
print(model3.summary())

headmap = np.array(
    [create_fixation_map(None, series, idx) for idx, _ in enumerate(series)])
headmap = np.array(
    [cv2.GaussianBlur(item, (gblur_size, gblur_size), 0) for item in headmap])
headmap = mmscaler.fit_transform(headmap.ravel().reshape(-1, 1)).reshape(
    headmap.shape)
ds = np.zeros(shape=(N, 2, H, W))
ds[:, 0, :, :] = sds
ds[:, 1, :, :] = headmap
def normalizedata(datain):
    min_max_scaler = preprocessing.MinMaxScaler()
    scaledata = min_max_scaler.fit_transform(datain)
    return scaledata
# 3. Plot the performance (such as error rate/accuracy)
from reservoir import onlineESNWithRLS as ESN, ReservoirTopology as topology
from plotting import OutputPlot as outputPlot
import numpy as np
import os
from datetime import datetime
from sklearn import preprocessing as pp
from reservoir import Utility as util
from performance import ErrorMetrics as rmse

# Read data from the file
data = np.loadtxt('darwin.slp.txt')

# Normalize the raw data
minMax = pp.MinMaxScaler((-1, 1))
data = minMax.fit_transform(data).reshape((data.shape[0], 1))

# Divide the data into training data and testing data
trainingData, testingData = util.splitData2(data, 0.8)
nTesting = testingData.shape[0]

# Form feature vectors
inputTrainingData, outputTrainingData = util.formFeatureVectors(trainingData)

# Tune the network
size = int(trainingData.shape[0] / 10)
initialTransient = 100

# Input-to-reservoir fully connected
inputWeight = topology.ClassicInputTopology(inputSize=inputTrainingData.shape[1],
                                            reservoirSize=size).generateWeightMatrix()
# ### Scale Continuous Values

# In[41]:

# obtain scales from the train set
from sklearn import preprocessing

continous = train_df[[
    'trip_distance', 'fare_amount', 'tolls_amount', 'trip_time', 'avg_speed',
    'Precipitation', 'Snow_depth', 'Snowfall', 'Max_temp', 'Min_temp',
    'Avg_wind_speed', 'Gust_speed'
]]
scaler = preprocessing.MinMaxScaler().fit(continous)
continous = scaler.transform(continous)
train_df[[
    'trip_distance', 'fare_amount', 'tolls_amount', 'trip_time', 'avg_speed',
    'Precipitation', 'Snow_depth', 'Snowfall', 'Max_temp', 'Min_temp',
    'Avg_wind_speed', 'Gust_speed'
]] = continous

# In[42]:

# apply the same scaler to the validation and test sets
# validation
from sklearn import preprocessing
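# A minimal sketch of the step the cell above starts, assuming a validation
# frame named val_df with the same columns as train_df (the name is
# hypothetical): reuse the scaler fitted on the training set via transform()
# so the validation features are mapped with the training-set min/max.
cols = [
    'trip_distance', 'fare_amount', 'tolls_amount', 'trip_time', 'avg_speed',
    'Precipitation', 'Snow_depth', 'Snowfall', 'Max_temp', 'Min_temp',
    'Avg_wind_speed', 'Gust_speed'
]
val_df[cols] = scaler.transform(val_df[cols])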
X.drop('name', axis=1, inplace=True)

# Splice out the status column:
Y = X['status'].copy()
X.drop('status', axis=1, inplace=True)

# Perform a train/test split:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3,
                                                    random_state=7)

# Program a best-parameter search:
scalers = {
    'NoScaler': False,
    'StandardScaler': preprocessing.StandardScaler(),
    'Normalizer': preprocessing.Normalizer(),
    'MaxAbsScaler': preprocessing.MaxAbsScaler(),
    'MinMaxScaler': preprocessing.MinMaxScaler(),
    'RobustScaler': preprocessing.RobustScaler()
}

best_score = 0
for sk, sv in scalers.items():
    proc = sv
    if proc:
        proc.fit(X_train)
        tX_train = proc.transform(X_train)
        tX_test = proc.transform(X_test)
    else:
        tX_train = X_train.copy()
        tX_test = X_test.copy()

    # Check dimensionality reduction? (PCA, Isomap, None)
    choice = 2
    if choice == 1:
]]

# Load in the SC dataframe
pickle_in = open("Rot3_data\\SC_full_df.pkl", "rb")
Animal_SC = pickle.load(pickle_in)
animals = [
    'AA01', 'AA03', 'AA05', 'AA07', 'DO04', 'DO08', 'SC04', 'SC05', 'VP01',
    'VP07', 'VP08'
]  # AA03 and SC04 don't do any trials
sc = clean_up_sc(Animal_SC)

# ESTIMATING THE HYPERPARAMETER
# "The hyperparameter value (λ) was selected independently for each rat using evidence optimization,
# on the basis of fivefold cross-validation."

# After the scaler transformation the intercept turns to zero, so it does not
# make a difference whether it is included or not.
scaler = preprocessing.MinMaxScaler()
x = scaler.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20,
                                                    random_state=0)
clf = LogisticRegressionCV(cv=5, random_state=0, fit_intercept=True)

# For our example, what is the baseline accuracy etc.
logreg = LogisticRegression(random_state=0, fit_intercept=True)
logreg.fit(x_train, y_train)
# look up what these values really mean
y_pred = logreg.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Accuracy: 0.6352395672333848
def func2(): user = {} for line in fileinput.input("../../data/select/select_a"): mac = line.strip().split(" ")[0] user[mac] = True fileinput.close() cnt_0, cnt_1 = 0, 0 docMap_1, docMap_2, docMap_3, docMap_4, classMap = {}, {}, {}, {}, {} for line in fileinput.input( "../../data/feature/trace_all_statistic_filter_feature_sex"): part = line.strip().split(" ") mac, sex, feat = part[0], int(part[1]), part[2:] if user.has_key(mac): if sex == 0: cnt_0 += 1 if sex == 1: cnt_1 += 1 _list = [] for f in feat: _list.append(float(f)) docMap_1[mac] = _list classMap[mac] = sex fileinput.close() print cnt_0, cnt_1 for line in fileinput.input( "../../data/feature/trace_online_statistic_filter_feature_sex"): part = line.strip().split(" ") mac, sex, feat = part[0], int(part[1]), part[2:] if user.has_key(mac): _list = [] for f in feat: _list.append(float(f)) docMap_2[mac] = _list fileinput.close() for line in fileinput.input( "../../data/feature/trace_http_statistic_filter_feature_sex"): part = line.strip().split(" ") mac, sex, feat = part[0], int(part[1]), part[2:] if user.has_key(mac): _list = [] for f in feat: _list.append(float(f)) docMap_3[mac] = _list fileinput.close() for line in fileinput.input("../../data/feature/keywords_normalize_sex"): part = line.strip().split(" ") mac, sex, feat = part[0], int(part[1]), part[2:] if user.has_key(mac): _list = [] for f in feat: _list.append(float(f)) docMap_4[mac] = _list fileinput.close() docList_1, docList_2, docList_3, docList_4, classList = [], [], [], [], [] # print len(user.keys()), len(docMap_1.keys()), len(docMap_2.keys()), len(docMap_3.keys()), len(docMap_4.keys()) for k, v in user.iteritems(): if k in docMap_1 and k in docMap_2 and k in docMap_3 and k in docMap_4 and k in classMap: docList_1.append(docMap_1[k]) docList_2.append(docMap_2[k]) docList_3.append(docMap_3[k]) docList_4.append(docMap_4[k]) classList.append(classMap[k]) docList_1, docList_2, docList_3, docList_4, classList = np.array( docList_1), np.array(docList_2), np.array(docList_3), np.array( docList_4), np.array(classList) min_max_scaler = preprocessing.MinMaxScaler() docList_1, docList_2, docList_3 = min_max_scaler.fit_transform( docList_1), min_max_scaler.fit_transform( docList_2), min_max_scaler.fit_transform(docList_3) cnt, errorCount = 0, 0 loo = LeaveOneOut(len(classList)) trainingdoc, trainingclass = [], [] # file = open("../../data/prediction/result","w") for train, test in loo: cnt += 1 print cnt trainingdoc_1, trainingdoc_2, trainingdoc_3, trainingdoc_4, trainingclass, testingdoc_1, testingdoc_2, testingdoc_3, testingdoc_4, testingclass\ = docList_1[train], docList_2[train], docList_3[train], docList_4[train], classList[train], docList_1[test], docList_2[test], docList_3[test], docList_4[test], classList[test] clf_1 = pipeline.Pipeline([ ('feature_selection', linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight='auto', random_state=None)), ('classification', svm.SVC(kernel='linear', class_weight='auto', probability=True)) ]) clf_2 = pipeline.Pipeline([ ('feature_selection', linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight='auto', random_state=None)), ('classification', svm.SVC(kernel='linear', class_weight='auto', probability=True)) ]) clf_3 = pipeline.Pipeline([ ('feature_selection', linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, 
class_weight='auto', random_state=None)), ('classification', svm.SVC(kernel='linear', class_weight='auto', probability=True)) ]) gnb = MultinomialNB() clf_1.fit(trainingdoc_1, trainingclass) clf_2.fit(trainingdoc_2, trainingclass) clf_3.fit(trainingdoc_3, trainingclass) gnb.fit(trainingdoc_4, trainingclass) docList_final = [] for one in train: res_1 = clf_1.predict_proba(docList_1[one])[0] res_2 = clf_2.predict_proba(docList_2[one])[0] res_3 = clf_3.predict_proba(docList_3[one])[0] res_4 = gnb.predict_proba(docList_4[one])[0] _list = [ res_1[0], res_1[1], res_2[0], res_2[1], res_3[0], res_3[1], res_4[0], res_4[1] ] docList_final.append(_list) res_1 = clf_1.predict_proba(testingdoc_1)[0] res_2 = clf_2.predict_proba(testingdoc_2)[0] res_3 = clf_3.predict_proba(testingdoc_3)[0] res_4 = gnb.predict_proba(testingdoc_4)[0] testing_final = [ res_1[0], res_1[1], res_2[0], res_2[1], res_3[0], res_3[1], res_4[0], res_4[1] ] print testing_final