def fit_dbscan(data, eps, min_samples, normalize=True, show=True,
               juxta_cluster_indices_grouped=None, threshold_legend=None):
    """Cluster the transposed ``data`` with DBSCAN and report the outcome.

    ``data`` is transposed before fitting, so its columns are the samples
    that get clustered.

    Args:
        data: 2-D array-like; transposed with ``np.transpose`` before use.
        eps: ``eps`` passed to ``sklearn.cluster.DBSCAN``.
        min_samples: ``min_samples`` passed to ``DBSCAN``.
        normalize: When true, min-max scale every feature to ``[-1, 1]``.
        show: When true, call ``pf.show_clustered_tsne`` with the result.
        juxta_cluster_indices_grouped: Forwarded to the plotting helper.
        threshold_legend: Forwarded to the plotting helper.

    Returns:
        Tuple ``(db, n_clusters_, labels, core_samples_mask, score)`` where
        ``score`` is the silhouette coefficient (sampled on at most 5000
        points).
    """
    from sklearn import metrics
    from sklearn.cluster import DBSCAN

    X = np.transpose(data)
    if normalize:
        from sklearn.preprocessing import minmax_scale
        # Keep the returned array. With copy=False the scaling only happens
        # in place when no dtype conversion is needed; for e.g. integer input
        # a new array is returned and the old code silently clustered the
        # unscaled data.
        X = minmax_scale(X, feature_range=(-1, 1), axis=0, copy=False)

    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise (label -1) if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    score = metrics.silhouette_score(X, labels, sample_size=5000)

    print('For eps={}, min_samples={}, estimated number of clusters={}'.format(eps, min_samples, n_clusters_))
    print("Silhouette Coefficient: {}".format(score))

    if show:
        pf.show_clustered_tsne(db, X, juxta_cluster_indices_grouped, threshold_legend)
    return db, n_clusters_, labels, core_samples_mask, score
def combine(a, b, w):
    """Return the weighted average of two score dicts on their shared keys.

    The values of each dict are min-max scaled to ``[0, 1]`` independently
    (over all of that dict's values, not only the shared keys). For every key
    present in both dicts the two scaled values are averaged with weights
    ``w``.

    Args:
        a: Mapping of key -> numeric score.
        b: Mapping of key -> numeric score.
        w: Two weights ``(weight_a, weight_b)`` passed to ``np.average``.

    Returns:
        dict mapping each shared key to its weighted average, in the
        iteration order of ``a``. Empty if either input is empty.
    """
    # Nothing can match; also avoids unpacking an empty zip().
    if not a or not b:
        return {}

    a_keys, a_vals = zip(*a.items())
    b_keys, b_vals = zip(*b.items())

    # Scale the values to the range 0-1.
    a_scaled = preprocessing.minmax_scale(a_vals, feature_range=(0, 1))
    b_scaled = preprocessing.minmax_scale(b_vals, feature_range=(0, 1))

    # A dict lookup replaces the former O(len(a) * len(b)) scan over a
    # structured array, which also truncated keys to 225 bytes and could not
    # hold non-ASCII text.
    b_lookup = dict(zip(b_keys, b_scaled))

    matches = {}
    for key, a_val in zip(a_keys, a_scaled):
        if key in b_lookup:
            matches[key] = np.average([a_val, b_lookup[key]], weights=w)
    return matches
def train_model_with_feature(config_name, clf_name, fill_na_opt, PCA_n_comp, clf, X, X_test, y): if PCA_n_comp!=-1: pca = PCA(PCA_n_comp) #PCA dimension reduction logger.info('PCA fit on count matrix') # rescale num to (0,1) X_all = pca.fit_transform( minmax_scale(np.vstack([X, X_test])) ) X, X_test = X_all[:X.shape[0], :], X_all[X.shape[0]:, :] logger.info('PCA fit done') logger.info('start training') print 'training size', X.shape, 'test size', X_test.shape X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.9) if clf_name=='xgb': clf.fit(X_train,y_train,eval_metric='mlogloss') else: clf.fit(X_train,y_train) logger.info(clf_name+'-'+fill_na_opt+'-pca('+str(PCA_n_comp)+') train log-loss='\ +str(log_loss(y_train, clf.predict_proba(X_train)))) logger.info(clf_name+'-'+fill_na_opt+'-pca('+str(PCA_n_comp)+') validate log-loss='\ +str(log_loss(y_val, clf.predict_proba(X_val)))) clf.fit(X, y) y_pred = clf.predict_proba(X_test) df_test[group_list] = y_pred logger.info('finish training') # , 'phone_brand_en', 'device_model_en' df_test.to_csv('output/'+config_name+'-'+clf_name+'-'+fill_na_opt+'-pca'+\ str(PCA_n_comp)+'-'+str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M'))\ +'.csv', columns=['device_id']+group_list, index=False) logger.info('finish outputing result')
def make_preprocessing_pandas(self, _df_csv_read_ori, _preprocessing_type, _label):
    """Scale the numeric, non-label columns of a DataFrame with scikit-learn.

    Label columns are never preprocessed. The DataFrame is modified in place
    and also returned. Missing values are replaced by 0.0 before scaling.

    Args:
        _df_csv_read_ori: pandas DataFrame to transform.
        _preprocessing_type: ``None`` / ``'null'`` for no preprocessing, or
            one name or a list of names out of ``'scale'``,
            ``'minmax_scale'``, ``'robust_scale'``, ``'normalize'``,
            ``'maxabs_scale'``. Requested scalers are applied in that order.
        _label: Label column name, or a collection of label column names.

    Returns:
        The preprocessed DataFrame (the same object that was passed in).
    """
    if _preprocessing_type is None or _preprocessing_type == 'null':
        logging.info("No Preprocessing")
        return _df_csv_read_ori

    logging.info("Preprocessing type : {0}".format(_preprocessing_type))

    # Compare names exactly. A plain ``in`` test on a string is a substring
    # test, so 'minmax_scale' used to trigger 'scale' as well.
    if isinstance(_preprocessing_type, str):
        requested = [_preprocessing_type]
    else:
        requested = list(_preprocessing_type)

    if _label is None:
        labels = []
    elif isinstance(_label, str):
        labels = [_label]
    else:
        labels = _label

    def _normalize_column(column):
        # normalize() needs 2-D input; treat the column as a single sample so
        # the whole column is scaled to unit norm.
        return preprocessing.normalize(column.values.reshape(1, -1))[0]

    scalers = (
        ('scale', preprocessing.scale),
        ('minmax_scale', preprocessing.minmax_scale),
        ('robust_scale', preprocessing.robust_scale),
        ('normalize', _normalize_column),
        ('maxabs_scale', preprocessing.maxabs_scale),
    )
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    for column_name, dtype in _df_csv_read_ori.dtypes.items():
        if dtype not in numerics or column_name in labels:
            continue
        for scaler_name, scaler in scalers:
            if scaler_name in requested:
                _df_csv_read_ori[column_name] = scaler(
                    _df_csv_read_ori[column_name].fillna(0.0))
    return _df_csv_read_ori
def resample(spectrum, resampled_header_broadcast, label_col=None, convolve=False, normalize=True):
    """Interpolate the first row of ``spectrum`` onto a new x-axis.

    Args:
        spectrum: DataFrame whose column labels are the x-axis values and
            whose first row holds the values to resample.
        resampled_header_broadcast: Object whose ``.value`` is the target
            x-axis.
        label_col: Optional column to exclude from interpolation and copy
            onto the result.
        convolve: When true, smooth the row with a ``Gaussian1DKernel(7)``
            before interpolating.
        normalize: When true, min-max scale the interpolated row.

    Returns:
        One-row DataFrame indexed like ``spectrum``, with three points
        trimmed from each end of the target axis.
    """
    resampled_header = resampled_header_broadcast.value

    if label_col is not None:
        logger.debug(spectrum.columns)
        without_label = spectrum.drop(label_col, axis=1)
        without_label.columns = pd.to_numeric(without_label.columns, errors="ignore")
    else:
        without_label = spectrum

    to_interpolate = without_label.iloc[0].values
    if convolve:
        to_interpolate = convolution.convolve(to_interpolate,
                                              convolution.Gaussian1DKernel(7),
                                              boundary="extend")
    logger.debug(without_label)

    interpolated = np.interp(resampled_header, without_label.columns.values, to_interpolate)
    # Remove artefacts that might appear at the edges because of the
    # convolution / interpolation.
    interpolated = interpolated[3:-3]

    if normalize:
        interpolated = prep.minmax_scale([interpolated], axis=1)
    else:
        # The DataFrame below needs one row with many columns; a 1-D array
        # would be read as a single column and fail on shape.
        interpolated = interpolated.reshape(1, -1)
    logger.debug("Interpolated:%s", interpolated)

    interpolated_df = pd.DataFrame(data=interpolated,
                                   columns=resampled_header[3:-3],
                                   index=spectrum.index.values)
    if label_col is not None:
        interpolated_df[label_col] = spectrum[label_col]
    return interpolated_df
def scale(x):
    """Min-max scale all entries of a 2-D array jointly to ``[0, 1]``.

    The array is flattened into a single column so that one global minimum
    and maximum are used, then restored to its original ``(m, n)`` shape.
    """
    m, n = x.shape
    as_column = np.reshape(x, (m * n, 1))
    scaled_column = minmax_scale(as_column, feature_range=(0, 1), axis=0)
    return np.reshape(scaled_column, (m, n))
def data_preprocess_musk_svm(bags):
    """Binarise bag labels and min-max scale all instances to ``[-1, 1]``.

    Each bag's ``'label'`` becomes 0 or 1, ``'inst_labels'`` is filled with
    that label for every instance, and the instances of all bags are scaled
    together feature-wise (axis 0) before being written back to their bags.

    Returns:
        ``(bags, bag_labels)``; ``bags`` is modified in place.
    """
    bag_labels = []
    stacked = []
    for bag in bags:
        n_instances, _ = bag['instances'].shape
        binary_label = 0 if 0 == bag['label'] else 1
        bag['label'] = binary_label
        bag['inst_labels'] = np.full([n_instances, ], float(binary_label))
        bag_labels.append(binary_label)
        stacked.extend(bag['instances'])

    # Scale every feature over the instances of all bags at once.
    scaled = preprocessing.minmax_scale(np.asarray(stacked), axis=0, feature_range=(-1, 1))

    start = 0
    for bag in bags:
        n_instances, _ = bag['instances'].shape
        stop = start + n_instances
        bag['instances'] = scaled[start:stop, :]
        start = stop
    return bags, bag_labels
def get_outliers_by_ransac(self, table, column_indexes):
    '''
    Get outliers using RANSAC regression, which deals better with large
    outliers in the y direction, and is faster than Huber when the number of
    samples is very large. RANSAC outputs perfect precision (100%) but far
    from perfect recall (could be 50% - 60%) in our experiments.

    The last entry of column_indexes selects the target column; the other
    entries select the regressors.

    Returns (outliers, confidences): the row indexes flagged as outliers and
    one confidence per outlier in [0.9, 0.99], obtained by min-max scaling
    the absolute residuals of the outliers. Both are empty when no outlier is
    found.
    '''
    X = table[:, column_indexes[:-1]].astype(float)
    X = utils.enforce_columns(X)
    y = table[:, column_indexes[-1]].astype(float)

    # Preprocessing doesn't make any difference for RANSAC in our
    # experiments, so the raw values are used.
    model_ransac = RANSACRegressor(LinearRegression())
    model_ransac.fit(X, y)

    outlier_mask = np.logical_not(model_ransac.inlier_mask_)
    outliers = [idx for idx, val in enumerate(outlier_mask) if val]
    residuals = abs(model_ransac.predict(X) - y)

    if not outliers:
        # minmax_scale rejects an empty array; return an empty selection.
        return (outliers, residuals[outliers])

    confidences = preprocessing.minmax_scale(residuals[outliers]) * 0.09 + 0.9
    return (outliers, confidences)
def ensemble():
    """Run 10-fold cross-validation of the boosted + k-means ensemble.

    Loads ``train_test_norm.pickle``, builds the feature matrix from each
    movie's ``'search_idx'`` and the target from ``'total_money'`` (divided
    by its maximum), min-max scales the features and scores every fold with
    ``score_en``.

    Returns:
        List with one ``score_en`` result per fold.
    """
    # Pickle data must be read in binary mode.
    # NOTE: pickle.load executes arbitrary code; only use trusted files.
    with open('train_test_norm.pickle', 'rb') as f:
        train_set, _ = pickle.load(f)

    m, n = len(train_set), len(train_set[0]['search_idx'])
    X = np.zeros([m, n])
    Y = np.zeros([m, ])
    for i, movie in enumerate(train_set):
        X[i, :] = np.asarray(movie['search_idx'], float)
        Y[i] = float(movie['total_money'])
    Y /= np.max(Y)
    X = minmax_scale(X)
    W = calc_W(Y)

    result = []
    kf = cross_validation.KFold(n=m, n_folds=10, shuffle=True, random_state=None)
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        W_train = W[train_index]
        boost = train_with_search_index(X_train, y_train, W_train)
        classifier, regressors = train_with_kmeans(X_train, y_train, W_train)
        result.append(score_en(boost, classifier, regressors, X_test, y_test))
    # The scores used to be computed and then dropped.
    return result
def parse_file(file_name):
    """Parse a comma-separated feature file into a scaled matrix and names.

    Every line is split on commas. Fields 0-61 are numeric, field 62 is an
    image path, 63 a ';'-separated type list, 64 a key of ``params['P']``,
    65-67 numeric and 68 the sample name. ``params`` is read from
    ``params.pickle``.

    Returns:
        ``(X, names)`` where ``X`` is the min-max scaled first 31 columns of
        the assembled feature matrix and ``names`` the stripped field 68 of
        every line.
    """
    with open(file_name) as f:
        lines = f.readlines()
    # Pickle data must be read in binary mode.
    # NOTE: pickle.load executes arbitrary code; only use trusted files.
    with open('params.pickle', 'rb') as f:
        params = pickle.load(f)

    type_base = 112
    p_base = type_base + len(params['T'])
    extra_base = p_base + len(params['P'])
    m, n = len(lines), extra_base + 1 + 1 + 1

    X = np.zeros([m, n])
    names = []
    for i, line in enumerate(lines):
        # Decode only byte strings so the code also runs where split()
        # already yields text.
        units = [unit.decode('utf-8') if isinstance(unit, bytes) else unit
                 for unit in line.split(',')]
        X[i, 0:31] = np.array([float(units[c]) for c in range(31)], float)
        X[i, 31:62] = np.array([float(units[c]) for c in range(31, 62)], float)

        # The image is loaded and blanked but not used; columns 62-111 of X
        # stay zero.
        img = plt.imread(units[62])
        img[:] = 0

        types = set(units[63].split(';'))
        for c, typ in enumerate(params['T']):
            if typ in types:
                X[i, type_base + c] = 1
        X[i, p_base + params['P'][units[64]]] = 1
        X[i, extra_base] = float(units[65])
        X[i, extra_base + 1] = float(units[67])
        X[i, extra_base + 2] = float(units[66]) - params['B']
        names.append(units[68].strip())

    # NOTE(review): only the first 31 columns are scaled and returned; the
    # remaining columns built above are discarded - confirm this is intended.
    X = minmax_scale(X[:, 0:31])
    return X, names
def calcImageHist(imagePath, nbins):
    """Read an image, combine its colour channels and histogram the result.

    ``convColors`` is applied along the last axis of the image, the result is
    min-max scaled, and ``np.histogram`` with ``nbins`` bins is returned.
    """
    pixels = io.imread(imagePath)
    composed = np.apply_along_axis(convColors, 2, pixels)
    scaled = preprocessing.minmax_scale(composed.astype(float))
    # NOTE(review): only row 0 of the scaled array is histogrammed, and
    # minmax_scale works per column by default - confirm this is intended.
    first_row = scaled[0]
    return np.histogram(first_row, bins=nbins)
def minMaxScale(df):
    """Min-max normalise the data.

    :param df: the DataFrame to normalise
    :returns: the normalised data as returned by ``minmax_scale``
    :raises TypeError: if ``df`` is not a pandas DataFrame
    """
    if not isinstance(df, pd.DataFrame):
        # TypeError is still an Exception, so existing handlers keep working.
        raise TypeError("df is not DataFrame!")
    return preprocessing.minmax_scale(df)
def scale_bag(bags):
    """Label the instances of every bag and scale each instance to ``[0, 1]``.

    ``'inst_labels'`` is set to all zeros for bags whose label equals 0 and
    to all ones otherwise. Every instance (row) is min-max scaled on its own
    (axis 1).

    Returns:
        ``(bags, bag_labels)``; ``bags`` is modified in place.
    """
    bag_labels = []
    for bag in bags:
        n_instances, _ = bag['instances'].shape
        binary_label = 0 if 0 == bag['label'] else 1
        bag['inst_labels'] = np.full([n_instances, ], float(binary_label))
        bag_labels.append(binary_label)
        bag['instances'] = preprocessing.minmax_scale(
            bag['instances'], axis=1, feature_range=(0, 1))
    return bags, bag_labels
def rank_plt(self, ylim=(0.0, 1.1)):
    """Plot min-max scaled indicators and the raw score for ``self.rank``.

    ``self.rank_get()`` is called first when no ``rank`` attribute exists.
    """
    if not hasattr(self, 'rank'):
        self.rank_get()

    _, axs = plt.subplots(figsize=(13, 5))
    axs.set_ylim(ylim)
    axs.grid(True)

    # movIdx
    mov_idx = np.array([self[uuid].logMI() for uuid in self.rank])
    axs.plot(minmax_scale(mov_idx), color='y', label='movIdx')

    # Summed xLastR values for 'site' and 'adult'.
    for kind, colour, caption in (('site', 'c', 'bSites'), ('adult', 'm', 'adults')):
        counts = np.array(
            [np.sum(self[uuid].xLastR(kind, self.xLast)) for uuid in self.rank],
            dtype='float')
        axs.plot(minmax_scale(counts), color=colour, label=caption)

    # The score is plotted unscaled.
    scores = np.array([self[uuid].score for uuid in self.rank])
    axs.plot(scores, color='b', label='uScore')

    plt.legend(loc='upper left')
    plt.show()
def sampling_pdf(self, n_sample=20):
    """Return randomly drawn rows with their scaled model output.

    ``n_sample`` rows are drawn from ``self.values`` with ``random.choice``,
    the model's predictions for them are min-max scaled to ``[0, 1]`` and
    appended as a ``"proba"`` column; rows are sorted by it, highest first.
    """
    predictor = self.model
    pool = self.values

    vd_samples = pool[random.choice(len(pool), n_sample)]
    scores = predictor.predict(vd_samples)
    scores = minmax_scale(scores, feature_range=(0, 1), axis=0, copy=False)

    proba_column = np.asarray([scores]).T
    table = np.concatenate([vd_samples, proba_column], axis=1)
    descending = table[:, -1].argsort()[::-1]

    column_names = list(self.df.columns) + ["proba"]
    return pd.DataFrame(data=table[descending], columns=column_names)
def ConstructArrays(array):
    """Split a 2-D array into a scaled feature matrix and an output column.

    The last column is returned unchanged as the output. All other columns
    are min-max scaled and prefixed with a column of ones.

    Returns:
        ``(featuresArray, outputArray)``.
    """
    n_cols = np.shape(array)[1]
    bias = np.ones((len(array), 1))
    inputs, outputArray = np.split(array, [n_cols - 1], 1)
    featuresArray = np.hstack([bias, prep.minmax_scale(inputs)])
    return featuresArray, outputArray
def normalize(img):
    """Normalize image intensity.

    The 4-D image is flattened to rows of its last axis, min-max scaled,
    reshaped back and re-wrapped with the original ``info``.
    """
    assert img.ndim == 4
    logging.info('Preprocessing: %s', array_info(img))
    meta, shape = img.info, img.shape
    flat = img.reshape((-1, shape[-1]))
    scaled = preprocessing.minmax_scale(flat).reshape(shape)
    return dwi.image.Image(scaled, info=meta)
def preprocess(self):
    """
    Scale the selected properties to [0, 1] and subtract the nuggets.

    The nuggets are the first row of the scaled data.

    Returns:
        pandas.core.frame.DataFrame: Scaled data minus the nuggets
        pandas.core.series.Series: The subtracted nuggets
    """
    import sklearn.preprocessing as skp

    props = self.properties
    # Normalization
    scaled_values = skp.minmax_scale(self.exp_var_raw[props])
    scaled_data = pn.DataFrame(scaled_values, columns=props)
    # Nuggets
    nuggets = scaled_data[props].iloc[0]
    return scaled_data - nuggets, nuggets
def add_golden_features(X):
    """Append pairwise and per-column engineered features to ``X``.

    For every column pair ``(i, j)`` with ``i < j`` the standardised product
    and sum are added; for every column the log of its min-max scaled values
    and its standardised rank. Finally ``dim_reduce`` of the first 38 columns
    is appended. Progress is written to stdout.

    Args:
        X: pandas DataFrame of numeric features.

    Returns:
        DataFrame with the original columns followed by the new features,
        with columns renamed to ``gf_0 .. gf_{n-1}``.
    """
    n_cols = len(X.columns)
    # Collect the pieces and stack once; stacking inside the loop copied the
    # growing matrix on every iteration.
    pieces = [X]
    for i in range(n_cols):
        stdout.write("\r%d out of %d" % (i, n_cols))
        stdout.flush()
        # .iloc replaces the removed .ix indexer (positional access).
        col_i = X.iloc[:, i]
        for j in range(i + 1, n_cols):
            col_j = X.iloc[:, j]
            pieces.append(preprocessing.scale(np.array(col_i * col_j)))
            pieces.append(preprocessing.scale(np.array(col_i + col_j)))
        # NOTE(review): the minimum maps to 0 after scaling, so this log
        # yields -inf for it - confirm downstream code copes with that.
        pieces.append(np.log(
            preprocessing.minmax_scale(np.array(col_i), feature_range=(0, 1))))
        pieces.append(preprocessing.scale(np.array(rankdata(col_i))))
    pieces.append(dim_reduce(X.iloc[:, :38]))
    stdout.write("\n")

    with_added_features = np.column_stack(pieces)
    columns = ['gf_' + str(j) for j in range(with_added_features.shape[1])]
    return pd.DataFrame(with_added_features, columns=columns)
def scale_inst(bags):
    """Label the instances of every bag and scale all instances to ``[0, 1]``.

    Instances of all bags are stacked, each row is min-max scaled on its own
    (axis 1), and the scaled rows are written back to their bags.

    Returns:
        ``(bags, bag_labels)``; ``bags`` is modified in place.
    """
    bag_labels = []
    stacked = []
    for bag in bags:
        n_instances, _ = bag['instances'].shape
        binary_label = 0 if 0 == bag['label'] else 1
        bag['inst_labels'] = np.full([n_instances, ], float(binary_label))
        bag_labels.append(binary_label)
        stacked.extend(bag['instances'])

    scaled = preprocessing.minmax_scale(np.asarray(stacked), axis=1, feature_range=(0, 1))

    start = 0
    for bag in bags:
        n_instances, _ = bag['instances'].shape
        stop = start + n_instances
        bag['instances'] = scaled[start:stop, :]
        start = stop
    return bags, bag_labels
def cell_fd_extention(fname_org='sheet.gz/cell_db.cvs.gz', camera_bit_resolution=14):
    """Add Fresnel-diffraction columns to the cell table read from a CSV.

    Two columns are added to a copy of the table: ``'freznel image'`` with
    the flattened output of ``cell_fd_conv`` and ``'mag freznel image'`` with
    its absolute value, min-max scaled, multiplied by
    ``2 ** camera_bit_resolution`` and cast to int.
    """
    cell_df = pd.read_csv(fname_org)
    Limg, Lx, Ly = cell_fd_info(cell_df)
    cell_df_ext = cell_df.copy()

    # Fresnel diffraction
    fd_image = cell_fd_conv(cell_df)
    cell_df_ext['freznel image'] = fd_image.reshape(-1)

    magnitude = np.abs(fd_image.reshape(Limg, -1))
    full_scale = 2 ** camera_bit_resolution
    scaled = preprocessing.minmax_scale(magnitude) * full_scale
    quantised = scaled.reshape(Limg, Lx, Ly).astype(int)
    cell_df_ext['mag freznel image'] = quantised.reshape(-1)
    return cell_df_ext
def dataEncodeFor(approach, oriData):
    """Encode ``oriData`` with the method selected by ``approach``.

    Supported values: ``'AE'`` (auto-encoder; the result is also saved to
    ``aeEncode.npy``), ``'PCA'``, ``'PCA2'``, ``'KPCA'``, ``'KPCA2'``,
    ``'to01'`` (min-max scaling) and ``'Normal'`` (standardisation).

    Returns:
        The encoded data, or ``None`` for an unknown ``approach``.
    """
    # Strings are compared with ==; ``is`` tests object identity and only
    # worked when the interpreter happened to intern both strings.
    if approach == 'AE':
        encodedAEData = dataEncode(oriData)
        npen = encodedAEData.data.numpy()
        np.save('aeEncode.npy', npen)
        return npen
    if approach == 'PCA':
        return myPca.dataEncode(oriData, white=False)
    if approach == 'PCA2':
        return myPca.dataEncode2(oriData, white=False, stay=0.85)
    if approach in ('KPCA', 'KPCA2'):
        # Both names select the same RBF kernel PCA call.
        return myKpca.dataEncode('rbf', oriData, 90)
    if approach == 'to01':
        return preprocessing.minmax_scale(oriData)
    if approach == 'Normal':
        return preprocessing.scale(oriData)
    return None
def prepareDataSet(self, iris):
    """Normalise the data set and split it into training and testing lists.

    Samples are normalised, min-max scaled to ``[-1, 1]`` and extended with
    a constant bias input of 1. A sample goes to ``self.training`` when its
    index modulo ``self.int_num_per_class`` is below
    ``self.int_training_size``, otherwise to ``self.testing``. Both lists are
    shuffled at the end.
    """
    self.int_num_classes = numpy.unique(iris.target).shape[0]
    self.int_set_size, self.int_num_features = iris.data.shape[0], iris.data.shape[1]

    ## normalize data
    unit_rows = preprocessing.normalize(iris.data)
    self.data = unit_rows
    self.data = preprocessing.minmax_scale(unit_rows, (-1, 1))

    # load data in arrays
    for i, row in enumerate(self.data):
        sample = [numpy.append(row, 1), iris.target[i]]  ## bias input = 1
        within_training = (i % self.int_num_per_class) < self.int_training_size
        destination = self.training if within_training else self.testing
        destination.append(sample)

    numpy.random.shuffle(self.training)
    numpy.random.shuffle(self.testing)
def normalise_data(train_data: np.ndarray, test_data: np.ndarray):
    """Divide both arrays by 16 in place and print a comparison.

    Two alternative normalisations of ``train_data`` (a manual global
    min-max formula and a row-wise ``minmax_scale``) are computed before the
    division and compared with the divided data; everything is printed.
    Nothing is returned.
    """
    from sklearn import preprocessing

    peak = np.max(train_data)
    span = np.ptp(train_data)
    print("np.max(train_data): " + str(peak))
    print("np.ptp(train_data): " + str(span))

    normalised_1 = 1 - (train_data - peak) / -span
    normalised_2 = preprocessing.minmax_scale(train_data, axis=1)
    print(train_data[0])

    # In-place division: the caller's arrays are modified.
    train_data /= 16
    test_data /= 16

    print("Are arrays equal: " + str(np.array_equal(normalised_2, train_data)))
    print("Are arrays equal: " + str(np.array_equal(normalised_1, train_data)))

    print(train_data[0])
    print(normalised_1)
    print(normalised_2)
def dataEncodeFor(approach, oriData, stay=0.95):
    """Encode ``oriData`` with the method selected by ``approach``.

    Supported values: ``'AE'``, ``'PCA'``, ``'PCA2'``, ``'KPCA'``,
    ``'KPCA2'``, ``'to01'`` (min-max scaling) and ``'Normal'``
    (standardisation).

    Args:
        approach: Name of the encoding.
        oriData: Data to encode.
        stay: Passed on to PCA as is. For ``'AE'`` and the kernel PCA
            variants a whole number is used directly and any other value is
            multiplied by 331.

    Returns:
        The encoded data, or ``None`` for an unknown ``approach``.
    """
    def _component_count():
        # Whole numbers are used as given; fractions are scaled by 331.
        return stay if stay % 1 == 0 else stay * 331

    # Strings are compared with ==; ``is`` tests object identity and only
    # worked when the interpreter happened to intern both strings.
    if approach == 'AE':
        encodedAEData = dataEncode(oriData, _component_count())
        return encodedAEData.data.numpy()
    if approach == 'PCA':
        return myPca.dataEncode(oriData, white=False, stay=stay)
    if approach == 'PCA2':
        return myPca.dataEncode2(oriData, white=False, stay=0.85)
    if approach in ('KPCA', 'KPCA2'):
        # Both names select the same RBF kernel PCA call.
        return myKpca.dataEncode('rbf', oriData, _component_count())
    if approach == 'to01':
        return preprocessing.minmax_scale(oriData)
    if approach == 'Normal':
        return preprocessing.scale(oriData)
    return None
def process(file_in=PATH_FILE_IN, file_out=PATH_FILE_FINAL):
    """Build windowed samples and targets from the ``'close'`` series.

    The ``'close'`` column of the pickled frame is min-max scaled. Each
    sample holds 6 values spaced ``24 * 24`` steps apart; its target is the
    value ``6 * 24 * 24`` steps after the sample's first element. The last
    sample is dropped because it has no target. Samples go to
    ``file_out[0]`` and targets to ``file_out[1]``.
    """
    data = preprocessing.minmax_scale(pd.read_pickle(file_in)['close'])

    stride = 24 * 24
    horizon = 6 * stride
    n_windows = len(data) - horizon + 1

    windows = [[data[i + x * stride] for x in range(6)] for i in range(n_windows)]
    data_m = np.array(windows).reshape(-1, 6)
    data_s = np.array([data[i + horizon] for i in range(n_windows - 1)])

    np.save(file_out[0], data_m[:len(data_m) - 1])
    np.save(file_out[1], data_s)
def dataEncode(data, stay, EPOCH=500, BATCH_SIZE=32, LR=0.0001):
    """Train an auto-encoder on min-max scaled ``data`` and encode it.

    Args:
        data: Array-like training data.
        stay: Argument passed to the ``AutoEncoder`` constructor.
        EPOCH: Number of passes over the data.
        BATCH_SIZE: Mini-batch size.
        LR: Learning rate for Adam.

    Returns:
        The encoder output for the complete scaled data set.
    """
    scData = preprocessing.minmax_scale(data)
    scData = torch.from_numpy(scData).type(torch.FloatTensor)
    oriSc = Variable(scData)
    train_loader = Data.DataLoader(dataset=scData, batch_size=BATCH_SIZE, shuffle=True)

    ae = AutoEncoder(stay)
    optimizer = torch.optim.Adam(ae.parameters(), lr=LR)
    loss_func = nn.MSELoss()

    for epoch in range(EPOCH):
        for x in train_loader:
            # Input and reconstruction target are the same batch.
            tx = Variable(x)
            ty = Variable(x)
            encoded, decoded = ae(tx)
            loss = loss_func(decoded, ty)  # mean square error
            optimizer.zero_grad()          # clear gradients for this step
            loss.backward()                # backpropagation
            optimizer.step()               # apply gradients
            # .item() reads the scalar loss; indexing a 0-dim tensor with
            # [0] raises on current PyTorch.
            print('Epoch: ', epoch, '| train loss: %.6f' % loss.item())

    encoded_data, _ = ae(oriSc)
    return encoded_data
def get_outliers_by_huber(self, table, column_indexes):
    '''
    Get outliers using Huber regression, which outperforms RANSAC, but
    doesn't scale well when the number of samples is very large. Huber
    outputs both perfect precision (100%) and recall (100%) in our
    experiments.

    The last entry of column_indexes selects the target column; the other
    entries select the regressors.

    Returns (outliers, confidences): the row indexes flagged as outliers and
    one confidence per outlier in [0.9, 0.99], obtained by min-max scaling
    the absolute residuals of the outliers. Both are empty when no outlier is
    found.
    '''
    X = table[:, column_indexes[:-1]].astype(float)
    X = utils.enforce_columns(X)
    y = table[:, column_indexes[-1]].astype(float)

    # Preprocessing could make Huber fail on some datasets in our
    # experiments, so the raw values are used.
    model_huber = HuberRegressor()
    model_huber.fit(X, y)

    outlier_mask = model_huber.outliers_
    outliers = [idx for idx, val in enumerate(outlier_mask) if val]
    residuals = abs(model_huber.predict(X) - y)

    if not outliers:
        # minmax_scale rejects an empty array; return an empty selection.
        return (outliers, residuals[outliers])

    confidences = preprocessing.minmax_scale(residuals[outliers]) * 0.09 + 0.9
    return (outliers, confidences)
def build_images():
    """Load, resize, scale and save the training images and their labels.

    Images are collected from four folders, processed by ``process_image`` in
    a thread pool, min-max scaled per image (axis 1) and saved next to this
    module as ``images``; the folder index of each image (0-3) is saved as
    ``labels``.
    """
    # get image file paths for each image type
    path_stub = r'D:\Users\James\Dropbox\Programming\Python\Projects\pylinac test files'
    phantom_dir = osp.join(path_stub, '2D Image quality phantoms')
    pf_files = get_image_files(osp.join(path_stub, 'Picket Fences'))
    pipspro_files = get_image_files(osp.join(phantom_dir, 'PipsPro'))
    leeds_files = get_image_files(osp.join(phantom_dir, 'Leeds'))
    wl_files = get_image_files(osp.join(path_stub, 'Winston-Lutz'))
    groups = [pf_files, pipspro_files, leeds_files, wl_files]
    filepaths = pf_files + pipspro_files + leeds_files + wl_files
    print("{} files found".format(len(filepaths)))

    # preallocate
    total_array = np.zeros((len(filepaths), 10000), dtype=np.float32)
    print("Training array preallocated")

    # resize each image and store it in the row matching its file index
    start = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        row_of = {executor.submit(process_image, path): row
                  for row, path in enumerate(filepaths)}
        for done in concurrent.futures.as_completed(row_of):
            total_array[row_of[done], :] = done.result()
    print("Training array set in {:.2f}s".format(time.time() - start))

    # feature scale the images
    scaled_array = preprocessing.minmax_scale(total_array, feature_range=(0, 1), axis=1)
    print("Training array scaled")

    # save arrays to disk for future use
    out_dir = osp.dirname(osp.abspath(__file__))
    np.save(osp.join(out_dir, 'images'), scaled_array)
    labels = np.concatenate(
        [np.repeat(label, len(group)) for label, group in enumerate(groups)])
    np.save(osp.join(out_dir, 'labels'), labels)
    print("Images build")
def scale_data_minmax(X):
    """Return ``X`` min-max scaled with scikit-learn's defaults."""
    return preprocessing.minmax_scale(X)
# 2. Preparing More Features
from sklearn.preprocessing import minmax_scale

# The holdout set has a missing value in the Fare column; fill it with the
# mean fare of the training set.
train_fare_mean = train["Fare"].mean()
holdout["Fare"] = holdout["Fare"].fillna(train_fare_mean)

columns = ['SibSp', 'Parch', 'Fare']

# Missing embarkation ports are filled with 'S' before one-hot encoding.
train['Embarked'] = train['Embarked'].fillna('S')
train = create_dummies(train, "Embarked")
holdout['Embarked'] = holdout['Embarked'].fillna('S')
holdout = create_dummies(holdout, "Embarked")

# Each frame is scaled on its own minimum and maximum.
for col in columns:
    scaled_name = col + "_scaled"
    train[scaled_name] = minmax_scale(train[col])
    holdout[scaled_name] = minmax_scale(holdout[col])
print(train)

# 3. Determining the Most Relevant Features
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import pandas as pd

columns = [
    'Age_categories_Missing',
    'Age_categories_Infant',
    'Age_categories_Child',
    'Age_categories_Teenager',
    'Age_categories_Young Adult',
    'Age_categories_Adult',
    'Age_categories_Senior',
    'Pclass_1',
    'Pclass_2',
    'Pclass_3',
    'Sex_female',
    'Sex_male',
    'Embarked_C',
    'Embarked_Q',
    'Embarked_S',
    'SibSp_scaled',
    'Parch_scaled',
    'Fare_scaled',
]
def rating_pred_binary(rating_pred):
    """Min-max scale the prediction column of ``rating_pred`` in place.

    The column named by ``DEFAULT_PREDICTION_COL`` is cast to float, scaled
    and written back; the same object is returned.
    """
    # Normalize the predictions
    raw_scores = rating_pred[DEFAULT_PREDICTION_COL].astype(float)
    rating_pred[DEFAULT_PREDICTION_COL] = minmax_scale(raw_scores)
    return rating_pred
def save_left_out_matrix(alpha,
                         tax_ids,
                         left_out_tax_id,
                         blast_folder='./blast_files/',
                         network_folder='./network_files/',
                         block_matrix_folder='./block_matrix_files',
                         version=1):
    '''
    Build, save and post-process the predicted network of a left-out species.

    Function assumes all necessary block matrices have already been computed,
    and network files (including the left-out one, for protein ids only) have
    been downloaded from STRING.

    One replacement matrix is computed per entry of tax_ids, according to
    version:
        1: S^{T}S of the IsoRank block
        2: S^{T}AS, projecting the non-left-out organism's network
        3: R^{T}R of the BLAST matrix (BLAST-only baseline)
        4: R^{T}AR, BLAST network projection
    The replacements are averaged, min-max scaled and pickled together with
    the left-out protein ids. Finally the IsoRank block of the left-out
    species with itself is computed via save_single_isorank_block.

    Raises NotImplementedError for any other version.
    '''
    print('Save left out matrix!')
    print('Tax ids:')
    print(tax_ids)
    used_tax_ids = [tax_id for tax_id in tax_ids if tax_id != left_out_tax_id]
    # NOTE(review): the combos are built from tax_ids, not used_tax_ids, so
    # the left-out id itself is included if it is in tax_ids - confirm.
    tax_id_combos = [(tax_id, left_out_tax_id + '-leaveout') for tax_id in tax_ids]
    print(tax_id_combos)

    def _network_path(taxon):
        # Location of the pickled STRING network of one taxon.
        return network_folder + taxon + "_networks_string.v11.0.pckl"

    # An unused multiprocessing Pool was created here before; it only leaked
    # worker processes and has been removed.
    leftout_prot2index, _, left_out_net_prots = load_adj(_network_path(left_out_tax_id))

    if version == 1:  # S transpose S
        print('VERSION 1 (S^{T}S)')
        isorank_blocks = [
            load_single_isorank_block(tax_id_combo, alpha, block_matrix_folder)
            for tax_id_combo in tax_id_combos
        ]
        replacements = [
            get_s_transpose_s(isorank_block.todense())
            for isorank_block in isorank_blocks
        ]
    elif version == 2:  # S matrix network projection
        print(
            'VERSION 2 (S^{T}AS) S MATRIX NETWORK PROJECTION WITH NONLEFTOUT ORGANISM\'S NETWORK'
        )
        replacements = []
        for tax_id_combo in tax_id_combos:
            nonleftout_taxon = tax_id_combo[0]
            _, nonleftout_net, _ = load_adj(_network_path(nonleftout_taxon))
            isorank_block = load_single_isorank_block(tax_id_combo, alpha,
                                                      block_matrix_folder)
            replacements.append(
                get_s_transpose_A_s(isorank_block.todense(),
                                    nonleftout_net.todense()))
    elif version == 3:  # blast only baseline
        print('VERSION 3 (R^{T}R) BLAST ONLY')
        replacements = []
        for tax_id_combo in tax_id_combos:
            nonleftout_taxon = tax_id_combo[0]
            prot2index_1, _, _ = load_adj(_network_path(nonleftout_taxon))
            R = load_blast_from_taxa(nonleftout_taxon, left_out_tax_id,
                                     prot2index_1, leftout_prot2index,
                                     blast_folder)
            replacements.append(get_s_transpose_s(R.todense()))
    elif version == 4:  # blast network projection
        print('VERSION 4 (R^{T}AR) BLAST NETWORK PROJECTION')
        replacements = []
        for tax_id_combo in tax_id_combos:
            nonleftout_taxon = tax_id_combo[0]
            prot2index_1, nonleftout_net, _ = load_adj(_network_path(nonleftout_taxon))
            nonleftout_net = nonleftout_net.todense()
            R = load_blast_from_taxa(nonleftout_taxon, left_out_tax_id,
                                     prot2index_1, leftout_prot2index,
                                     blast_folder).todense()
            replacements.append(get_s_transpose_A_s(R, nonleftout_net))
    else:
        raise NotImplementedError(
            'Version for making left out network matrix must be either 1, 2, 3, 4.'
        )

    replacements = np.array(replacements)
    print(replacements.shape)
    left_out_matrix = np.mean(replacements, axis=0)
    print(left_out_matrix.shape)
    density = np.count_nonzero(left_out_matrix) / (left_out_matrix.shape[0] *
                                                   left_out_matrix.shape[1])
    print(left_out_matrix)
    print('Density of left out matrix: ' + str(density))

    left_out_matrix = minmax_scale(left_out_matrix)
    print(left_out_matrix)
    density = np.count_nonzero(left_out_matrix) / (left_out_matrix.shape[0] *
                                                   left_out_matrix.shape[1])
    print('Density of left out matrix after minmax scaling: ' + str(density))

    left_out_fname = network_folder + left_out_tax_id + "_leftout_network_using_" + ','.join(
        used_tax_ids) + '_version_' + str(version) + "_string.v11.0.pckl"
    left_out_feats = {}
    left_out_feats['net'] = sparse.csr_matrix(left_out_matrix)
    left_out_feats['prot_IDs'] = left_out_net_prots
    print(left_out_feats.keys())
    print('Dumping ' + left_out_fname)
    # The context manager closes the file even if dumping fails.
    with open(left_out_fname, 'wb') as out_file:
        pickle.dump(left_out_feats, out_file, protocol=4)

    print(
        'Making IsoRank block of leaveout species with intraspecies blast connections'
    )
    save_single_isorank_block((left_out_tax_id, left_out_tax_id),
                              alpha,
                              network_folder,
                              blast_folder,
                              block_matrix_folder,
                              False,
                              True,
                              used_tax_ids=used_tax_ids,
                              version=version)
# (The next three statements are the tail of a function whose `def` line lies
# outside this chunk; they are kept verbatim.)
    image1 = crop_image_from_gray(image1)
    image1 = cv2.resize(image1, (512, 512), Image.ANTIALIAS)
    return image1


# Script: for every file in the training folder, preprocess the image, scale
# its values to [0, 1], compute a local-entropy map of the grayscale version
# and save that map as an image.
#files = next(os.walk('A:/HealthAnalytics/image_Processing/Project/images'))[2]
path = '.\Project'
files = next(os.walk(path + '/Train_data'))[2]
for filename in files:
    #path='A:/HealthAnalytics/image_Processing/Project/images/'+filename
    # NOTE(review): `path` is rebuilt from itself, so it keeps growing from
    # one iteration to the next, and there is no separator between
    # '/Train_data' and the file name - confirm the intended location.
    path = path + '/Train_data' + filename
    image = cv2.imread(path)
    color_processed = preprocess_image(image)
    shape = color_processed.shape
    # Flatten so one global minimum/maximum is used, then restore the shape.
    image_scaled = sk.minmax_scale(color_processed.ravel(), feature_range=(0, 1)).reshape(shape)
    #plt.imshow(color_processed)
    #plt.hist(color_processed.ravel(), bins=256, range=(0.0, 1.0), fc='k', ec='k') #calculating histogram
    #plt.hist(entr_img.ravel(), bins=256, range=(0.0, 1.0), fc='k', ec='k') #calculating histogram
    rgbimage = rgb2gray(image_scaled)
    # Local entropy computed with a disk(10) neighbourhood.
    entr_img = entropy(rgbimage, disk(10))
    # NOTE(review): the third positional argument of cv2.resize is `dst`,
    # not the interpolation flag - verify this call.
    entr_img_resize = cv2.resize(entr_img, (256, 256), Image.ANTIALIAS)
    #plt.imshow(entr_img_resize)
    image_entropy = toimage(entr_img_resize)
    #plt.imshow(image_entropy)
    # NOTE(review): `path` already ends with the file name here, so the name
    # is appended a second time - confirm the intended output path.
    image_entropy.save(path + filename)
print(train.columns)

## 2. Preparing More Features ##
from sklearn.preprocessing import minmax_scale

# The holdout set has a missing value in the Fare column; fill it with the
# mean fare of the training set.
train_fare_mean = train["Fare"].mean()
holdout["Fare"] = holdout["Fare"].fillna(train_fare_mean)

columns = ['SibSp', 'Parch', 'Fare']

# Missing embarkation ports are filled with 'S' before one-hot encoding.
train["Embarked"] = train['Embarked'].fillna('S')
holdout["Embarked"] = holdout['Embarked'].fillna('S')
train = create_dummies(train, 'Embarked')
holdout = create_dummies(holdout, 'Embarked')

# Each frame is scaled on its own minimum and maximum.
for col in columns:
    scaled_name = col + '_scaled'
    train[scaled_name] = minmax_scale(train[col])
    holdout[scaled_name] = minmax_scale(holdout[col])

## 3. Determining the Most Relevant Features ##
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

columns = [
    'Age_categories_Missing',
    'Age_categories_Infant',
    'Age_categories_Child',
    'Age_categories_Teenager',
    'Age_categories_Young Adult',
    'Age_categories_Adult',
    'Age_categories_Senior',
    'Pclass_1',
    'Pclass_2',
    'Pclass_3',
    'Sex_female',
    'Sex_male',
    'Embarked_C',
    'Embarked_Q',
    'Embarked_S',
    'SibSp_scaled',
    'Parch_scaled',
    'Fare_scaled',
]
''' Created on May 23, 2016 @author: ahanagrawal '''
import numpy as np
from sklearn import preprocessing as prep

if __name__ == '__main__':
    # Print a random 5x5 matrix followed by its column-wise min-max scaling.
    sample = np.random.rand(5, 5)
    print(sample)
    scaled = prep.minmax_scale(sample)
    print(scaled)
    # Hand check of one entry, (value - min) / (max - min):
    # 0.56391324 - 0.0560/(0.9158 - 0.0560)
def save(obj, files):
    """Pickle ``obj`` to the file path ``files``."""
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle  # Python 3
    with open(files, 'wb') as f:
        pickle.dump(obj, f)


if __name__ == '__main__':
    # Run createCoOccurrenceMatrixByDBSCAN for a grid of (eps, min_samples)
    # values in a thread pool and save each row-wise scaled result matrix.
    paras = []
    resultFile = []
    for sparess in range(5, 6, 5):
        for fileNum in range(1, 2):
            trainFile = "/root/AAA/dataset/qos/tp/train/sparseness%s/training%d.txt" % (
                sparess, fileNum)
            side = 'user'
            ws = False
            for eps in [0.01, 0.1, 1.5, 1, 2, 5, 10, 15, 20, 40, 60, 80]:
                for min_samples in [1, 2, 4, 6, 8, 10]:
                    # eps is formatted with %s: %d truncated 0.01/0.1 to 0 and
                    # 1.5 to 1, so several runs wrote to the same file.
                    saveFile = "/root/AAA/dataset/qos/tp/PoolCoOccurrenceMatrix-%s-%d-%d-%s-%d" % (
                        side, sparess, fileNum, eps, min_samples)
                    resultFile.append(saveFile)
                    paras.append(([trainFile, eps, min_samples, ws], None))

    # putRequest() returns None; worker results are delivered through the
    # request callback, so they are collected here by request id.
    collected = {}

    def _store_result(request, result):
        collected[request.requestID] = result

    threadNum = 20
    pool = threadpool.ThreadPool(threadNum)
    requests = threadpool.makeRequests(createCoOccurrenceMatrixByDBSCAN, paras,
                                       _store_result)
    for req in requests:
        pool.putRequest(req)
    pool.wait()

    for index, req in enumerate(requests):
        if req.requestID not in collected:
            # The worker raised; there is nothing to save for this run.
            continue
        result = minmax_scale(collected[req.requestID], axis=1)
        np.savetxt(resultFile[index], result, delimiter='\t', fmt='%f')
def minmaxscale(series):
    """Normalize ``series`` by min-max scaling it into the range [0, 1]."""
    return preprocessing.minmax_scale(series, feature_range=(0, 1))
f_path = dir_path + f # if os.path.isdir(f_path): # continue # if f != 'ECG200': # continue test_data = numpy.loadtxt(f_path, delimiter=',') label = test_data[:, 0] test_data = test_data[:, 1:] rows, cols = test_data.shape label_count = dict() for i in range(rows): label_count[label[i]] = label_count.get(label[i], 0) + 1 test_data = scale(test_data, axis=1) # k = config.get(f) k = 8 center, width = get_seg_info(k) begin = time.clock() t_data = preprocessing(test_data, k) predict = minmax_scale(cal_score(test_data, t_data, cols, k)) end = time.clock() # auc = roc_auc_score(label, predict) score_ratio, pred = cal_score_ratio(label, predict) error = mean_squared_error(label, pred) auc = recall_score(label, pred, average='macro') # score_ratio = cal_score_ratio(label, predict) print("Data=%s, AUC=%f, error=%f, Score_ratio=%f, Time=%f" % (f, auc, error, score_ratio, (end - begin)))
def transform(self, topics_and_res):
    """Return a copy of ``topics_and_res`` with per-query scaled scores.

    Rows are grouped by the ``qid`` column and the ``score`` column of each
    group is passed through ``minmax_scale`` independently. The input frame
    is not modified.
    """
    from sklearn.preprocessing import minmax_scale

    def _scale_group(scores):
        return minmax_scale(scores)

    rescaled = topics_and_res.copy()
    scores_by_query = rescaled.groupby('qid')["score"]
    rescaled["score"] = scores_by_query.transform(_scale_group)
    return rescaled
Data_Set10 = pd.concat([Data_Set8,New_Col], axis = 1) """"""""""""""" Dummy Variables """"""""""""""" Data_Set10.info() Data_Set11 = pd.get_dummies(Data_Set10) Data_Set11.info() """"""""""""""" Normalization """"""""""""""" from sklearn.preprocessing import minmax_scale, normalize # First Method: Min Max Scale Data_Set12 = minmax_scale(Data_Set11, feature_range=(0,1)) Data_Set13 = normalize(Data_Set11, norm = 'l2', axis = 0) # axis = 0 for normalizing features / axis = 1 is for normalizing each sample Data_Set13 = pd.DataFrame(Data_Set13,columns = ['Time','E_Plug','E_Heat', 'Price','Temp', 'OffPeak','Peak'])
from sklearn.decomposition import PCA from sklearn import preprocessing import numpy as np import pandas as pd from mpl_toolkits.mplot3d import Axes3D import matplotlib.pyplot as plt #データの読み込み df = pd.read_csv("wine.csv") x_table = df.drop(columns="Wine") x = x_table.values name = x_table.columns #データの前処理 x = preprocessing.minmax_scale(x) #モデルの定義 pca = PCA(n_components=len(x[0])) #学習 pca.fit(x) #寄与率 con = pca.explained_variance_ratio_ index = [] for i in range(len(con)): index.append("第" + str(i + 1) + "主成分") dfc = pd.DataFrame(con) dfc.columns = ["寄与率"] dfc.index = index print(dfc) #固有ベクトル
plt.show() all_images_rgb = ld_images("img") all_images_hog = [] for i in range(len(all_images_rgb)): all_images_rgb[i] = cv2.cvtColor(all_images_rgb[i], cv2.COLOR_BGR2RGB) fd, hog_image = hog(all_images_rgb[i], orientations = 32, pixels_per_cell = (16, 16), cells_per_block = (1,1), visualize = True, multichannel = True) all_images_hog.append(fd) train_anno = sio.loadmat('train-anno.mat') face_landmark = train_anno['face_landmark'] trait_annotation = train_anno['trait_annotation'] total_features = np.c_[all_images_hog, face_landmark] total_features = minmax_scale(total_features, axis = 0) thresholds = np.mean(trait_annotation, axis = 0) trait_labels = np.array([[1 if x >= 0 else -1 for x in trait_annotation[:,i]] for i in range(trait_annotation.shape[1])]) trait_labels = trait_labels.T division = int(0.8 * trait_labels.shape[0]) train_data = total_features[:division,] train_reg = trait_annotation[:division,] train_labels = trait_labels[:division,] test_data = total_features[division:,] test_reg = trait_annotation[division:,] test_labels = trait_labels[division:,] c_range = 2**np.linspace(-5,13,10) p_range = 2**np.linspace(-9,1,6)
def normalizeScale(image, low, high):
    """Rescale every value of ``image`` into the range ``[low, high]``.

    The data is flattened before scaling, so the minimum and maximum are
    taken over the whole image rather than per column; the result is then
    restored to the input's shape.

    :param image: array or array-like (e.g. nested lists) of numeric values
    :param low: lower bound of the output range
    :param high: upper bound of the output range
    :return: array with the same shape as ``image``
    """
    # Coerce once so plain sequences work too (they have no .ravel()).
    pixels = np.asarray(image)
    flat = minmax_scale(pixels.ravel(), feature_range=(low, high))
    return flat.reshape(pixels.shape)
def run_palantir(
    ms_data,
    early_cell,
    terminal_states=None,
    knn=30,
    num_waypoints=1200,
    n_jobs=-1,
    scale_components=True,
    use_early_cell_as_start=False,
    max_iterations: int = 25,
):
    """Run the Palantir pipeline: waypoints, pseudotime, branch probabilities.

    :param ms_data: Multiscale space diffusion components (a DataFrame; its
        index and columns are reused for the scaled copy)
    :param early_cell: Start cell for pseudotime construction
    :param terminal_states: List/Series of user defined terminal states;
        when given they are added to the waypoints
    :param knn: Number of nearest neighbors for graph construction
    :param num_waypoints: Number of waypoints to sample when an int;
        any other value is used directly as the initial set of waypoints
    :param n_jobs: Number of jobs for parallel processing
    :param scale_components: If true, ``ms_data`` is min-max scaled before
        use; otherwise a copy of ``ms_data`` is used as is
    :param use_early_cell_as_start: If true, ``early_cell`` itself is the
        start cell instead of the boundary cell closest to it
    :param max_iterations: Maximum number of iterations for pseudotime
        convergence
    :return: PResults object with pseudotime, entropy, branch probabilities
        and waypoints
    """
    if scale_components:
        data = pd.DataFrame(
            preprocessing.minmax_scale(ms_data),
            index=ms_data.index,
            columns=ms_data.columns,
        )
    else:
        data = copy.copy(ms_data)

    # ################################################
    # Determine the boundary cell closest to user defined early cell.
    # Boundary cells are the cells holding the max or min of any component.
    dm_boundaries = pd.Index(set(data.idxmax()).union(data.idxmin()))
    dists = pairwise_distances(data.loc[dm_boundaries, :],
                               data.loc[early_cell, :].values.reshape(1, -1))
    start_cell = pd.Series(np.ravel(dists), index=dm_boundaries).idxmin()
    if use_early_cell_as_start:
        start_cell = early_cell

    # Sample waypoints
    print("Sampling and flocking waypoints...")
    start = time.time()

    # Sample waypoints unless the caller supplied them explicitly
    if isinstance(num_waypoints, int):
        waypoints = _max_min_sampling(data, num_waypoints)
    else:
        waypoints = num_waypoints
    waypoints = waypoints.union(dm_boundaries)
    if terminal_states is not None:
        waypoints = waypoints.union(terminal_states)
    # Drop the start cell here so it appears exactly once, at the front
    waypoints = pd.Index(waypoints.difference([start_cell]).unique())

    # Prepend start cell
    waypoints = pd.Index([start_cell]).append(waypoints)
    end = time.time()
    print("Time for determining waypoints: {} minutes".format(
        (end - start) / 60))

    # pseudotime and weighting matrix
    print("Determining pseudotime...")
    pseudotime, W = _compute_pseudotime(data, start_cell, knn, waypoints,
                                        n_jobs, max_iterations)

    # Entropy and branch probabilities
    print("Entropy and branch probabilities...")
    ent, branch_probs = _differentiation_entropy(data.loc[waypoints, :],
                                                 terminal_states, knn, n_jobs,
                                                 pseudotime)

    # Project results to all cells
    print("Project results to all cells...")
    branch_probs = pd.DataFrame(
        np.dot(W.T, branch_probs.loc[W.index, :]),
        index=W.columns,
        columns=branch_probs.columns,
    )
    # Entropy is recomputed row-wise from the projected probabilities; the
    # value returned by _differentiation_entropy above is replaced.
    ent = branch_probs.apply(entropy, axis=1)

    # Update results into PResults class object
    res = PResults(pseudotime, ent, branch_probs, waypoints)
    return res
#threshold values for 2 classes [-0.0317 -0.0132] #print (thresholds.values()) for av in all_av: un_nm_scores = [] for tc in labels[:len(trained_classes)]: clf = isolation_forests[tc] un_nm_scores.append(clf.decision_function(av.reshape(1,-1))[0]) all_unnormalized_scores.append(un_nm_scores) #add thresholds all_unnormalized_scores.append(thresholds.values()) from sklearn.preprocessing import minmax_scale f = minmax_scale(np.array(all_unnormalized_scores)) scores = f[:-1] thres = f[-1] iso_pred = [] print("scores", scores[:10]) for v in scores: # temp = [] thres_max = {} for i, s in enumerate(v): if s > thres[i]: thres_max[i] = s - thres[i] if len(thres_max) == 0: iso_pred.append(120) else: iso_pred.append(max(thres_max, key=thres_max.get))
# https://www.epfl.ch/labs/mmspg/research/page-58317-en-html/page-58332-en-html/page-58333-en-html/iqa/ import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.preprocessing import minmax_scale pd.set_option('display.max_rows', 1000) pd.set_option('display.max_columns', 1000) pd.set_option('display.width', 1000) df = pd.read_csv('JPEGXR.VQMT.csv') df['mos'] = minmax_scale(df['mos'], feature_range=(1,5), axis=0) minMOS = df['mos'].min() maxMOS = df['mos'].max() series = np.linspace(maxMOS, minMOS, 20) results = pd.DataFrame() for i in series: filtered = df[['mos', 'DSSIM', 'SSIMULACRA', 'Butteraugli', 'Butteraugli_XL', 'Butteraugli_XL_3m', 'Butteraugli_XL_2s', 'Butteraugli_XL_3s', 'Butteraugli_XL_6s', 'Butteraugli_XL_12s']][df['mos'] >= i] results = results.append(filtered.corr('spearman')[['mos']].T.reset_index(), ignore_index=True, sort=False) results.index = series results = results.drop(['mos', 'index'], axis=1) results = results.dropna(thresh=1) print(results) plt.figure(figsize=(1920/96, 1080/96), dpi=96) #plt.plot(results.index, results['DSSIM'], label='DSSIM')
from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split from sklearn.preprocessing import minmax_scale # 정규화 from sklearn.metrics import mean_squared_error import tensorflow as tf # 1. data load boston = load_boston() print(boston) # "data", "target" # 2. 변수 선택 X = boston.data # 정규화 y = boston.target X.shape # (506, 13) y_nor = minmax_scale(y) # train/test split(70 vs 30) x_train, x_test, y_train, y_test = train_test_split( X, y_nor, test_size=0.1, random_state=123) tf.random.set_seed(123) # 2. Model 클래스 : model = input * w + b class Model(tf.keras.Model): # keras Model class 상속 def __init__(self): # 생성자 super(Model, self).__init__() self.w = tf.Variable(tf.random.uniform(shape=[13, 1])) #type(self.w) # dtype=float32 self.b = tf.Variable(tf.zeros(shape=[1]))
'yeast_net_genes.csv'), header=None) df = pd.read_csv(os.path.join(os.path.expandvars('$AGAPEDATA'), 'pombeallpairs..genexp.txt'), sep='\t') df = df.dropna() cols = ['Gene1', 'Gene2'] for c in cols: df[c] = df[c].apply(lambda x: f'4896.{x}.1') string_proteins = string_indices[0].values df = df[(df.Gene1.isin(string_proteins)) & (df.Gene2.isin(string_proteins))] mapping = dict(zip(string_indices[0], string_indices[1])) for c in cols: df[c] = df[c].apply(lambda x: mapping[x]) df = df.sort_values(by=cols) df.Expression_correlation = minmax_scale(df.Expression_correlation) df.to_csv(os.path.join(output_path, f"yeast_z_gene_expression_meta-analysis_adjacency.txt"), sep="\t", index=False, header=False)
driver = ph.DriverEstim()(eda) # driver.plot() # plt.show() phasic, tonic, _ = ph.PhasicEstim(delta=0.02)(driver) # phasic.plot() # tonic.plot() # plt.show() #print(eda.shape) M = 20 splitArr = np.array_split(eda, eda.shape[0] / M) res = np.array([item.mean() for item in splitArr]) res = minmax_scale(res) #print(res) #print(res.shape[0]) #aM, l5, numAM, numL5 = getMoments(res) segs = np.array_split(res, segements) std = np.std(res) #print(segs) file = open(base + ".txt", "w+") plt.plot(res) plt.plot(segs[0], 'r') plt.show()
from pylab import * import pandas as pd from sklearn.preprocessing import minmax_scale from sklearn.ensemble import RandomForestClassifier train = pd.read_csv('train.csv', index_col='Id') X = minmax_scale(train.ix[:, :-1]) y = array(train.ix[:, -1]) test = pd.read_csv('test.csv', index_col='Id') A = minmax_scale(test) f = RandomForestClassifier(n_estimators=1000, n_jobs=12) f.fit(X, y) p = f.predict(A) s = pd.read_csv('sampleSubmission.csv', index_col='Id') s.Cover_Type = p s.to_csv('sampleSubmission.csv') # score=0.74480 using RandomForestClassifier; 0.58160 2yrs ago # scaling to [0,1] made some difference # NN from sklearn.neural_network import MLPClassifier f = MLPClassifier(hidden_layer_sizes=(80, 20), alpha=1e-5, random_state=1) f.fit(X, y) p = f.predict(A) s = pd.read_csv('sampleSubmission.csv', index_col='Id') s.Cover_Type = p
# In[127]: N_train = 1331 training_inputs = data[0:N_train, :, :].astype('float32') val_inputs = data[(N_train + 1):-1, :, :].astype('float32') training_inputs = training_inputs.reshape( (len(training_inputs), np.prod(training_inputs.shape[1:]))) val_inputs = val_inputs.reshape( (len(val_inputs), np.prod(val_inputs.shape[1:]))) # In[128]: from sklearn.preprocessing import minmax_scale training_inputs = minmax_scale(training_inputs, feature_range=(0, 1), axis=1) val_inputs = minmax_scale(val_inputs, feature_range=(0, 1), axis=1) training_targets = targets[0:N_train, :] val_targets = targets[(N_train + 1):-1, :] t_targets = target_arr[0:N_train, :] v_targets = target_arr[(N_train + 1):-1, :] # In[129]: percent_noisy = 0.5 indices_tozero = np.random.choice( range(training_inputs.shape[0] * training_inputs.shape[1]), int(percent_noisy * training_inputs.shape[0] * training_inputs.shape[1]), replace=False) training_inputs_noisy = training_inputs.copy() np.put(training_inputs_noisy, indices_tozero, 0)
def main(time_points):
    """Build balanced, scaled train/test slice datasets and write them to CSV.

    Reads a ';'-separated matrix whose columns are time series (meters) and
    whose first row holds the class label of each column. The columns are
    split 80/20 into train and test, every series is cut into
    non-overlapping slices of ``time_points`` values, unusable slices are
    dropped, the number of slices per meter is capped, classes are balanced,
    values are min-max scaled and the results plus one-hot labels are saved.

    In the intermediate and final matrices each column is one slice:
    row 0 is the meter index, row 1 the class label, rows 2.. the values.

    :param time_points: number of consecutive values in each slice
    :return: always 0
    """
    t0 = time.time()
    #np.random.seed(seed=2018)
    #random.seed(2018)
    print("Number of time_Points is " + str(time_points))
    ##########################################################################
    ## These hyperparameters bound how many slices are extracted from each
    ## meter (min and max number)
    ##########################################################################
    number_of_training_slices = 200
    min_number_of_training_slices = 1
    number_of_test_slices = 200
    min_number_of_test_slices = 200
    zeros_slice_percentage = 0.2  # Fraction of zeros that is allowed for each slice
    ##########################################################################
    if number_of_test_slices == min_number_of_test_slices:
        # True when min = max for the number of test slices; the test meters
        # are then balanced per class further down
        testing_voting = True
    else:
        testing_voting = False
    # Change to output path of "Excelmerge.py"
    # NOTE(review): looks like a placeholder path - TODO confirm before running
    path = "~\Concatenated_File_total.csv"
    df = pd.read_csv(path, sep=';', header=None)
    data = df.values
    shape = np.shape(data)
    if (False):
        # Disabled diagnostic: count and plot missing values per meter
        print("Counting NanS")
        nan_percentage = []
        color_array = []
        for i in range(np.shape(data)[1]):
            nan_percentage.append(count_nans(data[:, i]))
            color_array.append(1 / data[0, i])
            print(" Counting: " + str(i))
        x = np.arange(0, i + 1)
        plt.scatter(x, nan_percentage, c=color_array)
        plt.xlabel(" Meter ")
        plt.ylabel(" Missing Value Quotient ")
        plt.show()
    # Shuffling the data
    data = shuffle_data(data)
    print("Number of Time Series is:" + str(np.shape(data)[1]))
    print(np.any(np.isnan(data)))
    max_length = np.shape(data)[0]
    number_of_classes = len(np.unique(data[0, :]))
    if number_of_classes == 5:
        class_names = [" 1 ", " 2 ", "3", "4", " 5 "]
    else:
        class_names = [" 1", " 2 ", "3", " 4 "]
    training_percentage = 0.8
    # Hard thresholding for removing outliers: everything that is not below
    # the threshold is replaced by -1. NaN also fails the comparison, so NaN
    # entries become -1 as well.
    threshhold = 1e16
    indices = data[:, :] < threshhold
    data[indices == False] = -1
    # For writing the test time series to file (not used further in this function)
    data_test = data[:, int(training_percentage * shape[1]): shape[1]]
    # Training Data
    y = data[0, 0:int(training_percentage * shape[1])]
    X = data[1:shape[0], 0:int(training_percentage * shape[1])]
    # Test Data
    X_test = data[1:max_length, int(training_percentage * shape[1]):shape[1]]
    y_test = data[0, int(training_percentage * shape[1]):shape[1]]
    ##########################################################################
    # Next we create the chunks of size time_points, which will be stored as
    # columns in X_resampled with the corresponding label in y_resampled
    # (TRAINING SET)
    ##########################################################################
    number_of_chunks_per_customer = []
    X_resampled = []  # Stores the data
    y_resampled = []  # Stores the label
    meter_resampled = []  # Stores the meter value (ranging from 0-shape[1])
    left_out_slice = []  # not used further in this function
    booleans = X[:, :] >= 0  # False for the -1 markers set above
    booleans2 = X[:, :] == 0
    boolvec = []  # not used further in this function
    for j in range(np.shape(X)[1]):  # Looping over the samples
        label = y[j]
        count = 0
        for i in range(
                int(np.shape(X)[0] / time_points)
        ):  # Looping over the number of chunks assuming no overlap
            # Keep a slice as is when it has no negative value, not too many
            # zeros, and is not (nearly) constant
            if np.all(booleans[i*time_points:i*time_points+time_points,j]) == True and \
               np.sum(booleans2[i*time_points:i*time_points+time_points,j]) <= time_points*zeros_slice_percentage and\
               np.var(X[i*time_points:i*time_points+time_points,j]) > 1e-3:
                var = np.var(
                    X[i * time_points:i * time_points + time_points, j])
                # Noise augmentation is currently disabled (see trailing comment)
                X_resampled.append(
                    X[i * time_points:i * time_points + time_points, j]
                )  #+ np.random.normal(0,np.sqrt(var),np.shape(X[i*time_points:i*time_points+time_points,j])))
                y_resampled.append(label)
                meter_resampled.append(j)
                count = count + 1
            elif np.sum(booleans[i*time_points:i*time_points+time_points,j]) > 0.8*time_points and \
                 np.sum(booleans2[i*time_points:i*time_points+time_points,j]) <= time_points*zeros_slice_percentage and \
                 np.var(X[i*time_points:i*time_points+time_points,j]) > 1e-3:
                # More than 80% of the slice is non-negative: fill the
                # remaining entries by interpolation.
                # NOTE(review): interp is defined elsewhere; presumably it
                # replaces the invalid entries - confirm.
                out = interp(X[i * time_points:i * time_points + time_points,
                               j])
                X_resampled.append(out)
                y_resampled.append(label)
                meter_resampled.append(j)
                count = count + 1
        number_of_chunks_per_customer.append(count)
    ############## Creates dataset of slices for test dataset ###############
    # Unlike the training set, test slices are never interpolated
    number_of_chunks_per_customer_test = []
    X_resampled_test = []
    y_resampled_test = []
    meter_resampled_test = []
    booleans_test = X_test[:, :] >= 0
    booleans2_test = X_test[:, :] == 0
    for j in range(np.shape(X_test)[1]):  # Looping over the samples
        label_test = y_test[j]
        count = 0
        for i in range(
                int(np.shape(X_test)[0] / time_points)
        ):  # Looping over the number of chunks assuming no overlap
            if np.all(booleans_test[
                    i * time_points:i * time_points +
                    time_points, j]) == True and np.sum(booleans2_test[
                        i * time_points:i * time_points + time_points,
                        j]) <= time_points * zeros_slice_percentage:
                if np.var(
                        X_test[i * time_points:i * time_points + time_points,
                               j]
                ) > 1e-3:  # Avoid having all the same inputs, want to capture some patterns
                    X_resampled_test.append(
                        X_test[i * time_points:i * time_points + time_points,
                               j])
                    count = count + 1
                    y_resampled_test.append(label_test)
                    meter_resampled_test.append(j)
        number_of_chunks_per_customer_test.append(count)
    # Next we build the matrix of slices (one column per slice)
    X_resampled = np.stack(X_resampled, axis=-1)  # Stacked data
    X_resampled_test = np.stack(X_resampled_test, axis=-1)  # Stacked data
    # Putting together in one big matrix so it is easier to shuffle:
    # row 0 = meter index, row 1 = label, remaining rows = slice values
    X_big = np.concatenate([[y_resampled], X_resampled], axis=0)
    X_big = np.concatenate([[meter_resampled], X_big], axis=0)
    X_big_test = np.concatenate([[y_resampled_test], X_resampled_test], axis=0)
    X_big_test = np.concatenate([[meter_resampled_test], X_big_test], axis=0)
    # Shuffle X_big
    X_big = shuffle_data(X_big)
    meter_range = int(np.max(np.unique(X_big[0, :])))
    meter_range_test = int(np.max(np.unique(X_big_test[0, :])))
    # Take at most the max number of slices from each meter
    X_resampled = [[] for _ in range(meter_range + 1)]
    X_resampled_test = [[] for _ in range(meter_range_test + 1)]
    count_meter = np.zeros(meter_range + 1)  # Slices kept per training meter
    count_meter_test = np.zeros(meter_range_test + 1)
    count_meter_class = [
        np.zeros(meter_range_test + 1) for _ in range(number_of_classes)
    ]
    for i in range(np.shape(X_big)[1]):  # Go through every sample
        index = int(X_big[0, i])  # Converts float to integer for indexing
        if count_meter[
                index] < number_of_training_slices:  # no more than the max number of slices per meter
            X_resampled[index].append(X_big[:, i])
            count_meter[index] = count_meter[
                index] + 1  # Keeps track of number of samples extracted from each meter
    print(len(X_resampled))
    for i in range(np.shape(X_big_test)[1]):
        index = int(X_big_test[0, i])
        classtmp = int(X_big_test[1, i]) - 1  # labels are 1-based
        if count_meter_test[index] < number_of_test_slices:
            X_resampled_test[index].append(X_big_test[:, i])
            count_meter_test[index] = count_meter_test[index] + 1
            count_meter_class[classtmp][
                index] = count_meter_class[classtmp][index] + 1
    del (X_big)
    del (X_big_test)
    # Histogram, per class, of the number of slices over the meters that
    # reached the minimum; gives a "quality measure" for the meters in each class
    gurk1 = np.asarray(np.where(count_meter < min_number_of_training_slices))
    gurk2 = np.asarray(np.where(count_meter_test < min_number_of_test_slices))
    for k in range(number_of_classes):
        rem = np.where(count_meter_class[k] >= min_number_of_test_slices)
        indgurk2 = count_meter_test[rem]
        plt.hist(indgurk2)
        plt.title("Class: " + class_names[k])
        plt.xlabel("Number of Slices")
        plt.ylabel("Number of Meters")
        #plt.show()
    #########################################################################
    # Delete the lists i.e meters containing less than min number of slices #
    #########################################################################
    # Deleting from the highest index down keeps the remaining indices valid
    for i in sorted(np.squeeze(gurk1).tolist(), reverse=True):
        del (X_resampled[i])
    for i in sorted(np.squeeze(gurk2).tolist(), reverse=True):
        del (X_resampled_test[i])
    X_big = np.stack(list(itertools.chain.from_iterable(X_resampled)), axis=-1)
    if (testing_voting == False):
        X_big_test = np.stack(list(
            itertools.chain.from_iterable(X_resampled_test)),
                              axis=-1)
    ##### Need to sort X_big_test by classes in the case where min = max:
    if (testing_voting == True):
        labels = []
        # Assuming now that each meter has the same number of slices
        for k in range(len(X_resampled_test)):
            labels.append(
                X_resampled_test[k][0][1]
            )  #[k]: meter [0] take first slice [1] class of slices, same for all slices in meter k
        # labels contains the label of every meter in the test set.
        a, return_index, return_counts = np.unique(labels,
                                                   return_index=True,
                                                   return_counts=True)
        # NOTE(review): the original comment says the smallest class size is
        # wanted, but np.max is used - confirm whether np.min was intended.
        min_number_of_samples_test = np.max(return_counts)
        print(
            "The number of samples in each class in test set is distributed as follows"
        )
        print(return_counts)
        # Go through X_resampled_test again and remove meters of a class once
        # that class has reached the limit
        count_classes = np.zeros(number_of_classes)
        for k in sorted(np.arange(0, len(X_resampled_test)), reverse=True):
            class_index = int(X_resampled_test[k][0][1]) - 1
            if count_classes[class_index] >= min_number_of_samples_test:
                del (X_resampled_test[k])
            count_classes[class_index] = count_classes[class_index] + 1
        data_shuffeled_test = np.stack(list(
            itertools.chain.from_iterable(X_resampled_test)),
                                       axis=-1)
    # Sorting the data matrices by label in order to take the same number of
    # samples from each class
    X_big = X_big[:, np.argsort(X_big[1, :])]
    print(X_big)
    a, return_index, return_counts = np.unique(X_big[1, :],
                                               return_index=True,
                                               return_counts=True)
    print("There are a total of " + str(np.shape(X_big)[1]) +
          " samples distributed as follows.")
    print(return_counts)
    print(return_index)
    print()
    print()
    if (testing_voting == False):
        X_big_test = X_big_test[:, np.argsort(X_big_test[1, :])]
        a_test, return_index_test, return_counts_test = np.unique(
            X_big_test[1, :], return_index=True, return_counts=True)
        print(
            "The number of samples in each class in test set is distributed as follows"
        )
        print(return_counts_test)
        print(return_index_test)
        print()
        min_number_of_samples_test = np.min(return_counts_test)
        data_shuffeled_test = X_big_test
    # Building final dataset of min_number_of_samples per class, randomly
    # sampled and shuffled
    min_number_of_samples = np.min(
        return_counts)  # The number of samples from the smallest class
    indexvector = np.arange(0, np.shape(X_big)[1])  # Indices of all samples
    data_shuffeled = np.ones([
        time_points + 2, number_of_classes * min_number_of_samples
    ])  # Preallocate memory for final data array.
    for i in range(number_of_classes):
        # Picks min_number_of_samples random indices from each class (a
        # contiguous run, since X_big is sorted by label) and puts them in
        # the final data array.
        indexchoice = np.random.choice(indexvector[return_index[i]:return_index[i] + return_counts[i]],\
                                       min_number_of_samples,replace=False)
        data_shuffeled[:,i*min_number_of_samples:(i+1)*min_number_of_samples] = \
            X_big[:,indexchoice]
    print(np.any(np.isnan(data_shuffeled)))
    print(np.any(np.isnan(data_shuffeled_test)))
    # For checking that this works
    a2, return_index2, return_counts2 = np.unique(data_shuffeled[1, :],
                                                  return_index=True,
                                                  return_counts=True)
    a2_test, return_index2_test, return_counts2_test = np.unique(
        data_shuffeled_test[1, :], return_index=True, return_counts=True)
    print("The number of samples in each class is " +
          str(min_number_of_samples) + ".")
    print(return_counts2)
    print(return_index2)
    print()
    print()
    # NOTE(review): str() is called without an argument, so no number is
    # printed here - presumably min_number_of_samples_test was meant.
    print("The number of samples in each class for Test set is " + str() + ".")
    print(return_counts2_test)
    print(return_index2_test)
    print()
    print()
    # Normalization of the value rows only (rows 0 and 1 hold meter/label).
    # axis=0 is passed, which per scikit-learn scales each column, i.e. each
    # slice on its own.
    data_shuffeled[2:np.shape(data_shuffeled)[0],:] = \
        minmax_scale(data_shuffeled[2:np.shape(data_shuffeled)[0],:],feature_range = (0,1),axis = 0,copy=False)
    data_shuffeled_test[2:np.shape(data_shuffeled_test)[0],:] = \
        minmax_scale(data_shuffeled_test[2:np.shape(data_shuffeled_test)[0],:],feature_range=(0,1),axis=0,copy=False)
    print("Check for NaN's after feature scaling")
    print(np.any(np.isnan(data_shuffeled)))
    print(np.any(np.isnan(data_shuffeled_test)))
    # Final shuffle of the training and test data
    data_shuffeled = shuffle_data(data_shuffeled)
    data_shuffeled_test = shuffle_data(data_shuffeled_test)
    # One-hot encode the labels: row p is set for samples whose label is p + 1
    y_one_hot = np.zeros([number_of_classes, np.shape(data_shuffeled)[1]])
    for k in range(np.shape(data_shuffeled)[1]):
        for p in range(number_of_classes):
            if p + 1 == data_shuffeled[1, k]:
                y_one_hot[p, k] = 1
    y_one_hot_test = np.zeros(
        [number_of_classes,
         np.shape(data_shuffeled_test)[1]])
    for k in range(np.shape(data_shuffeled_test)[1]):
        for p in range(number_of_classes):
            if p + 1 == data_shuffeled_test[1, k]:
                y_one_hot_test[p, k] = 1
    # Write processed data into files.
    filename = "~\Final_Data" + str(time_points)
    if os.path.exists(
            filename + ".csv"
    ):  # Remove an existing file first so the new one replaces it.
        print("Removing old " + filename + ".csv" + " before writing new.")
        os.remove(filename + ".csv")
    np.savetxt(filename + ".csv", data_shuffeled,
               delimiter=';')  # Writes data to file
    print("Final_Data Length is " + str(np.shape(data_shuffeled)[1]))
    filename_y_one_hot = "~\Y_One_Hot" + str(time_points)
    if os.path.exists(
            filename_y_one_hot + ".csv"
    ):  # Remove an existing file first so the new one replaces it.
        print("Removing old " + filename_y_one_hot + ".csv" +
              " before writing new.")
        os.remove(filename_y_one_hot + ".csv")
    np.savetxt(filename_y_one_hot + ".csv", y_one_hot,
               delimiter=';')  # Writes data to file
    print("Final Test Data Length is " + str(np.shape(data_shuffeled_test)[1]))
    filename_test = "~\Test_Data" + str(time_points)
    if os.path.exists(filename_test + ".csv"):
        print("Removing old " + filename_test + ".csv" +
              " before writing new.")
        os.remove(filename_test + ".csv")
    np.savetxt(filename_test + ".csv", data_shuffeled_test, delimiter=';')
    filename_y_one_hot_test = "~\Y_One_Hot_Test" + str(time_points)
    if os.path.exists(
            filename_y_one_hot_test + ".csv"
    ):  # Remove an existing file first so the new one replaces it.
        print("Removing old " + filename_y_one_hot_test + ".csv" +
              " before writing new.")
        os.remove(filename_y_one_hot_test + ".csv")
    np.savetxt(filename_y_one_hot_test + ".csv", y_one_hot_test,
               delimiter=';')  # Writes data to file
    t1 = time.time()
    print("Code ran in:" + str(np.round((t1 - t0) / 60, decimals=3)) +
          " minutes.")
    return 0
a,sr=librosa.load(path) a=a[:100000]#サイズ合わせ y=librosa.feature.mfcc(y=a,sr=sr) data.append(y) label.append(0) for i in range(10): path="フォルダ名"+str(i+1)+"拡張子名" a,sr=librosa.load(path) a=a[:100000]#サイズ合わせ y=librosa.feature.mfcc(y=a,sr=sr) data.append(y) label.append(1) for i in range(len(data)): data[i]=sum(data[i])/len(data[i]) data = preprocessing.minmax_scale(data) #データの分割 x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.3) #学習 clf=clf=MLPClassifier(hidden_layer_sizes=(500,500,250)) clf.fit(x_train, y_train) #予測 y_pred=clf.predict(x_test) #精度の検証 print("Accuracy") print(accuracy_score(y_test,y_pred)) print("Precision")
def normalize(dataset):
    """Return ``dataset`` min-max scaled into the range [0, 1]."""
    target_range = (0, 1)
    return preprocessing.minmax_scale(dataset, feature_range=target_range)
# Encode the result encoder = LabelEncoder() source['Result'] = encoder.fit_transform(source['Result']) print(source['Result'].unique()) # Shows that all the values are encoded # Convert training data to numeric source['Percentage'] = source['Percentage'].apply( pd.to_numeric, errors='coerce') #coerce would change the non numeric to NaN source['Percentage'].fillna(round(source['Percentage'].mean(), 2), inplace=True) # Change NaN to mean value # Feature Scaling #max = source['Percentage'].max() source['GRE'] = preprocessing.minmax_scale(source['GRE'], feature_range=(0, 1)) source['GRE (Quants)'] = preprocessing.minmax_scale(source['GRE (Quants)'], feature_range=(0, 1)) source['AWA'] = preprocessing.minmax_scale(source['AWA'], feature_range=(0, 1)) source['TOEFL'] = preprocessing.minmax_scale(source['TOEFL'], feature_range=(0, 1)) source['Work-Ex'] = preprocessing.minmax_scale(source['Work-Ex'], feature_range=(0, 1)) source['International Papers'] = preprocessing.minmax_scale( source['International Papers'], feature_range=(0, 1)) source['Percentage'] = preprocessing.minmax_scale(source['Percentage'], feature_range=(0, 1)) # Test Train Split features = [ 'GRE', 'GRE (Quants)', 'AWA', 'TOEFL', 'Work-Ex', 'International Papers',
key = input() if mode == 0: # 録音開始 print('===== {0} START ==============='.format(cnt)) record.record_start.set() record.record_end.clear() mode = 1 else: # 録音終了 print('===== END ===============') record.record_start.clear() while not record.record_end.is_set(): pass mode = 0 cnt += 1 x1 = [] x2 = [] wav, fs = librosa.load('tmp/voice.wav', sr=8000) context_feature = librosa.feature.mfcc(wav, sr=fs, hop_length=10**6, htk=True).T[0] mfcc = librosa.feature.mfcc(wav, sr=fs, n_mfcc=32).T for frame in mfcc: x1.append(preprocessing.minmax_scale(frame)) x2.append(context_feature) pred = [classes[np.argmax(p)] for p in model.predict([x1, x2])] print(pred)
def scale_min_max(a: np.ndarray) -> np.ndarray:
    """Return ``a`` min-max scaled with ``minmax_scale``'s default settings.

    The annotation uses ``np.ndarray`` because ``np.array`` is a factory
    function, not a type.
    """
    return minmax_scale(a)
############################## kidtx = pd.read_csv( 'GBMLGG_new_time_methylation_mRNA10_all_lmqcm_gamma=none_minClusterSize=10_202012024_2.csv' ) dataX1 = kidtx.drop(["Unnamed: 0", "ID", "vital_status", "days"], axis=1).values #y = np.transpose(np.round(np.array(kidtx["days"]/30),2), kidtx["vital_status"]) # V1=time; erged_data33=status [m0, n0] = dataX1.shape dataX = np.asarray(dataX1) #dataX =minmax_scale(dataX ) data_mRNA = dataX1[:, 0:36] data_methylation = dataX1[:, 36:n0] #dataX=data_mRNA #dataX=data_methylation dataX = minmax_scale(dataX) [m, n] = dataX.shape [m1, n1] = data_methylation.shape [m2, n2] = data_mRNA.shape dataX = dataX.reshape(m, 1, n) x = dataX data_methylation = data_methylation.reshape(m1, 1, n1) data_mRNA = data_mRNA.reshape(m2, 1, n2) ytime = np.round(np.array(kidtx["days"] / 30), 1) #np.transpose(np.array(kidtx["days"]))/30 # only V1=time; ystatus = np.transpose(np.array( kidtx["vital_status"])) #only erged_data33=status y = np.transpose([ytime, ystatus])
('Data after standard scaling', StandardScaler().fit_transform(X)), ('Data after min-max scaling', MinMaxScaler().fit_transform(X)), ('Data after max-abs scaling', MaxAbsScaler().fit_transform(X)), ('Data after robust scaling', RobustScaler(quantile_range=(25, 75)).fit_transform(X)), ('Data after power transformation (Box-Cox)', PowerTransformer(method='box-cox').fit_transform(X)), ('Data after quantile transformation (gaussian pdf)', QuantileTransformer(output_distribution='normal').fit_transform(X)), ('Data after quantile transformation (uniform pdf)', QuantileTransformer(output_distribution='uniform').fit_transform(X)), ('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X)), ] # scale the output between 0 and 1 for the colorbar y = minmax_scale(y_full) def create_axes(title, figsize=(16, 6)): fig = plt.figure(figsize=figsize) fig.suptitle(title) # define the axis for the first plot left, width = 0.1, 0.22 bottom, height = 0.1, 0.7 bottom_h = height + 0.15 left_h = left + width + 0.02 rect_scatter = [left, bottom, width, height] rect_histx = [left, bottom_h, width, 0.1] rect_histy = [left_h, bottom, 0.05, height]