def quantile_norm(dfs: "list[pd.DataFrame] | pd.DataFrame"):
    """Quantile-normalize expression data to a normal distribution.

    Every DataFrame is passed through
    ``sklearn.preprocessing.quantile_transform`` with ``axis=0`` and
    ``output_distribution='normal'``; ``n_quantiles`` is left at the library
    default.

    The DataFrame(s) passed should contain samples from the same batch and be
    formatted with columns as samples and rows as genes.

    TODO: as of now the function assumes the gene order and number are the
    same across samples (when a list is given, the index of the first
    DataFrame is reused for every result). This should change.

    Args:
        dfs: a DataFrame or a list of DataFrames from the same batch.

    Returns:
        A transformed DataFrame when a single DataFrame is given, otherwise a
        list of transformed DataFrames in the same order as ``dfs`` (an empty
        list yields an empty list).
    """
    if isinstance(dfs, list):
        if not dfs:
            return []
        # All frames are assumed to share the row index of the first one.
        index = dfs[0].index.tolist()
        return [
            pd.DataFrame(
                # output_distribution belongs to quantile_transform, not to
                # the DataFrame constructor.
                quantile_transform(df, axis=0, output_distribution='normal'),
                columns=list(df.columns.values),
                index=index,
            )
            for df in dfs
        ]

    return pd.DataFrame(
        quantile_transform(dfs, axis=0, output_distribution='normal'),
        columns=dfs.columns.to_list(),
        index=dfs.index.to_list(),
    )
def _normalize(self, matrix, key): """ """ if self.verbose: print('normalizing for {0}...'.format(key)) if self.normalization['NB_FEATURES_TO_KEEP']: self.variance_reducer.nb_features = self.normalization[ 'NB_FEATURES_TO_KEEP'] matrix = self.variance_reducer.fit_transform(matrix) if self.normalization['CUSTOM']: custom_norm = self.normalization['CUSTOM']() assert (hasattr(custom_norm, 'fit') and hasattr(custom_norm, 'fit_transform')) matrix = custom_norm.fit_transform(matrix) if self.normalization['TRAIN_MIN_MAX']: matrix = MinMaxScaler().fit_transform(matrix.T).T if self.normalization['TRAIN_MAD_SCALE']: matrix = self.mad_scaler.fit_transform(matrix.T).T if self.normalization['TRAIN_ROBUST_SCALE'] or\ self.normalization['TRAIN_ROBUST_SCALE_TWO_WAY']: matrix = self.robust_scaler.fit_transform(matrix) if self.normalization['TRAIN_NORM_SCALE']: matrix = self.normalizer.fit_transform(matrix) if self.normalization['TRAIN_QUANTILE_TRANSFORM']: matrix = quantile_transform(matrix, **QUANTILE_OPTION) if self.normalization['TRAIN_RANK_NORM']: matrix = RankNorm().fit_transform(matrix) if self.normalization['TRAIN_CORR_REDUCTION']: args = self.normalization['TRAIN_CORR_REDUCTION'] if args == True: args = {} if self.verbose: print('dim reduction for {0}...'.format(key)) reducer = CorrelationReducer(**args) matrix = reducer.fit_transform(matrix) if self.normalization['TRAIN_CORR_RANK_NORM']: matrix = RankNorm().fit_transform(matrix) if self.normalization['TRAIN_CORR_QUANTILE_NORM']: matrix = quantile_transform(matrix, **QUANTILE_OPTION) if self.normalization['TRAIN_CORR_NORM_SCALE']: matrix = self.normalizer.fit_transform(matrix) return np.nan_to_num(matrix)
def get_features_targets(ok_feats):
    """Load London venue features and success targets, filtered and scaled.

    Reads the success measures plus the ward-level, category and network
    feature tables from ``../ProcessedData/london/``, joins the feature
    tables, keeps only the columns named in ``ok_feats`` and drops every row
    where any of those columns equals zero. Features and both targets are
    standardized and then quantile-transformed to a normal distribution.

    Args:
        ok_feats: iterable of feature column names to keep.

    Returns:
        Tuple ``(X, y, X_head, yy)``: the scaled feature matrix, the scaled
        ``checkinsCount`` target, the feature column names, and the scaled
        ``tipCount`` target.
    """
    ### success measures
    city = 'london'
    outfolder = '../ProcessedData/' + city + '/'
    successdata = outfolder + 'venues_info/' + city + '_venues_success_measures.csv'
    success = pd.read_csv(successdata, sep='\t', index_col=0)

    ### ward level features
    ward_stats_f = outfolder + 'venues_info/venues_ward_full.dat'
    ward_stats = pd.read_csv(ward_stats_f, sep='\t', index_col=0).drop(['ward'], axis=1)

    ### category features
    ward_cats_f = outfolder + 'venues_info/' + city + '_WARD_category_stats.csv'
    ward_cats = pd.read_csv(ward_cats_f, sep='\t', index_col=0).drop(['ward'], axis=1)

    ### network features (infinite values are treated as missing, then zeroed)
    network_meas_f = outfolder + '/networks/' + city + '_COMBINED_networkmeasures.csv'
    network_meas = (
        pd.read_csv(network_meas_f, sep=',', index_col=0)
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.0)
    )

    # merge and filter full feature set
    venue_features = network_meas.join(ward_cats).join(ward_stats)

    filtered_features = pd.DataFrame()
    for feat in ok_feats:
        filtered_features[feat] = venue_features[feat]

    # keep only venues where every selected feature is non-zero
    for feat in ok_feats:
        filtered_features = filtered_features[filtered_features[feat] != 0.0]

    # scale features and targets
    X_head = filtered_features.keys()
    X = StandardScaler().fit_transform(filtered_features.values)
    X = preprocessing.quantile_transform(X, output_distribution='normal')

    # restrict the targets to the venues that survived the filtering
    success_t = success.reset_index()
    success_t = success_t[success_t['index'].isin(list(filtered_features.index))]

    y = np.asarray(success_t['checkinsCount'])
    y = StandardScaler().fit_transform(y.reshape(-1, 1))
    y = preprocessing.quantile_transform(y, output_distribution='normal')

    # The second target must be derived from tipCount itself, not from y.
    yy = np.asarray(success_t['tipCount'])
    yy = StandardScaler().fit_transform(yy.reshape(-1, 1))
    yy = preprocessing.quantile_transform(yy, output_distribution='normal')

    return X, y, X_head, yy
def transform_data(count_data, transformation_method,
                   mean_center=False,
                   std_unit=False,
                   return_instance=False,
                   coef=None):
    """Transform count data, then scale it with a (possibly reused) scaler.

    Args:
        count_data: the data to transform.
        transformation_method: one of ``'log'``, ``'voom'``, ``'anscombe'``
            or ``'quantile'`` (case-insensitive). Anything else, including
            ``None``, prints an error message and leaves the data untouched.
        mean_center: forwarded as ``with_mean`` when a new scaler is built.
        std_unit: forwarded as ``with_std`` when a new scaler is built.
        return_instance: when true, also return the parameter object.
        coef: existing parameter object; a new ``NormalizationParameter`` is
            created when it is falsy. Its ``scaler`` is only fitted here if it
            is still ``None``.

    Returns:
        The scaled data, or ``(scaled data, coef)`` if ``return_instance``.
    """
    method = (transformation_method or '').lower()

    if method == 'log':
        transformed_data = np.log(count_data + 1)
    elif method == 'voom':
        transformed_data = np.log(count_data)
    elif method == 'anscombe':
        transformed_data = np.sqrt(count_data + 3. / 8.)
    elif method == 'quantile':
        transformed_data = quantile_transform(count_data,
                                              output_distribution='normal')
    else:
        print('ERROR: not an available transformation method')
        transformed_data = count_data

    # Create NormalizationParameter instance for further processing.
    if not coef:
        coef = NormalizationParameter()

    # Fit a scaler only when the parameter object does not carry one yet.
    if coef.scaler is None:
        scaler = StandardScaler(with_mean=mean_center, with_std=std_unit)
        coef.scaler = scaler
        coef.scaler.fit(transformed_data)

    scaled = coef.scaler.transform(transformed_data)
    if return_instance:
        return scaled, coef
    return scaled
def load_data(tag):
    """Load a table from the StackExpert database and prepare it for training.

    Args:
        tag: name of the table to read. NOTE(review): it is interpolated
            directly into the SQL text (table names cannot be bound as
            parameters), so it must come from a trusted source.

    Returns:
        Tuple ``(data, target, ratio)``: the quantile-transformed and
        max-abs-scaled feature DataFrame, the binary expert label Series
        (1 when ``EXPERT_SCORE`` exceeds 99), and the ratio of the smallest
        to the largest class count.
    """
    # Connect to the database.
    conn = sqlite3.connect('Data/StackExpert.sqlite')
    try:
        # Load all rows of the requested table.
        data = pd.read_sql_query(
            """ SELECT * FROM {:s} """.format(tag), conn)
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # Project-specific preprocessing.
    data = pre_proc(data)

    # Derive the user label from the expert-score criterion.
    target = column_or_1d(
        binarize(pd.DataFrame(data.pop('EXPERT_SCORE')),
                 threshold=99, copy=False)).astype(int)

    col = data.columns
    ind = data.index

    # Standardize / normalize the features.
    data = quantile_transform(data, copy=False, output_distribution='normal')
    data = pd.DataFrame(maxabs_scale(data), index=ind, columns=col)

    # Compute the expert / non-expert ratio.
    cnt = np.bincount(target)
    target = pd.Series(target, index=ind)

    return data, target, np.divide(np.min(cnt), np.max(cnt))
def transform(self, X):
    """Perform the quantile mapping.

    The input is optionally detrended, converted to quantiles with
    ``quantile_transform`` and mapped back through the inverse of the fitted
    ``self.x_cdf_fit_``; a removed trend is added back at the end.

    Parameters
    ----------
    X : array_like, shape [n_samples, n_features]
        Samples.

    Returns
    -------
    The quantile-mapped samples.
    """
    X = ensure_samples_features(X)

    # maybe detrend the datasets
    trend = None
    detrended = X
    if self.detrend:
        trend = LinearTrendTransformer(**self.lt_kwargs).fit(X)
        detrended = trend.transform(X)

    # do the final mapping; one quantile per sample unless configured
    options = self.qt_kwargs.copy()
    if "n_quantiles" not in options:
        options["n_quantiles"] = len(X)

    quantiles = quantile_transform(detrended, copy=True, **options)
    mapped = self.x_cdf_fit_.inverse_transform(quantiles)

    # add the trend back
    if trend is not None:
        mapped = trend.inverse_transform(mapped)

    return mapped
def generate_arr_heatmap(ax, arr, cmap=None, constrain=(-1, 1)):
    """Plot ``arr`` as a heatmap with quantile-scaled colours and value text.

    Colours come from a uniform quantile transform of the flattened array
    divided by 3, which damps the effect of outliers; the cell text comes
    from the raw values. The array is reshaped to ``len(betas)`` rows by
    ``len(caps)`` columns, i.e. 8 x 6.

    Args:
        ax: axes handed to ``plot_funcs.plot_heatmap``.
        arr: array of values to show.
        cmap: colour map forwarded to the plot function.
        constrain: forwarded to the plot function.
    """
    betas = [0, 1, 2, 4, 8, 16, 32, 64]
    caps = [0, 4, 8, 12, 16, 20]

    # remove outliers
    # sklearn preprocessing works only per axis dimension
    column = arr.copy().reshape((-1, 1))
    column = quantile_transform(column, n_quantiles=9,
                                output_distribution='uniform')
    column /= 3
    heatmap = column.reshape((len(betas), len(caps)))

    # remove text longer than four characters
    text = arr.astype('str')
    n_rows, n_cols = text.shape[0], text.shape[1]
    for i in range(n_rows):
        for j in range(n_cols):
            # beta == 0: only the first column carries a label
            blank = (i == 0 and j > 0)
            text[i][j] = '' if blank else format_text(arr[i][j])

    # plot
    column_labels = [f'$C={c}$' for c in caps]
    plot_funcs.plot_heatmap(ax,
                            heatmap=heatmap,
                            row_labels=None,
                            set_row_labels=False,
                            col_labels=column_labels,
                            set_col_labels=True,
                            text=text,
                            constrain=constrain,
                            cmap=cmap,
                            grid_fontsize='xx-small')
def clean_anat(in_file, save_dir):
    """Mask out everything but the main brain blob and quantile-normalize it.

    The brain is blurred and thresholded to find connected regions; voxels
    outside the largest labelled region are set to NaN, the remaining
    intensities are quantile-transformed, NaNs are replaced by zero and the
    result is written to ``<save_dir>/<name>_clean.nii``.

    Args:
        in_file: path of the brain to load via ``load_numpy_brain``.
        save_dir: directory the cleaned NIfTI file is written to.
    """
    ### Load brain ###
    brain = load_numpy_brain(in_file)

    ### Blur brain and mask small values ###
    brain_copy = brain.copy().astype('float32')
    # scipy.ndimage.filters is a deprecated namespace; call the function from
    # scipy.ndimage directly.
    brain_copy = scipy.ndimage.gaussian_filter(brain_copy, sigma=10)
    threshold = triangle(brain_copy)
    brain_copy[np.where(brain_copy < threshold / 2)] = 0

    ### Remove blobs outside contiguous brain ###
    labels, label_nb = scipy.ndimage.label(brain_copy)
    # Label 0 is background; pick the most frequent non-background label.
    brain_label = np.bincount(labels.flatten())[1:].argmax() + 1
    brain_copy = brain.copy().astype('float32')
    brain_copy[np.where(labels != brain_label)] = np.nan

    ### Perform quantile normalization ###
    brain_out = quantile_transform(brain_copy.flatten().reshape(-1, 1),
                                   n_quantiles=500,
                                   random_state=0,
                                   copy=True)
    brain_out = brain_out.reshape(brain.shape)
    # Masked voxels become zero again, in place.
    np.nan_to_num(brain_out, copy=False)

    ### Save brain ###
    fname = in_file.split('/')[-1].split('.')[0]
    save_file = os.path.join(save_dir, f'{fname}_clean.nii')
    aff = np.eye(4)
    img = nib.Nifti1Image(brain_out, aff)
    img.to_filename(save_file)
def quantiles(l):
    '''
    Converts a list/series of values to quantiles of same shape.

    sklearn.quantile_transform() requires 2-D input, so the values are
    reshaped to a single column and flattened again afterwards.

    Args:
        l: one-dimensional list, array or pandas Series of values.

    Returns:
        1-D numpy array with one quantile value per input value.
    '''
    import numpy as np

    # Series.as_matrix() was removed from pandas; np.asarray works for
    # Series, arrays and plain lists alike.
    column = np.asarray(l).reshape(-1, 1)
    qt = quantile_transform(X=column, n_quantiles=100)
    return qt.ravel()
def preprocess(self, standard=True, quantile=True, n_quantiles=1000):
    """
    Standardize the data set as the final part of the preprocessing.

    ``self.color_frame`` is replaced by a new DataFrame (same columns) after
    each applied step, and ``self.scaler`` records which steps ran.

    Parameters:
        standard: bool, if True sklearn's StandardScaler is applied to the data set
        quantile: bool, if True sklearn's quantile_transform is applied to the data
                    with the number of quantiles given by n_quantiles
        n_quantiles: int, the number of quantiles to use for the quantile transform

    Returns:
        color_frame: pandas DataFrame, the standardized photometric data set
        labels: the object labels stored on the instance
        obj_names: the object names stored on the instance
        scaler: str, names of the applied preprocessing algorithms
                ('SS' and/or 'Quantile', concatenated)
    """
    self.scaler = ''

    if standard:
        self.scaler += 'SS'
        names = self.color_frame.columns
        standardized = StandardScaler(copy=True).fit_transform(self.color_frame)
        self.color_frame = pd.DataFrame(standardized, columns=names)

    if quantile:
        self.scaler += 'Quantile'
        names = self.color_frame.columns
        transformed = quantile_transform(self.color_frame, copy=True,
                                         n_quantiles=n_quantiles)
        self.color_frame = pd.DataFrame(transformed, columns=names)

    return self.color_frame, self.labels, self.obj_names, self.scaler
def problem2_4_3(area):
    """Plot ``area`` quantile-transformed by the class and by the function.

    Both variants use ``n_quantiles=194`` and ``random_state=0``; the value
    returned by ``plot`` is printed for each.

    Returns:
        The fixed string ``"as shown in the plots"``.
    """
    transformer = preprocessing.QuantileTransformer(random_state=0,
                                                    n_quantiles=194)
    via_class = transformer.fit_transform(area)
    print("Use QuantileTransformer:", plot(via_class))

    via_function = quantile_transform(area, n_quantiles=194, random_state=0,
                                      copy=True)
    print("Use quantile_transform:", plot(via_function))

    return "as shown in the plots"
def main(args):
    """Clean an anatomical brain volume and save the normalized result.

    The volume is blurred and thresholded to find connected regions, voxels
    outside the largest labelled region are zeroed, the intensities are
    quantile-transformed and the result is saved as ``<file>_mean.nii`` in
    ``args['directory']``.

    Args:
        args: mapping with keys ``'logfile'`` and ``'directory'``.
    """
    logfile = args['logfile']
    directory = args['directory']  # directory will be a full path anat/moco
    # Unused below, but kept because constructing the logger may have side
    # effects.
    printlog = getattr(flow.Printlog(logfile=logfile), 'print_to_log')

    ### Load brain ###
    # NOTE(review): `file` is not defined inside this function; presumably it
    # is a module-level name holding the base file name -- confirm.
    # img.dataobj replaces the removed nibabel get_data() accessor.
    brain = np.asarray(
        nib.load(os.path.join(directory, file + '.nii')).dataobj,
        dtype='float32')

    ### Blur brain and mask small values ###
    brain_copy = brain.copy().astype('float32')
    # scipy.ndimage.filters is a deprecated namespace.
    brain_copy = ndimage.gaussian_filter(brain_copy, sigma=10)
    threshold = triangle(brain_copy)
    brain_copy[np.where(brain_copy < threshold / 2)] = 0

    ### Remove blobs outside contiguous brain ###
    labels, label_nb = ndimage.label(brain_copy)
    # Label 0 is background; pick the most frequent non-background label.
    brain_label = np.bincount(labels.flatten())[1:].argmax() + 1
    brain_copy = brain.copy().astype('float32')
    brain_copy[np.where(labels != brain_label)] = 0

    ### Perform quantile normalization ###
    brain_3d = quantile_transform(brain_copy.flatten().reshape(-1, 1),
                                  n_quantiles=500,
                                  random_state=0,
                                  copy=True).reshape(brain.shape)

    ### Save brain ###
    save_file = os.path.join(directory, file + '_mean.nii')
    aff = np.eye(4)
    # Save the volume computed above; the original referenced an undefined
    # name here and discarded the normalized result.
    img = nib.Nifti1Image(brain_3d, aff)
    img.to_filename(save_file)
def normalizer(dframe, features, train_test=False):
    """Quantile-normalize the predictors and optionally split train/test.

    Every column except ``consume`` is mapped to a normal distribution with
    ``quantile_transform`` (``random_state=0``). Histograms of the columns
    named in ``features`` are drawn after normalization.

    Args:
        dframe: DataFrame containing the predictors and a ``consume`` column.
        features: column names whose histograms are plotted.
        train_test: when equal to True, an 80/20 split with
            ``random_state=42`` is performed.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` when ``train_test == True``,
        ``(X_scale, y)`` when ``train_test == False``; ``None`` for any other
        value.
    """
    # Separate predictors from the target.
    predictors = dframe.drop(['consume'], axis=1)
    y = dframe.consume

    normalized = quantile_transform(predictors, output_distribution='normal',
                                    random_state=0)
    X_scale = pd.DataFrame(normalized, index=predictors.index,
                           columns=predictors.columns)

    hist_size = (12, 5)

    if train_test == True:
        X_train, X_test, y_train, y_test = train_test_split(
            X_scale, y, test_size=.2, random_state=42)

        # Show the distributions after normalization
        X_train[features].hist(figsize=hist_size)
        pl.suptitle("Train set after normalization")
        X_test[features].hist(figsize=hist_size)
        pl.suptitle("Test set after normalization")
        return X_train, X_test, y_train, y_test

    if train_test == False:
        # Show the distributions after normalization
        X_scale[features].hist(figsize=hist_size)
        pl.suptitle("X set after normalization")
        return X_scale, y
def transform(self, X):
    """Perform the quantile mapping.

    After validation the input is optionally detrended, converted to
    quantiles with ``quantile_transform`` and mapped back through the inverse
    of the fitted ``self.x_cdf_fit_``; a removed trend is added back at the
    end.

    Parameters
    ----------
    X : array_like, shape [n_samples, n_features]
        Samples.

    Returns
    -------
    The quantile-mapped samples.
    """
    # validate input data
    check_is_fitted(self)
    # TO-DO: fix validate_data fctn
    X = self._validate_data(X)

    # maybe detrend the datasets
    trend = None
    detrended = X
    if self.detrend:
        trend = LinearTrendTransformer(**self.lt_kwargs).fit(X)
        detrended = trend.transform(X)

    # do the final mapping; one quantile per sample unless configured
    options = self.qt_kwargs.copy()
    if 'n_quantiles' not in options:
        options['n_quantiles'] = len(X)

    quantiles = quantile_transform(detrended, copy=True, **options)
    mapped = self.x_cdf_fit_.inverse_transform(quantiles)

    # add the trend back
    if trend is not None:
        mapped = trend.inverse_transform(mapped)

    return mapped
def dt2_de_novo_pred(intron_lengths,
                     jad_labels,
                     is_primary_donor,
                     is_primary_acceptor,
                     donor_lr_score,
                     acceptor_lr_score,
                     is_annot,
                     classifier='decision_tree'):
    """Build the DT2 feature matrix and run the de novo prediction.

    The feature columns are, in order: ``jad_labels``, ``is_primary_donor``,
    ``is_primary_acceptor``, the quantile-transformed intron length,
    ``donor_lr_score`` and ``acceptor_lr_score``. All inputs are expected to
    be one-dimensional and of equal length.

    Args:
        intron_lengths: intron lengths; replaced by their quantiles.
        jad_labels, is_primary_donor, is_primary_acceptor: per-intron values.
        donor_lr_score, acceptor_lr_score: per-intron scores.
        is_annot: per-intron annotation flags, used as integer labels.
        classifier: classifier name forwarded to ``_de_novo_pred``.

    Returns:
        Whatever ``_de_novo_pred`` returns for these features and labels.
    """
    jad_labels = np.asarray(jad_labels)
    is_primary_donor = np.asarray(is_primary_donor)
    is_primary_acceptor = np.asarray(is_primary_acceptor)

    # quantile_transform needs a 2-D column; flatten it again afterwards.
    intron_length_quantile = quantile_transform(
        np.asarray(intron_lengths).reshape(-1, 1)).ravel()

    X = np.stack([
        jad_labels,
        is_primary_donor,
        is_primary_acceptor,
        intron_length_quantile,
        donor_lr_score,
        acceptor_lr_score,
    ], axis=1)

    # np.int was removed from NumPy; the builtin int is the same dtype.
    y = np.asarray(is_annot, dtype=int)

    pred = _de_novo_pred(X, y, DT2_DENOVO_FEATURES, classifier=classifier)
    return pred
def transform(self, X):
    '''
    Perform the quantile mapping.

    The input is optionally detrended, converted to quantiles with
    ``quantile_transform`` (library defaults) and mapped back through the
    inverse of the fitted ``self.x_cdf_fit_``; a removed trend is added back
    at the end.

    Parameters
    ----------
    X : array_like, shape [n_samples, n_features]
        Samples.

    Returns
    -------
    The quantile-mapped samples.
    '''
    X = ensure_samples_features(X)

    # maybe detrend the datasets
    trend = None
    detrended = X
    if self.detrend:
        trend = LinearTrendTransformer(**self.lt_kwargs).fit(X)
        detrended = trend.transform(X)

    # do the final mapping
    quantiles = quantile_transform(detrended)
    mapped = self.x_cdf_fit_.inverse_transform(quantiles)

    # add the trend back
    if trend is not None:
        mapped = trend.inverse_transform(mapped)

    return mapped
def get_propensity_score_bins(ddf, nbins, outcome):
    """Estimate propensity scores for ``networker`` and bin them by quantile.

    An L1-penalized logistic regression is fitted on all columns except
    ``networker``, ``outcome`` and any previously added ``propensity`` /
    ``bins`` columns, after standardizing and quantile-transforming them.
    The predicted probability of the second class is stored as
    ``propensity`` and cut into ``nbins`` quantile bins labelled
    ``q1`` .. ``q<nbins>``.

    Args:
        ddf: DataFrame with a ``networker`` column, the ``outcome`` column
            and the covariates.
        nbins: number of quantile bins.
        outcome: name of the outcome column to exclude from the covariates.

    Returns:
        Tuple ``(df_prop, bins)``: the DataFrame with ``propensity`` and
        ``bins`` columns, and the list of distinct bin labels present.

    Raises:
        KeyError: if ``networker`` or ``outcome`` is not a column of ``ddf``.
    """
    # liblinear supports the L1 penalty; the current default solver rejects it.
    classifier = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')

    # Required columns must exist; columns left over from an earlier call are
    # dropped only when present.
    XL = ddf.drop(columns=['networker', outcome])
    XL = XL.drop(columns=['propensity', 'bins'], errors='ignore')

    XL = StandardScaler().fit_transform(XL)
    XL = preprocessing.quantile_transform(XL, output_distribution='normal')
    yL = np.asarray(ddf['networker'])

    classifier.fit(XL, yL)
    yL_pred_prob = classifier.predict_proba(XL)

    # NOTE(review): pd.DataFrame(ddf) is not a deep copy; depending on the
    # pandas version the new columns may also show up on the caller's frame.
    df_prop = pd.DataFrame(ddf)
    df_prop['propensity'] = yL_pred_prob[:, 1]
    df_prop['bins'] = pd.qcut(df_prop['propensity'], nbins,
                              ['q' + str(i + 1) for i in range(nbins)])

    return df_prop, list(set(df_prop.bins))
def dt1_de_novo_pred(intron_motif,
                     intron_lengths,
                     jad_labels,
                     is_primary_donor,
                     is_primary_acceptor,
                     is_annot,
                     motif_regex='GTAG|GCAG|ATAG',
                     classifier='decision_tree'):
    """Build the DT1 feature matrix and run the de novo prediction.

    The feature columns are, in order: a 0/1 flag telling whether the intron
    motif matches ``motif_regex`` at its start, ``jad_labels``,
    ``is_primary_donor``, ``is_primary_acceptor`` and the quantile-transformed
    intron length. All inputs are expected to be one-dimensional and of equal
    length.

    Args:
        intron_motif: iterable of motif strings, one per intron.
        intron_lengths: intron lengths; replaced by their quantiles.
        jad_labels, is_primary_donor, is_primary_acceptor: per-intron values.
        is_annot: per-intron annotation flags, used as integer labels.
        motif_regex: regular expression identifying canonical motifs.
        classifier: classifier name forwarded to ``_de_novo_pred``.

    Returns:
        Whatever ``_de_novo_pred`` returns for these features and labels.
    """
    pattern = re.compile(motif_regex)
    is_canon = np.asarray(
        [int(bool(pattern.match(m))) for m in intron_motif])

    jad_labels = np.asarray(jad_labels)
    is_primary_donor = np.asarray(is_primary_donor)
    is_primary_acceptor = np.asarray(is_primary_acceptor)

    # quantile_transform needs a 2-D column; flatten it again afterwards.
    intron_length_quantile = quantile_transform(
        np.asarray(intron_lengths).reshape(-1, 1)).ravel()

    X = np.stack([
        is_canon,
        jad_labels,
        is_primary_donor,
        is_primary_acceptor,
        intron_length_quantile,
    ], axis=1)

    # np.int was removed from NumPy; the builtin int is the same dtype.
    y = np.asarray(is_annot, dtype=int)

    pred = _de_novo_pred(X, y, DT1_DENOVO_FEATURES, classifier=classifier)
    return pred
def main(args):
    """Sharpen the cleaned anatomical brain and save it.

    Loads ``anat_red_clean.nii`` from ``args['directory']``, rescales the
    intensities, applies an unsharp mask, sets low-intensity voxels to NaN,
    quantile-normalizes the rest, replaces NaNs by zero and writes
    ``anat_red_clean_sharp.nii`` to the same directory.

    Args:
        args: mapping with keys ``'logfile'`` and ``'directory'``.
    """
    logfile = args['logfile']
    directory = args['directory']  # directory will be a full path anat/moco
    # Unused below, but kept because constructing the logger may have side
    # effects.
    printlog = getattr(flow.Printlog(logfile=logfile), 'print_to_log')

    ### Load brain ###
    file = os.path.join(directory, 'anat_red_clean.nii')
    # img.dataobj replaces the removed nibabel get_data() accessor.
    brain = np.asarray(nib.load(file).dataobj, dtype='float32')

    # renormalize to .3-.7
    # (maps [0, 1] onto [.3, .7]; assumes the input is already in [0, 1]
    # -- TODO confirm)
    a = .3
    b = .7
    brain_input = a + (brain) * (b - a)

    # sharpen
    brain_sharp = unsharp_mask(brain_input, radius=3, amount=7)

    # make background nan
    brain_copy = brain_sharp.copy()
    brain_copy[np.where(brain_input < .31)] = np.nan

    # renormalize via quantile
    brain_out = quantile_transform(brain_copy.flatten().reshape(-1, 1),
                                   n_quantiles=500,
                                   random_state=0,
                                   copy=True)
    brain_out = brain_out.reshape(brain.shape)
    # Background NaNs become zero again, in place.
    np.nan_to_num(brain_out, copy=False)

    ### Save brain ###
    save_file = os.path.join(directory, 'anat_red_clean_sharp.nii')
    aff = np.eye(4)
    img = nib.Nifti1Image(brain_out, aff)
    img.to_filename(save_file)
def sharpen_anat(in_file, save_dir):
    """Sharpen a brain volume and write it to ``<save_dir>/<name>_sharp.nii``.

    The intensities are rescaled, an unsharp mask is applied, low-intensity
    voxels are set to NaN, the rest is quantile-normalized and NaNs are
    finally replaced by zero.

    Args:
        in_file: path of the brain to load via ``load_numpy_brain``.
        save_dir: directory the sharpened NIfTI file is written to.
    """
    ### Load brain ###
    brain = load_numpy_brain(in_file)

    # renormalize to .3-.7
    low, high = .3, .7
    rescaled = low + (brain) * (high - low)

    # sharpen
    sharpened = unsharp_mask(rescaled, radius=3, amount=7)

    # make background nan
    masked = sharpened.copy()
    background = np.where(rescaled < .31)
    masked[background] = np.nan

    # renormalize via quantile
    column = masked.flatten().reshape(-1, 1)
    normalized = quantile_transform(column,
                                    n_quantiles=500,
                                    random_state=0,
                                    copy=True)
    normalized = normalized.reshape(brain.shape)
    np.nan_to_num(normalized, copy=False)

    ### Save brain ###
    stem = in_file.split('/')[-1].split('.')[0]
    target = os.path.join(save_dir, f'{stem}_sharp.nii')
    nib.Nifti1Image(normalized, np.eye(4)).to_filename(target)
def generate_heatmap(ax, theme, epochs, latent_dim, theme_data,
                     set_row_labels, set_col_labels):
    """Plot a beta x capacity heatmap of one loss term.

    For each (beta, capacity) pair the history stored in ``theme_data`` under
    the key ``e{epochs}_d{latent_dim}_b{beta}_c{cap}`` is looked up and the
    NaN-mean of the last entry of ``loss_dict[theme]`` is used as the cell
    value. For ``beta == 0`` only capacity 0 is read; the remaining cells of
    that row stay NaN and get no text. Colours come from a uniform quantile
    transform of the values divided by 3.

    Args:
        ax: axes to draw on.
        theme: loss key to plot (e.g. ``'val_loss'``); also selects the
            x-axis label. Keys without a dedicated label are shown verbatim.
        epochs, latent_dim: used to build the history keys.
        theme_data: mapping from history key to a dict of loss arrays.
        set_row_labels, set_col_labels: forwarded to the plot function.
    """
    betas = [0, 1, 2, 4, 8, 16, 32, 64]
    caps = [0, 4, 8, 12, 16, 20]

    # empty numpy array for containing
    arr = np.full((len(betas), len(caps)), np.nan)

    # iterate betas, caps
    for row_idx, beta in enumerate(betas):
        # with beta == 0 only capacity 0 is read
        row_caps = [0] if beta == 0 else caps
        for col_idx, cap in enumerate(row_caps):
            # key for referencing history data
            key = f'e{epochs}_d{latent_dim}_b{beta}_c{cap}'
            loss_dict = theme_data[key]
            # each loss_dict contains the loss keys with numpy values of
            # epochs x seeds: take the last epoch and average over the seeds
            arr[row_idx][col_idx] = np.nanmean(loss_dict[theme][-1])

    # remove outliers
    # sklearn preprocessing works only per axis dimension
    scaled = arr.copy().reshape((-1, 1))
    scaled = quantile_transform(scaled, n_quantiles=9,
                                output_distribution='uniform')
    scaled /= 3
    scaled = scaled.reshape(arr.shape)

    # remove text longer than four characters
    text = arr.astype('str')
    for i in range(text.shape[0]):
        for j in range(text.shape[1]):
            # beta == 0: only the first column carries a value
            if i == 0 and j > 0:
                text[i][j] = ''
            else:
                text[i][j] = format_text(arr[i][j])

    # plot
    plot_funcs.plot_heatmap(
        ax,
        heatmap=scaled,
        row_labels=[r'$\beta=' + f'{b}$' for b in betas],
        set_row_labels=set_row_labels,
        col_labels=[f'$C={c}$' for c in caps],
        set_col_labels=set_col_labels,
        text=text,
        grid_fontsize='xx-small')

    axis_titles = {
        'val_loss': 'Combined loss',
        'val_rec_loss': 'MSE reconstruction',
        'val_kl': r'$D_{KL}$',
        'val_kl_beta': r'$\beta \cdot D_{KL}$',
        'val_kl_beta_cap': r'$\beta \cdot |D_{KL} - C|$',
        'val_capacity_term': 'Capacity term $C$',
    }
    # Fall back to the raw theme name instead of failing on an unbound title.
    ax.set_xlabel(axis_titles.get(theme, theme))
def normalize(X, mode, **kwargs):
    """Normalize, scale or standardize each sample (row) of ``X``.

    NOTE: The axis is fixed to 1 because every time series (sample) is
    processed on its own, unlike common spreadsheet or image data where each
    feature is processed. Consequently there is no training step that
    memorizes statistics of a training set, and the sklearn Transformer
    classes, which work per feature, are not used.

    Supported modes:
        * ``'znorm'``, ``'znorm-center'``: mean removal with / without
          variance scaling.
        * ``'scale-minmax'`` (default range [0, 1]), ``'scale-maxabs'``
          ([-1, 1]), ``'scale-robust'`` (for data with outliers).
        * ``'nonlinear-quantile'``: quantile transform; distorts correlations
          and distances within and across features.
        * ``'nonlinear-power'``: not implemented, raises ``ValueError``.
        * ``'norm-l1'``, ``'norm-l2'``, ``'norm-max'``: scale samples to unit
          norm.
        * ``'sigmoid'``, ``'tanh'``: element-wise squashing functions.

    :param X: data to process, samples along the first axis.
    :param mode: name of the normalization, see the list above.
    :param kwargs: extra keyword arguments forwarded to the underlying
        function (ignored for ``'sigmoid'``).
    :return: the normalized data.
    :raises ValueError: for ``'nonlinear-power'`` and for unknown modes.
    """
    axis = 1

    # (mode, sklearn.preprocessing function name, fixed keyword arguments)
    sklearn_modes = (
        ('znorm', 'scale', {}),
        ('znorm-center', 'scale', {'with_std': False}),
        ('scale-minmax', 'minmax_scale', {}),
        ('scale-maxabs', 'maxabs_scale', {}),
        ('scale-robust', 'robust_scale', {}),
        ('nonlinear-quantile', 'quantile_transform', {}),
        ('norm-l1', 'normalize', {'norm': 'l1'}),
        ('norm-l2', 'normalize', {'norm': 'l2'}),
        ('norm-max', 'normalize', {'norm': 'max'}),
    )
    for name, func_name, fixed in sklearn_modes:
        if mode == name:
            func = getattr(skpre, func_name)
            return func(X, axis=axis, **fixed, **kwargs)

    if mode == 'nonlinear-power':
        # TODO: this function can not be found.
        raise ValueError("This option did not be realized!")

    ## some other often-used non-linear normalization methods
    if mode == 'sigmoid':
        return 1.0 / (1.0 + np.exp(-X))
    if mode == 'tanh':
        return np.tanh(X, **kwargs)

    raise ValueError("No normalization type with {} found !".format(mode))
def num_quantile(X, cols, predix='q_', **params):
    """Add quantile-transformed copies of numeric columns to a DataFrame.

    Args:
        X: input DataFrame; it is copied, not modified.
        cols: names of the columns to transform.
        predix: prefix for the new column names (historical spelling of
            ``prefix``, kept for backward compatibility). A ``prefix=...``
            keyword is accepted as well and takes precedence.
        **params: keyword arguments forwarded to ``quantile_transform``;
            ``random_state`` defaults to 0 when not given.

    Returns:
        Tuple ``(_X, _cols)``: the copy of ``X`` with the new columns added,
        and the list of new column names.
    """
    # The body used `prefix` while the parameter is spelled `predix`, which
    # raised NameError; accept both spellings.
    prefix = params.pop('prefix', predix)

    # QuantileTransformer
    params.setdefault('random_state', 0)

    _X = X.copy()
    _cols = [prefix + col for col in cols]
    _x = quantile_transform(X[cols], **params)
    _X[_cols] = pd.DataFrame(_x, columns=_cols, index=_X.index)
    return _X, _cols
def __qt(self, x):
    """Quantile-transform ``x`` to a normal distribution, shifted to start at 0.

    One quantile per element of ``x`` is used; the minimum of the transformed
    values is subtracted so the smallest result is zero.

    Returns:
        1-D array of the shifted, transformed values.
    """
    column = np.reshape(x, (-1, 1))
    normal_scores = quantile_transform(column,
                                       axis=0,
                                       copy=True,
                                       n_quantiles=len(x),
                                       output_distribution='normal')
    flat = normal_scores.flatten()
    return flat - np.min(flat)
def DS_normalization(self, dataset):
    """Quantile-normalize the numeric content of a DataFrame or Series.

    DataFrame input: rows of the numeric columns without missing values are
    transformed with ``quantile_transform`` (10 quantiles, ``random_state=0``);
    rows containing a missing numeric value are kept unscaled. Object and
    datetime columns are joined back unchanged, and the column named by
    ``self.exclude`` is restored to its original values.

    Series input: non-missing values are transformed the same way and missing
    entries are kept; a Series whose name equals ``self.exclude`` is returned
    unchanged.

    Args:
        dataset: pandas DataFrame or Series.

    Returns:
        The normalized DataFrame or Series, sorted by index.
    """
    d = dataset
    if self.verbose:
        print("DS normalizing...")

    if not isinstance(dataset, pd.Series):
        Xf = dataset.select_dtypes(['number'])
        X = Xf.dropna()
        # Rows with any missing numeric value are passed through unscaled.
        X_na = Xf[Xf.isnull().any(axis=1)]
        Y = dataset.select_dtypes(['object'])
        Z = dataset.select_dtypes(['datetime64'])

        scaled_values = quantile_transform(
            X, n_quantiles=10, random_state=0)
        scaled_X = pd.DataFrame(
            scaled_values, index=X.index, columns=X.columns)
        scaled_Xf = pd.concat(
            [scaled_X, X_na], ignore_index=False, sort=True).sort_index()

        df = scaled_Xf.join(Y)
        df = df.join(Z)

        if (self.exclude in list(df.columns.values)):
            # NOTE(review): .values assigns by position while df is sorted by
            # index; this is only aligned when `dataset` is already sorted --
            # confirm against callers.
            df[str(self.exclude)] = d[str(self.exclude)].values
    else:
        if self.exclude in list(pd.DataFrame(dataset).columns.values):
            # The excluded Series is returned as is.
            df = dataset
        else:
            # The previous implementation of this branch could not run: it
            # concatenated a scalar quantile and passed `columns` to
            # pd.Series. Mirror the DataFrame branch instead.
            X = dataset.dropna()
            X_na = dataset[dataset.isna()]
            scaled_values = quantile_transform(
                X.to_frame(), n_quantiles=10, random_state=0)
            scaled_X = pd.Series(
                scaled_values.ravel(), index=X.index, name=dataset.name)
            df = pd.concat([scaled_X, X_na], ignore_index=False)

    return df.sort_index()
def convert_dates(posts):
    """
    Convert the date columns of ``posts`` to coefficients from 0 to 1.

    Each date column (currently only ``created``) is parsed with
    ``pd.to_datetime``, replaced by the integer ``.value`` of every timestamp
    and then quantile-transformed. ``posts`` is modified in place and also
    returned.
    """
    for name in ["created"]:
        parsed = pd.to_datetime(posts[name])
        posts[name] = parsed.apply(lambda stamp: stamp.value)

        # quantile_transform needs a 2-D column; flatten it afterwards.
        column = posts[name].values.reshape(-1, 1)
        posts[name] = quantile_transform(column).reshape(-1)

    return posts
def __call__(self, data) -> Table:
    """Return a copy of ``data`` whose ``X`` is quantile-transformed.

    The transform uses the instance's ``n_quantiles``,
    ``output_distribution`` and ``axis`` settings; ``data`` itself is left
    untouched because the work is done on ``data.copy()``.
    """
    result = data.copy()
    source = result.X
    options = dict(
        n_quantiles=self.n_quantiles,
        output_distribution=self.output_distribution,
        copy=True,
        axis=self.axis,
    )
    result.X = quantile_transform(source, **options)
    return result
def quantile_transform_scaler(df: DataFrame) -> DataFrame:
    """Quantile-transform every sample column of ``df`` in place.

    The sample columns are those returned by ``process_header_data`` for
    ``HeaderType.SAMPLE``. Each one is transformed on its own with the
    default settings of ``preprocessing.quantile_transform``.

    Returns:
        The same DataFrame object, with the sample columns replaced.
    """
    sample_header_names = []
    for header in process_header_data(df, HeaderType.SAMPLE):
        sample_header_names.append(header.name)

    def _transform_column(column):
        # quantile_transform needs 2-D input; flatten the result again.
        return preprocessing.quantile_transform(column.to_frame()).flatten()

    samples = df[sample_header_names]
    df[sample_header_names] = samples.transform(_transform_column)
    return df
def quantile_transform_columns(df, n_quantiles=10,
                               output_distribution='uniform'):
    """Return a quantile-transformed copy of ``df``, column by column.

    Args:
        df: DataFrame to transform.
        n_quantiles: number of quantiles passed to ``quantile_transform``.
        output_distribution: target distribution passed on as well.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    transformed = quantile_transform(
        df,
        axis=0,
        n_quantiles=n_quantiles,
        output_distribution=output_distribution,
        copy=True,
    )
    return pd.DataFrame(transformed, index=df.index, columns=df.columns)
def plot_reconstructions(
        co_mat_coo: sparse.coo_matrix,
        params: Params,
        max_dim: int,
        plot_interval: int = 1,
):
    """Plot cumulative SVD reconstructions of a co-occurrence matrix.

    The matrix is quantile-transformed to a normal distribution, densified
    and decomposed with ``np.linalg.svd``. The projections onto the first
    ``max_dim`` singular dimensions are clustered and accumulated; every
    ``plot_interval``-th accumulation is saved as a heatmap, followed by a
    heatmap of the clustered transformed matrix itself.

    Args:
        co_mat_coo: sparse co-occurrence matrix.
        params: run parameters; ``age`` and ``direction`` select the output
            path and the title is built from it.
        max_dim: number of singular dimensions to accumulate.
        plot_interval: plot after every this many dimensions.
    """
    # remove skew for better visualisation
    co_mat_normal_csr: sparse.csr_matrix = quantile_transform(
        co_mat_coo,
        axis=0,
        output_distribution='normal',
        n_quantiles=co_mat_coo.shape[0],
        copy=True,
        ignore_implicit_zeros=True)

    # don't use sparse svd: doesn't result in accurate reconstruction
    co_mat_normal_dense = co_mat_normal_csr.toarray()
    U, s, VT = np.linalg.svd(co_mat_normal_dense, compute_uv=True)

    # NOTE(review): operator precedence makes this `n // 1000 + 2`; if
    # `(n // 1000 + 1) * 2` was intended, parentheses are missing -- confirm.
    fig_size = (
        co_mat_normal_dense.shape[1] // 1000 + 1 * 2,
        co_mat_normal_dense.shape[0] // 1000 + 1 * 2 + 0.5,
    )
    print(f'fig size={fig_size}')
    print(params.direction)

    base_title = make_fig_title(params)
    base_title += f'num co-occurrences={np.sum(co_mat_coo)}\n'

    # Shared by every plot so that all heatmaps use the same colour range.
    vmin = np.min(co_mat_normal_dense)
    vmax = np.max(co_mat_normal_dense)

    # plot projection of co_mat onto sing dims
    dg0, dg1 = None, None
    # np.float was removed from NumPy; the builtin float is the same dtype.
    projections = np.zeros(co_mat_normal_dense.shape, dtype=float)
    num_s = sum(s > 0)
    for dim_id in range(max_dim):
        projection = calc_projection(U, s, VT, dim_id)
        # The clustering state is threaded through successive calls.
        projection_clustered, dg0, dg1 = cluster(projection, dg0, dg1)
        projections += projection_clustered
        if dim_id % plot_interval == 0:
            plot_heatmap(
                projections,
                title=base_title + f'projections={dim_id}/{num_s}',
                save_path=make_path(params.age, params.direction) /
                f'dim{dim_id:04}.png',
                vmin=vmin,
                vmax=vmax,
                figsize=fig_size,
            )

    # plot original
    plot_heatmap(
        cluster(co_mat_normal_dense, dg0, dg1)[0],
        title=base_title + 'original',
        save_path=make_path(params.age, params.direction) /
        f'dim{num_s:04}.png',
        vmin=vmin,
        vmax=vmax,
        figsize=fig_size,
    )
###############################################################################
###############################################################################
# In a similar manner, the boston housing data set is used to show the impact
# of transforming the targets before learning a model. In this example, the
# targets to be predicted corresponds to the weighted distances to the five
# Boston employment centers.
# NOTE(review): load_boston has been removed from recent scikit-learn
# releases; this example needs an older release or a replacement data set.

from sklearn.datasets import load_boston
from sklearn.preprocessing import QuantileTransformer, quantile_transform

dataset = load_boston()
# Boolean mask selecting the "DIS" column, which serves as the target.
target = np.array(dataset.feature_names) == "DIS"
X = dataset.data[:, np.logical_not(target)]
y = dataset.data[:, target].squeeze()
y_trans = quantile_transform(dataset.data[:, target],
                             output_distribution='normal').squeeze()

###############################################################################
# A :class:`sklearn.preprocessing.QuantileTransformer` is used such that the
# targets follows a normal distribution before applying a
# :class:`sklearn.linear_model.RidgeCV` model.

f, (ax0, ax1) = plt.subplots(1, 2)

# `normed` was removed from matplotlib's hist(); `density` is its replacement.
ax0.hist(y, bins=100, density=True)
ax0.set_ylabel('Probability')
ax0.set_xlabel('Target')
ax0.set_title('Target distribution')

ax1.hist(y_trans, bins=100, density=True)
ax1.set_ylabel('Probability')