Example #1
from typing import List, Union

import pandas as pd
from sklearn.preprocessing import quantile_transform


def quantile_norm(dfs: Union[pd.DataFrame, List[pd.DataFrame]]):
    """Perform quantile normalization on expression data.

        Quantile normalize a list of DataFrames using sklearn.preprocessing.quantile_transform with the default n_quantiles=1000.
        The DataFrame(s) passed should contain samples from the same batch. Each DataFrame should be formatted with columns
        as samples and rows as genes.

    TODO: as of now the function assumes the gene order and number are the same across samples. This should change

    Args:
        dfs: a DataFrame or list of DataFrames from the same batch

    Returns: a quantile-normalized DataFrame, or a list of quantile-normalized DataFrames if a list was passed

    """
    transformed = list()

    if isinstance(dfs, list):
        vals = [list(i.columns.values) for i in dfs]
        index = dfs[0].index.tolist()
        for i in range(len(dfs)):
            transformed.append(
                pd.DataFrame(quantile_transform(dfs[i],
                                                axis=0,
                                                output_distribution='normal'),
                             columns=vals[i],
                             index=index))
    else:
        vals = dfs.columns.to_list()
        index = dfs.index.to_list()
        transformed = pd.DataFrame(quantile_transform(
            dfs, axis=0, output_distribution='normal'),
                                   columns=vals,
                                   index=index)
    return transformed
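
A minimal usage sketch with toy data (not from the original source), assuming numpy is available as np; rows are genes and columns are samples, as the docstring requires:

genes = ['g1', 'g2', 'g3', 'g4', 'g5']
df_a = pd.DataFrame(np.random.rand(5, 2), index=genes, columns=['s1', 's2'])
df_b = pd.DataFrame(np.random.rand(5, 3), index=genes, columns=['s3', 's4', 's5'])

norm_list = quantile_norm([df_a, df_b])  # list in -> list of DataFrames out
norm_single = quantile_norm(df_a)        # single DataFrame in -> DataFrame out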
Example #2
    def _normalize(self, matrix, key):
        """ """
        if self.verbose:
            print('normalizing for {0}...'.format(key))

        if self.normalization['NB_FEATURES_TO_KEEP']:
            self.variance_reducer.nb_features = self.normalization[
                'NB_FEATURES_TO_KEEP']
            matrix = self.variance_reducer.fit_transform(matrix)

        if self.normalization['CUSTOM']:
            custom_norm = self.normalization['CUSTOM']()
            assert (hasattr(custom_norm, 'fit')
                    and hasattr(custom_norm, 'fit_transform'))
            matrix = custom_norm.fit_transform(matrix)

        if self.normalization['TRAIN_MIN_MAX']:
            matrix = MinMaxScaler().fit_transform(matrix.T).T

        if self.normalization['TRAIN_MAD_SCALE']:
            matrix = self.mad_scaler.fit_transform(matrix.T).T

        if self.normalization['TRAIN_ROBUST_SCALE'] or\
           self.normalization['TRAIN_ROBUST_SCALE_TWO_WAY']:
            matrix = self.robust_scaler.fit_transform(matrix)

        if self.normalization['TRAIN_NORM_SCALE']:
            matrix = self.normalizer.fit_transform(matrix)

        if self.normalization['TRAIN_QUANTILE_TRANSFORM']:
            matrix = quantile_transform(matrix, **QUANTILE_OPTION)

        if self.normalization['TRAIN_RANK_NORM']:
            matrix = RankNorm().fit_transform(matrix)

        if self.normalization['TRAIN_CORR_REDUCTION']:
            args = self.normalization['TRAIN_CORR_REDUCTION']
            if args is True:
                args = {}

            if self.verbose:
                print('dim reduction for {0}...'.format(key))

            reducer = CorrelationReducer(**args)
            matrix = reducer.fit_transform(matrix)

            if self.normalization['TRAIN_CORR_RANK_NORM']:
                matrix = RankNorm().fit_transform(matrix)

            if self.normalization['TRAIN_CORR_QUANTILE_NORM']:
                matrix = quantile_transform(matrix, **QUANTILE_OPTION)

            if self.normalization['TRAIN_CORR_NORM_SCALE']:
                matrix = self.normalizer.fit_transform(matrix)

        return np.nan_to_num(matrix)
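
The method is driven entirely by the self.normalization flag dictionary. A hypothetical configuration covering the keys referenced above might look like the following (the values are illustrative, not defaults from the source project):

normalization = {
    'NB_FEATURES_TO_KEEP': 100,        # keep the 100 most variable features
    'CUSTOM': None,                    # or a class exposing fit / fit_transform
    'TRAIN_MIN_MAX': False,
    'TRAIN_MAD_SCALE': False,
    'TRAIN_ROBUST_SCALE': False,
    'TRAIN_ROBUST_SCALE_TWO_WAY': False,
    'TRAIN_NORM_SCALE': False,
    'TRAIN_QUANTILE_TRANSFORM': True,  # apply quantile_transform(**QUANTILE_OPTION)
    'TRAIN_RANK_NORM': False,
    'TRAIN_CORR_REDUCTION': False,     # or True, or a dict of CorrelationReducer kwargs
    'TRAIN_CORR_RANK_NORM': False,
    'TRAIN_CORR_QUANTILE_NORM': False,
    'TRAIN_CORR_NORM_SCALE': False,
}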
Example #3
def get_features_targets(ok_feats):

    ### success measures
    city = 'london'
    outfolder = '../ProcessedData/' + city + '/'
    successdata = outfolder + 'venues_info/' + city + '_venues_success_measures.csv'
    success = pd.read_csv(successdata, sep='\t', index_col=0)

    ### ward level features
    outfolder = '../ProcessedData/' + city + '/'
    ward_stats_f = outfolder + 'venues_info/venues_ward_full.dat'
    ward_stats = pd.read_csv(ward_stats_f, sep='\t',
                             index_col=0).drop(['ward'], axis=1)

    ### category features
    ward_cats_f = outfolder + 'venues_info/' + city + '_WARD_category_stats.csv'
    ward_cats = pd.read_csv(ward_cats_f, sep='\t', index_col=0).drop(['ward'],
                                                                     axis=1)

    ### network features
    network_meas_f = outfolder + '/networks/' + city + '_COMBINED_networkmeasures.csv'
    network_meas = pd.read_csv(network_meas_f, sep=',',
                               index_col=0).replace([np.inf, -np.inf],
                                                    np.nan).fillna(0.0)

    # merge and filter full feature set
    temp_features = network_meas.join(ward_cats)
    venue_features = temp_features.join(ward_stats)
    filtered_features = pd.DataFrame()

    for feat in ok_feats:
        filtered_features[feat] = venue_features[feat]

    for feat in ok_feats:
        filtered_features = filtered_features[filtered_features[feat] != 0.0]

    # scale features and targets
    X_head = filtered_features.keys()
    X = filtered_features.values
    X = StandardScaler().fit_transform(X)
    X = preprocessing.quantile_transform(X, output_distribution='normal')

    success_t = success.reset_index()
    success_t = success_t[success_t['index'].isin(list(
        filtered_features.index))]

    y = np.asarray(success_t['checkinsCount'])
    y = StandardScaler().fit_transform(y.reshape(-1, 1))
    y = preprocessing.quantile_transform(y, output_distribution='normal')

    yy = np.asarray(success_t['tipCount'])
    yy = StandardScaler().fit_transform(yy.reshape(-1, 1))
    yy = preprocessing.quantile_transform(yy, output_distribution='normal')

    return X, y, X_head, yy
Example #4
def transform_data(count_data, transformation_method,
                   mean_center=False,
                   std_unit=False,
                   return_instance=False,
                   coef=None):

    transformation_method = transformation_method or ''

    if transformation_method.lower() == 'log':
        transformed_data = np.log(count_data + 1)

    elif transformation_method.lower() == 'voom':
        transformed_data = np.log(count_data)

    elif transformation_method.lower() == 'anscombe':
        transformed_data = np.sqrt(count_data + 3. / 8.)

    elif transformation_method.lower() == 'quantile':
        transformed_data = quantile_transform(count_data,
                                              output_distribution='normal')

    else:
        print('ERROR: not an available transformation method')
        transformed_data = count_data

    # Create NormalizationParameter instance for further processing.
    coef = coef or NormalizationParameter()
    if coef.scaler is None:
        coef.scaler = StandardScaler(with_mean=mean_center, with_std=std_unit)
        coef.scaler.fit(transformed_data)

    if return_instance:
        return coef.scaler.transform(transformed_data), coef
    else:
        return coef.scaler.transform(transformed_data)
Example #5
def load_data(tag):
    """
    从数据库加载数据
    """
    # Connect to the database
    conn = sqlite3.connect('Data/StackExpert.sqlite')
    # Load all rows
    data = pd.read_sql_query(
        """
        SELECT * FROM {:s}
    """.format(tag), conn)
    conn.close()
    # Preprocess the data
    data = pre_proc(data)
    # Label users according to the expert-score threshold
    target = column_or_1d(
        binarize(pd.DataFrame(data.pop('EXPERT_SCORE')),
                 threshold=99,
                 copy=False)).astype(int)
    col = data.columns
    ind = data.index
    # Standardize and normalize the data
    data = quantile_transform(data, copy=False, output_distribution='normal')
    data = pd.DataFrame(maxabs_scale(data), index=ind, columns=col)
    # Compute the expert / non-expert ratio
    cnt = np.bincount(target)
    target = pd.Series(target, index=ind)
    return data, target, np.divide(np.min(cnt), np.max(cnt))
Example #6
    def transform(self, X):
        """ Perform the quantile mapping.

        Parameters
        ----------
        X : array_like, shape [n_samples, n_features]
            Samples.
        """
        X = ensure_samples_features(X)

        # maybe detrend the datasets
        if self.detrend:
            x_trend = LinearTrendTransformer(**self.lt_kwargs).fit(X)
            x_to_cdf = x_trend.transform(X)
        else:
            x_to_cdf = X

        # do the final mapping
        qt_kws = self.qt_kwargs.copy()
        if "n_quantiles" not in qt_kws:
            qt_kws["n_quantiles"] = len(X)

        x_quantiles = quantile_transform(x_to_cdf, copy=True, **qt_kws)
        x_qmapped = self.x_cdf_fit_.inverse_transform(x_quantiles)

        # add the trend back
        if self.detrend:
            x_qmapped = x_trend.inverse_transform(x_qmapped)

        return x_qmapped
Example #7
def generate_arr_heatmap(ax, arr, cmap=None, constrain=(-1, 1)):
    # remove outliers
    betas = [0, 1, 2, 4, 8, 16, 32, 64]
    caps = [0, 4, 8, 12, 16, 20]
    scaled = arr.copy()
    # sklearn preprocessing works only per axis dimension
    scaled = scaled.reshape((-1, 1))
    scaled = quantile_transform(scaled,
                                n_quantiles=9,
                                output_distribution='uniform')
    scaled /= 3
    scaled = scaled.reshape((len(betas), len(caps)))
    # remove text longer than four characters
    text = arr.astype('str')
    for i in range(text.shape[0]):
        for j in range(text.shape[1]):
            # beta == 0
            if i == 0 and j > 0:
                text[i][j] = ''
            else:
                text[i][j] = format_text(arr[i][j])
    # plot
    plot_funcs.plot_heatmap(ax,
                            heatmap=scaled,
                            row_labels=None,
                            set_row_labels=False,
                            col_labels=[f'$C={c}$' for c in caps],
                            set_col_labels=True,
                            text=text,
                            constrain=constrain,
                            cmap=cmap,
                            grid_fontsize='xx-small')
Example #8
def clean_anat(in_file, save_dir):
    ### Load brain ###
    brain = load_numpy_brain(in_file)

    ### Blur brain and mask small values ###
    brain_copy = brain.copy().astype('float32')
    brain_copy = scipy.ndimage.filters.gaussian_filter(brain_copy, sigma=10)
    threshold = triangle(brain_copy)
    brain_copy[np.where(brain_copy < threshold / 2)] = 0

    ### Remove blobs outside contiguous brain ###
    labels, label_nb = scipy.ndimage.label(brain_copy)
    brain_label = np.bincount(labels.flatten())[1:].argmax() + 1
    brain_copy = brain.copy().astype('float32')
    brain_copy[np.where(labels != brain_label)] = np.nan

    ### Perform quantile normalization ###
    brain_out = quantile_transform(brain_copy.flatten().reshape(-1, 1),
                                   n_quantiles=500,
                                   random_state=0,
                                   copy=True)
    brain_out = brain_out.reshape(brain.shape)
    np.nan_to_num(brain_out, copy=False)

    ### Save brain ###
    fname = in_file.split('/')[-1].split('.')[0]
    save_file = os.path.join(save_dir, f'{fname}_clean.nii')
    aff = np.eye(4)
    img = nib.Nifti1Image(brain_out, aff)
    img.to_filename(save_file)
Example #9
def quantiles(l):
    ''' Converts a list/series of values to quantiles of the same shape.
        sklearn.preprocessing.quantile_transform requires 2-D input, so the
        values are reshaped to a single column.
    '''
    qt = quantile_transform(X=l.to_numpy().reshape(-1, 1),
                            n_quantiles=100)
    return qt.reshape(1, -1)[0]
Example #10
    def preprocess(self, standard = True, quantile = True, n_quantiles = 1000):
        """
        A function for standardizing the data set as the final part of the preprocessing.

        Parameters:

        standard: bool, if True sklearn's StandardScaler is applied to the data set
        quantile: bool, if True sklearn's quantile_transform is applied to the data with the number of quantiles given by n_quantiles
        n_quantiles: int, the number of quantiles to use for the quantile transformer
        
        Returns:

        color_frame: pandas DataFrame, the standardized photometric data set
        labels: numpy array, including either ['UNK', 'QSO', 'GALAXY', 'STAR', 'BAL'] for each object
        obj_names: numpy array, including the name of each object
        scaler: str, names of the applied preprocessing algorithms

        """
        self.scaler = ''
        if standard:
            self.scaler += 'SS'
            scaler = StandardScaler(copy=True)
            self.color_frame = pd.DataFrame(scaler.fit_transform(self.color_frame), columns = self.color_frame.columns)
        if quantile:
            self.scaler += 'Quantile'
            self.color_frame = pd.DataFrame(quantile_transform(self.color_frame, copy=True, n_quantiles=n_quantiles),
                                            columns = self.color_frame.columns,
                                            )
        return self.color_frame, self.labels, self.obj_names, self.scaler
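
A brief usage sketch, assuming an already-constructed object (here called catalog, a made-up name) whose color_frame, labels and obj_names attributes have been populated:

color_frame, labels, obj_names, scaler = catalog.preprocess(
    standard=True, quantile=True, n_quantiles=1000)
print(scaler)  # 'SSQuantile' when both steps are applied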
Example #11
def problem2_4_3(area):
    QT = preprocessing.QuantileTransformer(random_state=0, n_quantiles=194)
    area_QT = QT.fit_transform(area)
    print("Use QuantileTransformer:", plot(area_QT))
    qt = quantile_transform(area, n_quantiles=194, random_state=0, copy=True)
    print("Use quantile_transform:", plot(qt))
    return "as shown in the plots"
Example #12
def main(args):
    logfile = args['logfile']
    directory = args['directory']  # directory will be a full path anat/moco
    width = 120
    printlog = getattr(flow.Printlog(logfile=logfile), 'print_to_log')

    ### Load brain ###
    # NOTE: `file` (the base filename without extension) is assumed to be
    # defined in the surrounding module.
    brain = np.asarray(nib.load(os.path.join(directory,
                                             file + '.nii')).get_data(),
                       dtype='float32')

    ### Blur brain and mask small values ###
    brain_copy = brain.copy().astype('float32')
    brain_copy = scipy.ndimage.filters.gaussian_filter(brain_copy, sigma=10)
    threshold = triangle(brain_copy)
    brain_copy[np.where(brain_copy < threshold / 2)] = 0

    ### Remove blobs outside contiguous brain ###
    labels, label_nb = ndimage.label(brain_copy)
    brain_label = np.bincount(labels.flatten())[1:].argmax() + 1
    brain_copy = brain.copy().astype('float32')
    brain_copy[np.where(labels != brain_label)] = 0

    ### Perform quantile normalization ###
    brain_3d = quantile_transform(brain_copy.flatten().reshape(-1, 1),
                                  n_quantiles=500,
                                  random_state=0,
                                  copy=True).reshape(brain.shape)

    ### Save brain ###
    save_file = os.path.join(directory, file + '_mean.nii')
    aff = np.eye(4)
    img = nib.Nifti1Image(brain_3d, aff)
    img.to_filename(save_file)
Example #13
def normalizer(dframe, features, train_test=False):
    # Train-Test split
    X = dframe.drop(['consume'], axis=1)
    y = dframe.consume

    X_qtrans = quantile_transform(X, output_distribution='normal', random_state=0)

    X_scale = pd.DataFrame(X_qtrans,
                            index=X.index, 
                            columns=X.columns)

    if train_test:

        X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size=.2, random_state=42)

        # Show the distributions after normalization
        X_train[features].hist(figsize=(12,5))
        pl.suptitle("Train set after normalization")
        X_test[features].hist(figsize=(12,5))
        pl.suptitle("Test set after normalization")

        return X_train, X_test, y_train, y_test

    else:

        # Show the distributions after normalization
        X_scale[features].hist(figsize=(12,5))
        pl.suptitle("X set after normalization")
        
        return X_scale, y
Example #14
    def transform(self, X):
        """Perform the quantile mapping.

        Parameters
        ----------
        X : array_like, shape [n_samples, n_features]
            Samples.
        """
        # validate input data
        check_is_fitted(self)
        # TO-DO: fix validate_data fctn
        X = self._validate_data(X)

        # maybe detrend the datasets
        if self.detrend:
            x_trend = LinearTrendTransformer(**self.lt_kwargs).fit(X)
            x_to_cdf = x_trend.transform(X)
        else:
            x_to_cdf = X

        # do the final mapping
        qt_kws = self.qt_kwargs.copy()
        if 'n_quantiles' not in qt_kws:
            qt_kws['n_quantiles'] = len(X)

        x_quantiles = quantile_transform(x_to_cdf, copy=True, **qt_kws)
        x_qmapped = self.x_cdf_fit_.inverse_transform(x_quantiles)

        # add the trend back
        if self.detrend:
            x_qmapped = x_trend.inverse_transform(x_qmapped)

        return x_qmapped
Example #15
def dt2_de_novo_pred(intron_lengths,
                     jad_labels,
                     is_primary_donor,
                     is_primary_acceptor,
                     donor_lr_score,
                     acceptor_lr_score,
                     is_annot,
                     classifier='decision_tree'):
    jad_labels = np.asarray(jad_labels)

    is_primary_donor = np.asarray(is_primary_donor)
    is_primary_acceptor = np.asarray(is_primary_acceptor)

    intron_length_quantile = quantile_transform(
        np.asarray(intron_lengths).reshape(-1, 1)).ravel()

    X = np.stack([
        jad_labels,
        is_primary_donor,
        is_primary_acceptor,
        intron_length_quantile,
        donor_lr_score,
        acceptor_lr_score,
    ],
                 axis=1)
    y = np.asarray(is_annot, dtype=int)
    pred = _de_novo_pred(X, y, DT2_DENOVO_FEATURES, classifier=classifier)
    return pred
Example #16
File: utils.py  Project: dgergel/xsd
    def transform(self, X):
        ''' Perform the quantile mapping.

        Parameters
        ----------
        X : array_like, shape [n_samples, n_features]
            Samples.
        '''
        X = ensure_samples_features(X)

        # maybe detrend the datasets
        if self.detrend:
            x_trend = LinearTrendTransformer(**self.lt_kwargs).fit(X)
            x_to_cdf = x_trend.transform(X)
        else:
            x_to_cdf = X

        # do the final mapping
        x_quantiles = quantile_transform(x_to_cdf)
        x_qmapped = self.x_cdf_fit_.inverse_transform(x_quantiles)

        # add the trend back
        if self.detrend:
            x_qmapped = x_trend.inverse_transform(x_qmapped)

        return x_qmapped
Example #17
def get_propensity_score_bins(ddf, nbins, outcome):

    classifier = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')

    #encode_cats(df_field, 'gender')
    #encode_cats(df_field, 'location')
    
    try:
        XL = ddf.drop(columns=['networker', outcome, 'propensity', 'bins'])
    except KeyError:
        XL = ddf.drop(columns=['networker', outcome])

        
    XL = StandardScaler().fit_transform(XL)
    XL = preprocessing.quantile_transform(XL, output_distribution = 'normal')
    yL = np.asarray(ddf['networker'])

    classifier.fit(XL, yL)
    yL_pred_cl   = classifier.predict(XL)
    yL_pred_prob = classifier.predict_proba(XL)


    df_prop = pd.DataFrame(ddf)
    df_prop['propensity'] = [p[1] for p in yL_pred_prob]
    

    df_prop['bins'] = pd.qcut(df_prop['propensity'], nbins, ['q' + str(i+1) for i in range(nbins)])
    df_prop.head()
    
    return df_prop, list(set(df_prop.bins))
Example #18
def dt1_de_novo_pred(intron_motif,
                     intron_lengths,
                     jad_labels,
                     is_primary_donor,
                     is_primary_acceptor,
                     is_annot,
                     motif_regex='GTAG|GCAG|ATAG',
                     classifier='decision_tree'):
    motif_regex = re.compile(motif_regex)
    is_canon = np.asarray(
        [int(bool(motif_regex.match(m))) for m in intron_motif])

    jad_labels = np.asarray(jad_labels)

    is_primary_donor = np.asarray(is_primary_donor)
    is_primary_acceptor = np.asarray(is_primary_acceptor)

    intron_length_quantile = quantile_transform(
        np.asarray(intron_lengths).reshape(-1, 1)).ravel()

    X = np.stack([
        is_canon, jad_labels, is_primary_donor, is_primary_acceptor,
        intron_length_quantile
    ],
                 axis=1)
    y = np.asarray(is_annot, dtype=int)
    pred = _de_novo_pred(X, y, DT1_DENOVO_FEATURES, classifier=classifier)
    return pred
Example #19
def main(args):
    logfile = args['logfile']
    directory = args['directory']  # directory will be a full path anat/moco
    width = 120
    printlog = getattr(flow.Printlog(logfile=logfile), 'print_to_log')

    ### Load brain ###
    file = os.path.join(directory, 'anat_red_clean.nii')
    brain = np.asarray(nib.load(file).get_data(), dtype='float32')

    # renormalize to .3-.7
    a = .3
    b = .7
    brain_input = a + (brain) * (b - a)

    # sharpen
    brain_sharp = unsharp_mask(brain_input, radius=3, amount=7)

    # make background nan
    brain_copy = brain_sharp.copy()
    brain_copy[np.where(brain_input < .31)] = np.nan

    # renormalize via quantile
    brain_out = quantile_transform(brain_copy.flatten().reshape(-1, 1),
                                   n_quantiles=500,
                                   random_state=0,
                                   copy=True)
    brain_out = brain_out.reshape(brain.shape)
    np.nan_to_num(brain_out, copy=False)

    ### Save brain ###
    save_file = os.path.join(directory, 'anat_red_clean_sharp.nii')
    aff = np.eye(4)
    img = nib.Nifti1Image(brain_out, aff)
    img.to_filename(save_file)
Example #20
def sharpen_anat(in_file, save_dir):
    ### Load brain ###
    #file = os.path.join(directory, 'anat_red_clean.nii')
    brain = load_numpy_brain(in_file)

    # renormalize to .3-.7
    a = .3
    b = .7
    brain_input = a + (brain) * (b - a)

    # sharpen
    brain_sharp = unsharp_mask(brain_input, radius=3, amount=7)

    # make background nan
    brain_copy = brain_sharp.copy()
    brain_copy[np.where(brain_input < .31)] = np.nan

    # renormalize via quantile
    brain_out = quantile_transform(brain_copy.flatten().reshape(-1, 1),
                                   n_quantiles=500,
                                   random_state=0,
                                   copy=True)
    brain_out = brain_out.reshape(brain.shape)
    np.nan_to_num(brain_out, copy=False)

    ### Save brain ###
    fname = in_file.split('/')[-1].split('.')[0]
    save_file = os.path.join(save_dir, f'{fname}_sharp.nii')
    aff = np.eye(4)
    img = nib.Nifti1Image(brain_out, aff)
    img.to_filename(save_file)
Example #21
def generate_heatmap(ax, theme, epochs, latent_dim, theme_data, set_row_labels,
                     set_col_labels):
    # empty numpy array to hold the loss value for each (beta, cap) combination
    betas = [0, 1, 2, 4, 8, 16, 32, 64]
    arr = np.full((len(betas), 6), np.nan)
    # iterate dims, betas, caps
    for row_idx, beta in enumerate(betas):
        caps = [0, 4, 8, 12, 16, 20]
        if beta == 0:
            caps = [0]
        for col_idx, cap in enumerate(caps):
            # key for referencing history data
            key = f'e{epochs}_d{latent_dim}_b{beta}_c{cap}'
            loss_dict = theme_data[key]
            # iterate the ax rows and ax cols
            # each loss_dict contains the loss keys with numpy values of epochs x seeds
            # take the argmin across the epoch axis then take the mean of the seeds
            # arr[row_idx][col_idx] = np.nanmean(np.nanmin(loss_dict[theme], axis=0))
            arr[row_idx][col_idx] = np.nanmean(loss_dict[theme][-1])
    # remove outliers
    scaled = arr.copy()
    # sklearn preprocessing works only per axis dimension
    scaled = scaled.reshape((-1, 1))
    scaled = quantile_transform(scaled,
                                n_quantiles=9,
                                output_distribution='uniform')
    scaled /= 3
    scaled = scaled.reshape((len(betas), len(caps)))
    # remove text longer than four characters
    text = arr.astype('str')
    for i in range(text.shape[0]):
        for j in range(text.shape[1]):
            if i == 0 and j > 0:
                text[i][j] = ''
            else:
                text[i][j] = format_text(arr[i][j])
    # plot
    plot_funcs.plot_heatmap(
        ax,
        heatmap=scaled,
        row_labels=[r'$\beta=' + f'{b}$' for b in betas],
        set_row_labels=set_row_labels,
        col_labels=[f'$C={c}$' for c in [0, 4, 8, 12, 16, 20]],
        set_col_labels=set_col_labels,
        text=text,
        grid_fontsize='xx-small')
    if theme == 'val_loss':
        ax_title = 'Combined loss'
    elif theme == 'val_rec_loss':
        ax_title = 'MSE reconstruction'
    elif theme == 'val_kl':
        ax_title = r'$D_{KL}$'
    elif theme == 'val_kl_beta':
        ax_title = r'$\beta \cdot D_{KL}$'
    elif theme == 'val_kl_beta_cap':
        ax_title = r'$\beta \cdot |D_{KL} - C|$'
    elif theme == 'val_capacity_term':
        ax_title = 'Capacity term $C$'
    ax.set_xlabel(ax_title)
Example #22
def normalize(X, mode, **kwargs):
    """
    NOTE: The axis must be set to 1 for time series so as to standardize each sample that is 
    different from the common spreadsheet data and image data. Therefore, it is dose not has the 
    training process to memorize the statistics calculated on training set. In addition, the 
    Transformer classes realized in sklearn can not be used for time series because they are 
    realized to independently normalize/scale/standardize each feature otherwise each sample.
    :param X: 
    :param mode: 
    :param kwargs: 
    :return: 
    """
    axis = 1
    ret = None
    ## Standardization, or mean removal and variance scaling:
    # they might behave badly if the individual features do not more or less look like standard
    # normally distributed data.
    if mode == 'znorm':
        ret = skpre.scale(X, axis=axis, **kwargs)
    elif mode == 'znorm-center':
        ret = skpre.scale(X, axis=axis, with_std=False, **kwargs)
    # MinMax, MaxAbs:
    # The motivation to use this scaling include robustness to very small standard deviations
    # of features and preserving zero entries in sparse data.
    elif mode == 'scale-minmax': # [min, max], default is [0, 1]
        ret = skpre.minmax_scale(X, axis=axis, **kwargs)
    elif mode == 'scale-maxabs': # [-1, 1]
        ret = skpre.maxabs_scale(X, axis=axis, **kwargs)
    # Scaling data with outliers:
    elif mode == 'scale-robust':
        ret = skpre.robust_scale(X, axis=axis, **kwargs)
    ## Non-linear transformation
    # Quantile transforms map data to a uniform distribution. It should be noted that such
    # an operation distorts correlations and distances within and across features.
    elif mode == 'nonlinear-quantile':
        ret = skpre.quantile_transform(X, axis=axis, **kwargs)
    # Power transforms aim to map data from any distribution to as close to a Gaussian
    # distribution as possible in order to stabilize variance and minimize skewness.
    elif mode == 'nonlinear-power':  # TODO: this function could not be found in skpre.
        raise ValueError("This option has not been implemented!")
    ## Normalization
    # Normalization is the process of scaling individual samples to have unit norm.
    elif mode == 'norm-l1':
        ret = skpre.normalize(X, norm='l1', axis=axis, **kwargs)
    elif mode == 'norm-l2':
        ret = skpre.normalize(X, norm='l2', axis=axis, **kwargs)
    elif mode == 'norm-max':
        ret = skpre.normalize(X, norm='max', axis=axis, **kwargs)
    ## some other often-used non-linear normalization methods
    elif mode == 'sigmoid':
        sigmoid = (lambda x: 1.0 / (1.0 + np.exp(-x)))
        ret = sigmoid(X)
    elif mode == 'tanh':
        ret = np.tanh(X, **kwargs)
    else:
        raise ValueError("No normalization type with {} found !".format(mode))

    return ret
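
A short usage sketch on a toy batch of time series (each row is one sample), assuming numpy and sklearn.preprocessing are imported as np and skpre, as in the snippet:

X = np.random.rand(8, 50)  # 8 series of length 50

X_z = normalize(X, 'znorm')                               # per-sample z-scoring
X_q = normalize(X, 'nonlinear-quantile', n_quantiles=50)  # per-sample quantile map
X_l2 = normalize(X, 'norm-l2')                            # unit L2 norm per sample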
Example #23
def num_quantile(X, cols, prefix='q_', **params):
    # QuantileTransformer
    if 'random_state' not in params:
        params['random_state'] = 0
    _X = X.copy()
    _cols = [prefix + col for col in cols]
    _x = quantile_transform(X[cols], **params)
    _X[_cols] = pd.DataFrame(_x, columns=_cols, index=_X.index)
    return _X, _cols
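
A toy usage sketch (the DataFrame and column names are made up for illustration):

df = pd.DataFrame({'a': np.random.rand(100), 'b': np.random.rand(100)})
df_q, new_cols = num_quantile(df, ['a', 'b'], n_quantiles=100)
print(new_cols)  # ['q_a', 'q_b'], added alongside the original columns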
Example #24
    def __qt(self, x):
        transformed = quantile_transform(np.reshape(x, (-1, 1)),
                                         axis=0,
                                         copy=True,
                                         n_quantiles=len(x),
                                         output_distribution='normal')
        transformed = transformed.flatten()
        transformed = transformed - np.min(transformed)
        return transformed
Example #25
    def DS_normalization(self, dataset):
        # Normalize numeric columns with MinMax normalization

        d = dataset

        if self.verbose:

            print("DS normalizing...")

        if not isinstance(dataset, pd.Series):

            Xf = dataset.select_dtypes(['number'])

            X = Xf.dropna()

            X_na = Xf[Xf.isnull().any(axis=1)]

            Y = dataset.select_dtypes(['object'])

            Z = dataset.select_dtypes(['datetime64'])

            scaled_values = quantile_transform(
                X, n_quantiles=10, random_state=0)

            scaled_X = pd.DataFrame(
                scaled_values, index=X.index, columns=X.columns)

            scaled_Xf = pd.concat(
                [scaled_X, X_na], ignore_index=False, sort=True).sort_index()

            df = scaled_Xf.join(Y)

            df = df.join(Z)

            if (self.exclude in list(df.columns.values)):

                df[str(self.exclude)] = d[str(self.exclude)].values

        else:  # (sum([type(x)=='number' for x in dataset])/len(dataset)==1):

            X = dataset.dropna()

            X_na = dataset[dataset.isna()]

            scaled_X = X.quantile(q=0.1, interpolation='linear')

            scaled_Xf = pd.concat(
                [scaled_X, X_na], ignore_index=False, sort=True).sort_index()

            df = pd.Series(scaled_Xf, index=X.index)

            if (self.exclude in list(pd.DataFrame(df).columns.values)):

                df = dataset

        return df.sort_index()
Example #26
def convert_dates(posts):
    """
    Function to convert dates to coefficients from 0 to 1
  """
    dates_columns = ["created"]
    for column in dates_columns:
        posts[column] = pd.to_datetime(posts[column]).apply(lambda x: x.value)
        posts[column] = quantile_transform(posts[column].values.reshape(
            -1, 1)).reshape(-1)
    return posts
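
A toy sketch of the expected input (column names other than 'created' are arbitrary):

posts = pd.DataFrame({
    'created': ['2020-01-01', '2020-06-15', '2021-03-30'],
    'title': ['a', 'b', 'c'],
})
posts = convert_dates(posts)  # 'created' now holds quantile values in [0, 1]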
Example #27
    def __call__(self, data) -> Table:
        _data = data.copy()
        _data.X = quantile_transform(
            _data.X,
            n_quantiles=self.n_quantiles,
            output_distribution=self.output_distribution,
            copy=True,
            axis=self.axis,
        )
        return _data
Example #28
def quantile_transform_scaler(df: DataFrame) -> DataFrame:

    sample_header_names = [
        h.name for h in process_header_data(df, HeaderType.SAMPLE)
    ]
    quant_trans = lambda x: preprocessing.quantile_transform(x.to_frame()
                                                             ).flatten()
    df[sample_header_names] = df[sample_header_names].transform(quant_trans)

    return df
Example #29
def quantile_transform_columns(df,
                               n_quantiles=10,
                               output_distribution='uniform'):
    return pd.DataFrame(quantile_transform(
        df,
        axis=0,
        n_quantiles=n_quantiles,
        output_distribution=output_distribution,
        copy=True),
                        index=df.index,
                        columns=df.columns)
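
A quick usage sketch on a toy, skewed DataFrame (made-up data); the index and column labels are preserved:

df = pd.DataFrame(np.random.lognormal(size=(50, 3)), columns=['a', 'b', 'c'])
df_q = quantile_transform_columns(df)  # each column mapped to approximately uniform [0, 1]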
Example #30
def plot_reconstructions(
    co_mat_coo: sparse.coo_matrix,
    params: Params,
    max_dim: int,
    plot_interval: int = 1,
):

    # remove skew for better visualisation
    co_mat_normal_csr: sparse.csr_matrix = quantile_transform(
        co_mat_coo,
        axis=0,
        output_distribution='normal',
        n_quantiles=co_mat_coo.shape[0],
        copy=True,
        ignore_implicit_zeros=True)
    # don't use sparse svd: doesn't result in accurate reconstruction
    co_mat_normal_dense = co_mat_normal_csr.toarray()
    U, s, VT = np.linalg.svd(co_mat_normal_dense, compute_uv=True)
    fig_size = (
        co_mat_normal_dense.shape[1] // 1000 + 1 * 2,
        co_mat_normal_dense.shape[0] // 1000 + 1 * 2 + 0.5,
    )
    print(f'fig size={fig_size}')
    print(params.direction)
    base_title = make_fig_title(params)
    base_title += f'num co-occurrences={np.sum(co_mat_coo)}\n'
    # plot projection of co_mat onto sing dims
    dg0, dg1 = None, None
    projections = np.zeros(co_mat_normal_dense.shape, dtype=float)
    num_s = sum(s > 0)
    for dim_id in range(max_dim):
        projection = calc_projection(U, s, VT, dim_id)
        projection_clustered, dg0, dg1 = cluster(projection, dg0, dg1)
        projections += projection_clustered
        if dim_id % plot_interval == 0:
            plot_heatmap(
                projections,
                title=base_title + f'projections={dim_id}/{num_s}',
                save_path=make_path(params.age, params.direction) /
                f'dim{dim_id:04}.png',
                vmin=np.min(co_mat_normal_dense),
                vmax=np.max(co_mat_normal_dense),
                figsize=fig_size,
            )
    # plot original
    plot_heatmap(
        cluster(co_mat_normal_dense, dg0, dg1)[0],
        title=base_title + 'original',
        save_path=make_path(params.age, params.direction) /
        f'dim{num_s:04}.png',
        vmin=np.min(co_mat_normal_dense),
        vmax=np.max(co_mat_normal_dense),
        figsize=fig_size,
    )
###############################################################################

###############################################################################
# In a similar manner, the Boston housing data set is used to show the impact
# of transforming the targets before learning a model. In this example, the
# target to be predicted corresponds to the weighted distances to the five
# Boston employment centers.

from sklearn.datasets import load_boston
from sklearn.preprocessing import QuantileTransformer, quantile_transform

dataset = load_boston()
target = np.array(dataset.feature_names) == "DIS"
X = dataset.data[:, np.logical_not(target)]
y = dataset.data[:, target].squeeze()
y_trans = quantile_transform(dataset.data[:, target],
                             output_distribution='normal').squeeze()

###############################################################################
# A :class:`sklearn.preprocessing.QuantileTransformer` is used such that the
# targets follow a normal distribution before applying a
# :class:`sklearn.linear_model.RidgeCV` model.
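
###############################################################################
# As a sketch of that workflow (using only the standard scikit-learn API, not
# code taken from the original example), the target transformation can be
# wrapped in a :class:`sklearn.compose.TransformedTargetRegressor`, which
# quantile-transforms ``y`` during ``fit`` and maps predictions back to the
# original scale:

from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

regr_trans = TransformedTargetRegressor(
    regressor=RidgeCV(),
    transformer=QuantileTransformer(n_quantiles=300,
                                    output_distribution='normal'))
regr_trans.fit(X_train, y_train)
print('R2 on the test set: {:.3f}'.format(regr_trans.score(X_test, y_test)))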

f, (ax0, ax1) = plt.subplots(1, 2)

ax0.hist(y, bins=100, density=True)
ax0.set_ylabel('Probability')
ax0.set_xlabel('Target')
ax0.set_title('Target distribution')

ax1.hist(y_trans, bins=100, density=True)
ax1.set_ylabel('Probability')