Example #1
def draw_combined_hist(df, countries, country_names, winsorize=False):
    #make a combined histogram
    plt.cla()
    plt.figure(1, figsize=(5, 3))

    TRIM = 0.05 if winsorize else 0.0
    m = 0
    #find the maximum value among all the series we are going to plot
    for c in countries:
        x = df[c].dropna().values
        x = mstats.winsorize(x, (0, TRIM))
        if max(x) > m:
            m = max(x)
    #we have the max value; plot each series with bin edges derived from that max
    bins = np.linspace(0, m, int(m))
    for i, c in enumerate(countries):
        x = df[c].dropna().values
        x = mstats.winsorize(x, (0, TRIM))
        plt.hist(x, bins, alpha=0.5, label=country_names[i])

    plt.legend(loc='upper right')
    plt.title(
        'Histogram for distribution of Starbucks stores\n across cities in a country'
    )
    name = 'winsorized_hist.png' if winsorize else 'hist.png'
    fname = os.path.join(glob.OUTPUT_DIR_NAME, glob.EDA_DIR, 'more', name)
    plt.savefig(fname)
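A minimal sketch (not from the project above; the data is synthetic) of what the one-sided call winsorize(x, (0, TRIM)) does: only the top TRIM fraction of values is clipped down to the corresponding quantile, while the lower tail is left untouched.

import numpy as np
from scipy.stats import mstats

x = np.concatenate([np.arange(1, 100), [1000]])    # one large outlier
w = mstats.winsorize(x, limits=(0, 0.05))          # clip only the upper 5%
print(x.max(), w.max())                            # the 1000 is pulled down to the upper cap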
Example #2
File: eda.py Project: ekaynac/Funcs
    def plots(df,
              transformation="boxcox",
              fig_size=(15, 8),
              whis=1.5,
              wins=(0, 0)):

        features_to_plot = df.columns[df.dtypes != object]
        positive_features = list()

        for feature in features_to_plot:
            if (df[feature] > 0).all():
                positive_features.append(feature)

        if transformation == "boxcox":
            for feature in positive_features:
                df[feature], _ = boxcox(winsorize(df[feature], wins))
                plt.figure(figsize=fig_size)
                plt.subplot(1, 2, 1)
                sns.boxplot(df[feature], whis=whis)
                plt.subplot(1, 2, 2)
                sns.distplot(df[feature])
                plt.show()

        elif transformation == "log":
            for feature in positive_features:
                df[feature] = np.log(winsorize(df[feature], wins))
                plt.figure(figsize=fig_size)
                plt.subplot(1, 2, 1)
                sns.boxplot(df[feature])
                plt.subplot(1, 2, 2)
                sns.distplot(df[feature])
                plt.show()

        else:
            print("tranformation type should be one of these:\n*log\n*boxcox")
Example #3
def get_index_list_pe_pb_date(code_list, date):
    '''Index PE/PB metrics for the specified date'''
    ret_dict = {}
    df_all = get_fundamentals(query(valuation), date)  # all stocks on that date
    for code in code_list:
        stocks = get_idx_components(code, date)
        df = df_all[df_all['code'].isin(stocks)]  # constituents of this index
        if len(df) > 0:
            # aggregate method: market-cap weighted
            df = df[df.pb_ratio != 0]  # drop zero ratios
            df = df[df.pe_ratio != 0]  # drop zero ratios
            pe1 = sum(df.market_cap) / sum(df.market_cap / df.pe_ratio)
            pb1 = sum(df.market_cap) / sum(df.market_cap / df.pb_ratio)
            # equal-weighted harmonic mean; loss-making (non-positive) ratios excluded
            pe2 = len(df) / sum(1 / df.pe_ratio[df.pe_ratio > 0])
            pb2 = len(df) / sum(1 / df.pb_ratio[df.pb_ratio > 0])
            # median; no preprocessing needed
            pe3 = df.pe_ratio.median()
            pb3 = df.pb_ratio.median()
            # arithmetic mean, winsorized to the central 95%
            pe4 = mean(mstats.winsorize(df.pe_ratio, limits=0.025))
            pb4 = mean(mstats.winsorize(df.pb_ratio, limits=0.025))

            ret_dict[code] = {
                'pe1': round(pe1, 2),
                'pb1': round(pb1, 2),
                'pe2': round(pe2, 2),
                'pb2': round(pb2, 2),
                'pe3': round(pe3, 2),
                'pb3': round(pb3, 2),
                'pe4': round(pe4, 2),
                'pb4': round(pb4, 2),
            }
    return ret_dict
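The pe4/pb4 figures above are plain means taken after clipping 2.5% from each tail. A small standalone sketch of the effect (my addition; toy ratios, not real market data):

import numpy as np
from scipy.stats import mstats

pe_ratio = np.concatenate([np.random.uniform(8, 30, size=200), [900.0, 1200.0]])   # two extreme multiples
plain_mean = pe_ratio.mean()
robust_mean = mstats.winsorize(pe_ratio, limits=(0.025, 0.025)).mean()             # clip 2.5% per tail, as in pe4/pb4
print(round(plain_mean, 2), round(robust_mean, 2))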
Example #4
 def winsorize_quantTrans(self,lower = 0.10, upper = 0.10,ignore_zero = True):
     '''Winsorizes the quantity values

     PARAMETERS

     lower:  lower quantile; all values below this threshold are set to the
             value at that quantile

     upper:  upper quantile; all values above this threshold are set to the
             value at that quantile

     ignore_zero: if True, winsorize only the non-zero values

     RETURNS

     None; the quantity values are replaced with their winsorized counterparts in place

     '''
     if ignore_zero:
         nonzero_ind = np.nonzero(self.quantity)[0]
         self.quantity[nonzero_ind] =  mstats.winsorize(self.quantity[nonzero_ind], limits=[lower, upper])
         self._quantity['Transformation'].append('Winsorized nonzeros based on limits {}, {}'.format(lower,
                                                                                                     upper))
     else:
         self.quantity = np.array(mstats.winsorize(self.quantity,limits=[lower, upper]))
         self._quantity['Transformation'].append('Winsorized based on limits {}, {}'.format(lower,upper))
Example #5
 def test_winsorization(self):
     "Tests the Winsorization of the data."
     data = ma.array([77, 87, 88, 114, 151, 210, 219, 246, 253, 262, 296, 299, 306, 376, 428, 515, 666, 1310, 2611])
     assert_almost_equal(mstats.winsorize(data, (0.2, 0.2)).var(ddof=1), 21551.4, 1)
     data[5] = masked
     winsorized = mstats.winsorize(data)
     assert_equal(winsorized.mask, data.mask)
Example #6
 def test_winsorization(self):
     data = ma.array([77, 87, 88,114,151,210,219,246,253,262,
                      296,299,306,376,428,515,666,1310,2611])
     assert_almost_equal(mstats.winsorize(data,(0.2,0.2)).var(ddof=1),
                         21551.4, 1)
     data[5] = masked
     winsorized = mstats.winsorize(data)
     assert_equal(winsorized.mask, data.mask)
Example #7
    def standardize(df):
        '''

        :param df: (pandas dataframe) dataframe with columns: years, index: permnos, entries: metric_results
        :return: (pandas dataframe) winsorized dataframe of z_scores

        '''
        # Keep the middle 95% of data. Clip the rest. NaN's in data will change percentiles.
        winsorize(df, (0.025, 0.025), inplace=True)
        z_score = (df - df.mean()) / df.std()
        return z_score
Example #8
def normalize_image_0_1_by_3channel(X, y):
    print('--- Normalizing by channel, expecting 3 channel input ---')
    assert X.shape[4] == 3
    X_winsor = np.copy(X)
    X_winsor[:, :, :, :, 0] = mstats.winsorize(X[:, :, :, :, 0], [0, 0.1])
    X_winsor[:, :, :, :, 1] = mstats.winsorize(X[:, :, :, :, 1], [0, 0.1])
    X_winsor[:, :, :, :, 2] = mstats.winsorize(X[:, :, :, :, 2], [0, 0.1])

    X_norm = X_winsor - X_winsor.min()
    X_norm = X_norm / X_winsor.std()

    return X_norm, y
Example #9
def playerdistribution(player, quantile=0.75):

    jc = players21.loc[players21['Player'] == player][['GameFP/36', 'MIN']]
    fp = winsorize(jc['GameFP/36'], [0.05, 0.05]).mean()
    minutes = winsorize(jc['MIN'], [0.05, 0.05]).mean()
    cj = jc.T
    covariance = np.cov(cj)
    distribution = pd.DataFrame(
        multivariate_normal.rvs(mean=[fp, minutes], cov=covariance, size=1000))
    distribution['total'] = distribution[0] / 36 * distribution[1]
    graph = sb.kdeplot(data=distribution['total'], fill=True)

    return graph, distribution['total'].quantile(quantile), covariance
Example #10
def wincor(x, y, tr=.2):

    """
    Compute the winsorized correlation between `x` and `y`.
    This function also returns the winsorized covariance.


    :param x: Pandas Series
    Data for group one

    :param y: Pandas Series
    Data for group two

    :param tr: float
    Proportion to winsorize (default is .2)

    :return:
    Dictionary of results

    cor: float
    Winsorized correlation

    nval: int
    Number of observations

    sig: float
    p-value

    wcov: float
    Winsorized covariance
    """

    if type(x) is not np.ndarray:
        x, y=pandas_to_arrays([x, y])

    m1 = np.c_[x, y] # cbind
    m1 = m1[~np.isnan(m1).any(axis=1)]
    nval = m1.shape[0]
    x = m1[:, 0]
    y = m1[:, 1]
    g = np.floor(tr * len(x))
    xvec = winsorize(x, limits=(tr,tr))
    yvec = winsorize(y, limits=(tr,tr))
    wcor = np.corrcoef(xvec, yvec)[0,1]
    wcov = np.cov(xvec, yvec)[0,1]
    test = wcor * np.sqrt((len(x) - 2) / (1. - wcor ** 2))
    sig = 2 * (1 - t.cdf(abs(test), len(x) - 2 * g - 2))

    res={'cor': wcor, 'wcov': wcov, 'sig': sig, 'nval': nval}

    return res
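A quick usage sketch for wincor (my addition; synthetic data, and the call assumes the module-level imports the function relies on, i.e. numpy, scipy.stats.t, winsorize and pandas_to_arrays): with 20% winsorization a single gross outlier barely moves the winsorized correlation, while the raw Pearson r drops sharply.

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=200)
y = x + rng.normal(scale=0.5, size=200)
x[0], y[0] = 50.0, -50.0                    # one gross outlier pair
res = wincor(x, y, tr=.2)
print(round(np.corrcoef(x, y)[0, 1], 3),    # raw Pearson correlation
      round(res['cor'], 3))                 # winsorized correlation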
Example #11
def do_prepare_data():
    main_df = pd.read_csv("tables/1_thirty_models.csv")
    model_keys = main_df['model_key'].unique().tolist()
    prepared_df = pd.DataFrame()
    for model_key in model_keys:
        df = load_data(main_df=main_df, model_key=model_key)
        df = fill_missing_days__and_set_datetime_index(df,
                                                       start_date="2012-01-05",
                                                       end_date="2016-12-28")
        df.to_csv("tables/_fill_days.csv")
        df.loc[:, 'Order_Demand'] = mstats.winsorize(df['Order_Demand'].values,
                                                     limits=[0.05, 0.05])
        # df['Order_Demand'] = df['Order_Demand'].apply(lambda x: np.log(x+1))
        df = moving_average_imputation(df)
        plot(df, model_key, '2_imputation_example')
        df['model_key'] = model_key
        for i in range(1, 9):
            df['lag_{}'.format(i)] = df['Order_Demand'].shift(i)
        df = df.dropna()
        df['Date'] = df.index
        prepared_df = pd.concat([prepared_df, df])
    prepared_df = sqldf(
        "select model_key, Date, Order_Demand, lag_1, lag_2, lag_3, lag_4, lag_5, lag_6, lag_7, lag_8 from prepared_df ORDER BY 1,2",
        locals())
    prepared_df['Date'] = pd.to_datetime(prepared_df['Date'])
    prepared_df.to_csv("tables/2_data_prepared.csv", index=False)
    prepared_df.set_index(['Date'], inplace=True)
    thirty_plots(df=prepared_df, filename="3_prepared_data")
Example #12
 def win_sor(self, limit):
   '''De-extreme (winsorize) the factor values.

   Replaces extremely large and extremely small factor values, modifying the
   underlying dataset in place. For example, limit=0.1 replaces the largest 10%
   and the smallest 10% of the data.
   '''

   self.data[self.feature] = winsorize(self.data[self.feature], limits=[limit, limit])
Example #13
def future_spreads(spread_current, spread_history):

    norm_reference = cma.val_dict['spread_norm_yrs']

    future_spreads = pd.DataFrame(spread_current).T
    future_spreads = future_spreads.reset_index(drop=True)

    # Winsorize spread norm
    spread_norm = []
    for i in range(len(spread_history.columns)):
        clipped = mstats.winsorize(spread_history.iloc[:, i].dropna(),
                                   limits=[0.05, 0.05],
                                   inclusive=[True, True])
        spread_norm.append(sum(clipped) / len(clipped))

    # Create shell dataframe
    for i in range(1, 11):
        future_spreads.loc[i] = np.nan

    # Calculate normalized spread path
    for i in range(norm_reference, 11):
        future_spreads.iloc[i, :] = spread_norm

    # Populate data for years leading up to normalization
    for i in range(1, norm_reference):
        future_spreads.iloc[i, :] = (
            future_spreads.iloc[norm_reference, :] - future_spreads.iloc[0, :]
        ) / norm_reference + future_spreads.iloc[i - 1, :]

    return future_spreads
Example #14
def cs_winFn(ser, level):
    indx = ser.index
    ser = ser.dropna()
    arr = array(ser)
    arr = winsorize(arr, limits=(level, level))
    ser = Series(arr, index=ser.index)
    return ser.reindex(indx)
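A brief usage sketch for cs_winFn (my addition; the Series is synthetic, and the call assumes the module imports the function relies on, i.e. numpy's array, pandas' Series and scipy's winsorize): NaNs are dropped before winsorizing and restored by the final reindex.

import numpy as np
from pandas import Series

s = Series([1.0, 2.0, np.nan, 3.0, 100.0])
print(cs_winFn(s, level=0.25))   # the NaN is preserved; the 100.0 is clipped to the upper cap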
Example #15
 def _sub(sub):
     # winsorize returns a numpy array while sub is a DataFrame; assigning to sub[:]
     # replaces the DataFrame's values in place rather than rebinding the name
     sub[:] = mstats.winsorize(a=sub.values,
                               limits=winsorize_bounds,
                               axis=0)
     return sub
Example #16
def winvar(x, tr=.2):
    """
    Compute the gamma Winsorized variance for the data in the vector x.
    tr is the amount of Winsorization which defaults to .2.
    Nan values are removed.

    :param x:
    :param tr:
    :return:
    """

    y=winsorize(x, limits=(tr,tr))
    wv = np.var(y, ddof=1)

    # x=x[~np.isnan(x)]
    # y=np.sort(x)
    # n=len(x)
    # ibot = int(np.floor(tr * n))
    # itop = len(x) - ibot -1
    # xbot = y[ibot]
    # xtop = y[itop]
    # y = np.where(y <= xbot, xbot, y)
    # y = np.where(y >= xtop, xtop, y)
    # wv = np.var(y, ddof=1) # DF to be consistent with Wilcox/R

    return wv
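A quick sanity check of winvar (my addition; toy data, assuming winsorize is imported in winvar's module as above): a single wild observation inflates the ordinary sample variance far more than the Winsorized one.

import numpy as np

x = np.concatenate([np.random.default_rng(1).normal(size=100), [25.0]])  # one wild point
print(round(float(np.var(x, ddof=1)), 3), round(float(winvar(x)), 3))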
Example #17
def get_norm_side(mean, vol, ret, z):
    side = winsorize(ret, limits=[0.025, 0.025])
    side[(ret - mean) / np.sqrt(vol) > z] = 1
    side[(ret - mean) / np.sqrt(vol) < -z] = -1
    side[((ret - mean) / np.sqrt(vol) >= -z)
         & ((ret - mean) / np.sqrt(vol) <= z)] = 0
    return side
Example #18
def treat_outliers(dataframe):
    # winsorize every numeric column: clip the bottom 5% and the top 10% of values
    for col in dataframe.select_dtypes(include=np.number).columns:
        dataframe[col] = winsorize(dataframe[col], limits=[0.05, 0.1], inclusive=(True, True))

    return dataframe
Example #19
def et(ax, d):
    print "ET-------------"

    A = np.transpose(d.jx)
    (lines, cols) = np.shape(A)
    window = np.hamming(lines).reshape(lines, 1)
    #A *= window

    ny, nx = np.shape(A)
    #print "shape:", np.shape(A)

    # configuration space parameter
    #x = np.arange(nx)*dx
    x = d.x
    #print "x:"
    #print x
    x1 = 1
    x2 = nx

    #temporal guiding vector
    #t = np.arange(ny)*dt
    t = d.time
    #print "t:"
    #print t
    t1 = 0
    t2 = ny

    # Change to spectra by considering |F]
    #F = A*A #energy
    F = A
    print "min/max:", np.min(F), np.max(F)

    X, T = np.meshgrid(x, t)

    #print "nx=",nx
    #print "ny=",ny
    #print np.shape(X)
    #print np.shape(T)

    #slow (but more flexible) pcolormesh that takes guiding grid
    #im = ax.pcolormesh(K[w1:w2, k1:k2], W[w1:w2, k1:k2], F[w1:w2, k1:k2],
    #            cmap='plasma',
    #            #vmin=ff.min(),
    #            #vmax=ff.max(),
    #            )

    F = mstats.winsorize(F, limits=[0.01, 0.01])
    vminmax = np.maximum(np.abs(np.min(F)), np.abs(np.max(F)))

    #faster plotting with imshow
    im = ax.imshow(F[t1:t2, x1:x2],
                   extent=[x[x1], x[x2 - 1], t[t1], t[t2 - 1]],
                   origin='lower',
                   aspect='auto',
                   interpolation='nearest',
                   cmap='RdYlGn',
                   vmin=-vminmax,
                   vmax=vminmax)

    return F
Example #20
def tobc(curr_arr, block_size, central_tendency, cutting_ratio, percentile):
    answer = []
    for curr_col in range(np.shape(curr_arr)[1]):
        curr_row = 0
        curr_col_moving = curr_col
        sum_ = 0
        median_list = []
        while curr_row < np.shape(curr_arr)[0]:
            sum_ = sum_ + curr_arr[curr_row][curr_col_moving]
            median_list.append(curr_arr[curr_row][curr_col_moving])
            curr_col_moving = curr_col_moving + 1
            if curr_col_moving == np.shape(curr_arr)[1]:
                curr_col_moving = 0
            curr_row = curr_row + 1
        if central_tendency == "sum":
            temp_var = sum_
        if central_tendency == "mean":
            temp_var = sum_ / block_size[0]
        if central_tendency == "median":
            temp_var = median(median_list)
        if central_tendency == "trim_mean":
            temp_var = stats.trim_mean(np.array(median_list), cutting_ratio)
        if central_tendency == "percentile":
            temp_var = np.percentile(np.array(median_list), percentile)
        if central_tendency == "win_mean":
            temp_var = mean(winsorize(np.array(median_list), cutting_ratio))
        answer.append(temp_var)
    mid_answer = np.copy(circulant(answer).transpose())
    return mid_answer
Example #21
def preprocess(a):
    a = a.astype(np.float64)
    a[np.isinf(a)] = np.nan
    a = np.nan_to_num(a - np.nanmean(a))
    a = winsorize(a, limits=[WIN_LIMIT, WIN_LIMIT])

    return preprocessing.scale(a)
Example #22
def preprocess(a):
    
    a = a.astype(np.float64)
    a[np.isinf(a)] = np.nan
    a = np.nan_to_num(a - np.nanmean(a))
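    # note: limits=[0.02, 0.98] clips the lowest 2% and the highest 98% of values (an asymmetric choice)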
    a = winsorize(a, limits=[0.02,0.98])
    
    return a
Example #23
def normalize_image_0_1_by_6channel(X, y):
    print('--- Normalizing by channel, expecting 6 channel input ---')
    assert X.shape[4] == 6
    X_winsor = np.copy(X)
    # Only winsorize across 0 1 2, since 3 4 5 is already [0 1]
    X_winsor[:, :, :, :, 0] = mstats.winsorize(X[:, :, :, :, 0], [0, 0.1])
    X_winsor[:, :, :, :, 1] = mstats.winsorize(X[:, :, :, :, 1], [0, 0.1])
    X_winsor[:, :, :, :, 2] = mstats.winsorize(X[:, :, :, :, 2], [0, 0.1])

    X_norm = np.copy(X_winsor)
    X_norm[:, :, :, :,
           0:3] = X_winsor[:, :, :, :, 0:3] - X_winsor[:, :, :, :, 0:3].min()
    # divide only where the winsorized values are positive; elsewhere keep the shifted
    # values (np.divide with where= but no out= would leave those entries uninitialized)
    X_norm[:, :, :, :, 0:3] = np.where(X_winsor[:, :, :, :, 0:3] > 0,
                                       X_norm[:, :, :, :, 0:3] / X_winsor[:, :, :, :, 0:3].std(),
                                       X_norm[:, :, :, :, 0:3])

    return X_norm, y
Example #24
def vectorWinsorize(input_v, limits=0.5):
    '''
    return a copy of input_v winsorized with the given limits
    '''

    result = np.copy(input_v)
    result = winsorize(result, limits=limits)

    return result
Example #25
    def winsorize_df(self, train_df, test_df, valid_df, cols, lower, upper, test_set, valid_set):
        """Function to winsorize numeric values in a DataFrame to remove potential outliers

        Parameters
        ----------
        train_df : pandas.DataFrame
            DataFrame containing the training data
        test_df : pandas.DataFrame
            DataFrame containing the test data
        valid_df : pandas.DataFrame
            DataFrame containing the validation data
        cols : list
            list of columns to winsorize
        lower : float
            Lower value e.g. 0.05 will cap the data at the 5th percentile
        upper : float
            Upper value e.g. 0.05 will cap the data at the 95th percentile
        test_set : bool
            Boolean indicating if a test set has been provided
        valid_set : bool
            Boolean indicating if a validation set has been provided

        Returns
        -------
        train_df : pandas.DataFrame
           DataFrame containing the winsorized training data
        test_df : pandas.DataFrame
            DataFrame containing the winsorized test data
        valid_df : pandas.DataFrame
            DataFrame containing the winsorized validation data

        """

        for i in range(0, len(cols)):

            winsor = pd.DataFrame(winsorize(train_df[cols[i]], limits=(lower, upper)),
                                  index=train_df.index)
            winsor.columns = [cols[i]]

            winsor_min = winsor[cols[i]].min()
            winsor_max = winsor[cols[i]].max()

            # Replace the column with the winsorized version for the training data
            # (drop/concat return new frames, so the results must be reassigned)
            train_df = train_df.drop(cols[i], axis=1)
            train_df = pd.concat([train_df, winsor], axis=1)

            # Now replace in the test and validation sets using the values from the training data
            if test_set:
                test_df.loc[test_df[cols[i]] > winsor_max, cols[i]] = winsor_max
                test_df.loc[test_df[cols[i]] < winsor_min, cols[i]] = winsor_min

            if valid_set:
                valid_df.loc[valid_df[cols[i]] > winsor_max, cols[i]] = winsor_max
                valid_df.loc[valid_df[cols[i]] < winsor_min, cols[i]] = winsor_min

        # return only after every requested column has been winsorized
        return train_df, test_df, valid_df
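A more compact sketch of the same train-derived capping (my addition, not the project's API; the function name and arguments are illustrative): winsorize the training column, then clip the other splits with the resulting min/max.

import pandas as pd
from scipy.stats.mstats import winsorize

def cap_with_train_bounds(train, test, col, lower=0.05, upper=0.05):
    # winsorize the training column, then reuse its min/max as caps for the test split
    train = train.copy()
    test = test.copy()
    train[col] = pd.Series(winsorize(train[col], limits=(lower, upper)), index=train.index)
    low_cap, high_cap = train[col].min(), train[col].max()
    test[col] = test[col].clip(low_cap, high_cap)
    return train, test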
Example #26
 def Winsorize(self, df, colname, tile):
     dfpivot2 = df[colname].astype(float).values
     mask = np.isnan(dfpivot2)
     wnp = mstats.winsorize(dfpivot2,
                            limits=[tile, tile],
                            axis=0,
                            inplace=True)
     wnp[mask] = np.nan  # restore NaNs: winsorize would otherwise leave them set to the clipped extreme values
     df[colname] = wnp
     return (df)
Example #27
 def transform(self, X):
     if self.method == 'winsorize':
         for col in X.columns.tolist():
             X.loc[:, col] = mstats.winsorize(X[col],
                                              limits=(self.low, self.high))
     elif self.method == 'ceilfloor':
         for col in X.columns.tolist():
             # settingwithcopy warning
             X.loc[X[col] < self.low, col] = self.low
             X.loc[X[col] > self.high, col] = self.high
     return X
Example #28
    def transform(self, X):
        data = X.copy()

        # For each grade column, winsorize with data-driven limits: the share of zero
        # scores sets the lower limit, the share of scores above 10 sets the upper limit.
        for col in ['NOTA_DE', 'NOTA_EM', 'NOTA_MF', 'NOTA_GO']:
            lim_inf = len(data[data[col] == 0.0]) / len(data)
            lim_sup = len(data[data[col] > 10.0]) / len(data)
            data[col] = winsorize(data[col], limits=[lim_inf, lim_sup])

        return data
Example #29
def ols_beta(df, window):
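    # note: pandas.stats.ols.MovingOLS is only available in old pandas releases (the pandas.stats module was removed in pandas 0.20)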
    model = pd.stats.ols.MovingOLS(y=df.ic,
                                   x=df[['ih']],
                                   window_type='rolling',
                                   window=window,
                                   intercept=True)
    df['ols_beta'] = model.beta.ih.shift(1)
    df['ols_r2'] = model.r2
    df.ols_beta = mstats.winsorize(df.ols_beta, limits=[0.01, 0.01])
    #df = df.replace([np.inf, -np.inf], np.nan).dropna()
    return df
Example #30
def _winsorize(a, limits=None, inclusive=(True, True)):
    # drop masked data
    a1 = np.ma.compressed(a)

    # use .data to return an np.ndarray instead of a masked array
    try:
        wa = winsorize(a1, limits=limits, inclusive=inclusive).data
    except IndexError:
        wa = np.zeros(0, dtype=a.dtype)

    return wa
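A small check of the helper above (my addition; the masked input is synthetic, and the call assumes winsorize is imported in the helper's module): compressed() drops the masked entry before winsorizing, and .data unwraps the result to a plain ndarray.

import numpy as np
import numpy.ma as ma

a = ma.masked_invalid([1.0, 2.0, np.nan, 3.0, 4.0, 100.0])
wa = _winsorize(a, limits=(0.0, 0.2))   # clips only the top 20% of the unmasked values
print(type(wa), len(wa))                # <class 'numpy.ndarray'> 5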
Example #31
    def fm_regression(self):
        data = self.cache_data.loc[:, ['forward_return'] + self.cache['factor_names']].copy()
        # winsorize each factor column (applying winsorize to individual scalars would be a no-op)
        for factor in self.cache['factor_names']:
            data[factor] = winsorize(data[factor], limits=(0.25, 0.25))
        # cross-sectional regression

        # time-series regression

        # test
        pass
Example #32
def winsorize_norm_chromosome_data(read_5p_ends, chromosome, strand, genome_dict, nucs_to_count, to_winsorize = True, low = 0, high = 0.95):
    """
    :param read_5p_ends: nested dict of read 5'-end counts keyed as [strand][chromosome][position]
    :param chromosome: chromosome to process
    :param strand: strand to process
    :param genome_dict: dict mapping each chromosome name to its sequence
    :param nucs_to_count: nucleotides at which positions are counted
    :param to_winsorize: whether to winsorize the densities before normalizing
    :param low: lower winsorization bound
    :param high: upper winsorization bound (values above this quantile are clipped)
    :return: an array (now zero-indexed from 1-indexed) of densities for the given chromosome on the given strand, winsorized, and only for the given nucleotides
    """
    max_position = max(read_5p_ends[strand][chromosome].keys())
    density_array =numpy.array([0] * max_position)
    for position in read_5p_ends[strand][chromosome].keys():
        if genome_dict[chromosome][position-1] in nucs_to_count:
            density_array[position-1] = read_5p_ends[strand][chromosome][position]
    if to_winsorize:
        winsorize(density_array, limits = (low, 1-high), inplace = True)
    normed_array = density_array/float(max(density_array))
    return  normed_array
Example #33
def normalize_dict_to_max(mutation_dict, winsorize_data = False, winsorization_limits = (0, 0.95)):
    all_values = []
    normed_dict = {}
    for strand in mutation_dict:
        normed_dict[strand] = {}
        for chromosome in mutation_dict[strand]:
            normed_dict[strand][chromosome] = {}
            #print mutation_dict[strand][chromosome].values()
            all_values += mutation_dict[strand][chromosome].values()
            #print all_values
    if winsorize_data:
        # capture the return value: winsorize cannot modify a plain Python list in place
        all_values = winsorize(numpy.array(all_values), limits = (winsorization_limits[0], 1-winsorization_limits[1]))

    max_value = float(max(all_values))
    for strand in mutation_dict:
        for chromosome in mutation_dict[strand]:
            for position in mutation_dict[strand][chromosome]:
                val = mutation_dict[strand][chromosome][position]
                if val < min(all_values):
                    val = min(all_values)
                if val > max(all_values):
                    val = max(all_values)
                normed_dict[strand][chromosome][position] = val/max_value
    return normed_dict
Example #34
    def fit_transform(self, X):
        self.fitted = True

        # integer width of each electrode's signal block (// avoids a float slice bound)
        sizeSig = np.size(X, 1) // self.cardElec

        for numElec in range(self.cardElec):
            elecSlice = slice(numElec * sizeSig, (numElec + 1) * sizeSig, 1)
            X[:, elecSlice] = winsorize(X[:, elecSlice], limits=self.limit)

        self.maximum = np.max(X)
        self.minimum = np.min(X)

        return X
Example #35
 def winsorize_series(group,trim_prop):
     return mstats.winsorize(group, limits=[trim_prop,trim_prop]).mean()
Example #36
def winsor(s):
    return mstats.winsorize(s, limits=[0.01, 0.01])
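One caveat that applies to thin wrappers like the two above (my addition, not from either project): mstats.winsorize returns a numpy masked array, so callers that need a pandas object usually rewrap the result.

import pandas as pd
from scipy.stats import mstats

s = pd.Series([1.0, 2.0, 3.0, 100.0])
clipped = mstats.winsorize(s, limits=[0.25, 0.25])   # numpy masked array
back_to_series = pd.Series(clipped, index=s.index)   # rewrap for downstream pandas code
print(back_to_series.tolist())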