def _process(self, element, key=None):
        x, y = (element.dimension_values(i) for i in range(2))
        x_dim, y_dim = (element.dimensions()[i] for i in range(2))

        bins = self.p.bins
        if bins is None:
            bins = 10
        if isinstance(bins, int):
            if isdatetime(x):
                bins = pd.date_range(x.min(), x.max(), periods=bins)
            else:
                bins = np.linspace(x.min(), x.max(), bins)
        else:
            bins = np.array(bins)

        x_avg = bins[:-1] + np.diff(bins) / 2
        y_avg, y16, y84 = (np.full(len(x_avg), np.nan) for _ in range(3))
        for k, (ll, ul) in enumerate(zip(bins[:-1], bins[1:])):
            y_sel = y[(ll < x) & (x <= ul)]
            y_avg[k] = self.p.avg_fun(y_sel)
            y16[k] = np.nanquantile(y_sel, q=0.16)
            y84[k] = np.nanquantile(y_sel, q=0.84)
        errors = {
            x_dim.name: x_avg,
            y_dim.name: y_avg,
            'y16': y_avg - y16,
            'y84': y84 - y_avg
        }
        return hv.ErrorBars(errors, kdims=[x_dim], vdims=[y_dim, 'y16', 'y84'])
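Setting the HoloViews plumbing aside, the core of the example is binning y by x and pairing each bin's mean with its 16th/84th percentiles (a ±1σ band for Gaussian data). A minimal standalone NumPy sketch of that step, on made-up data:

import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 1000)
y = 2 * x + rng.normal(0, 1, 1000)

bins = np.linspace(0, 10, 11)
x_avg = bins[:-1] + np.diff(bins) / 2
y_avg, y16, y84 = (np.full(len(x_avg), np.nan) for _ in range(3))
for k, (ll, ul) in enumerate(zip(bins[:-1], bins[1:])):
    y_sel = y[(ll < x) & (x <= ul)]
    y_avg[k] = np.nanmean(y_sel)
    y16[k] = np.nanquantile(y_sel, q=0.16)
    y84[k] = np.nanquantile(y_sel, q=0.84)

# Asymmetric error-bar lengths, as passed to hv.ErrorBars above:
lower, upper = y_avg - y16, y84 - y_avg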
Example no. 2
def get_gene_stats(xvals, col_idxs, tissues):
    """
    Compute summary stats across all samples for a given gene & tissue
    """

    xmin, xq1, xmed, xmean, xq3, xmax, xsd, xmad = [], [], [], [], [], [], [], []

    for tissue in tissues:
        tidx = [col_idxs[s] for s in tissues[tissue] if s in col_idxs]
        if len(tidx) > 0:
            tvals = xvals[tidx]
            xmin.append(np.nanmin(tvals))
            xq1.append(np.nanquantile(tvals, q=0.25))
            xmed.append(np.nanmedian(tvals))
            xmean.append(np.nanmean(tvals))
            xq3.append(np.nanquantile(tvals, q=0.75))
            xmax.append(np.nanmax(tvals))
            xsd.append(np.nanstd(tvals))
            xmad.append(mad(tvals))
        else:
            xmin.append(np.nan)
            xq1.append(np.nan)
            xmed.append(np.nan)
            xmean.append(np.nan)
            xq3.append(np.nan)
            xmax.append(np.nan)
            xsd.append(np.nan)
            xmad.append(np.nan)

    return xmin, xq1, xmed, xmean, xq3, xmax, xsd, xmad
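A hedged usage sketch with toy inputs; it assumes `mad` comes from SciPy's `median_abs_deviation` (the original's imports are not shown):

import numpy as np
from scipy.stats import median_abs_deviation as mad  # assumed source of `mad`

xvals = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])  # one gene, six samples
col_idxs = {"s%d" % i: i for i in range(6)}       # sample id -> column index
tissues = {"liver": ["s0", "s1", "s2"],
           "brain": ["s3", "s4", "s5"],
           "skin": ["s9"]}                        # no matching samples

xmin, xq1, xmed, xmean, xq3, xmax, xsd, xmad = get_gene_stats(xvals, col_idxs, tissues)
print(xmed)  # [2.0, 5.0, nan] -- "skin" falls through to the NaN branch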
Example no. 3
def plot_prop_pdf_stack(source):
    pmh = ParmapHandler(source)
    props = ['tkin', 'texc', 'ncol', 'sigm', 'vcen']
    all_bins = [
        np.linspace(lo, hi, 100) for lo, hi in [
            (7.0, 25.0),  # tkin, K
            (2.7, 15.0),  # texc, K
            (12.0, 17.0),  # ncol, log(cm^-2)
            (0.0, 2.0),  # sigm, km/s
            (-3.0, 3.0),  # vcen, km/s (relative)
        ]
    ]
    fig, axes = plt.subplots(ncols=1, nrows=len(props), figsize=(4, 6))
    for prop, bins, ax in zip(props, all_bins, axes):
        data = pmh.get_hdu(prop, rel_velo=True).data
        vals = data.flatten()
        med = np.nanmedian(vals)
        qlo = np.nanquantile(vals, 0.165)
        qhi = np.nanquantile(vals, 0.835)
        hist, _, _ = ax.hist(vals, bins=bins, density=True, color='0.3')
        ax.vlines(
            [qlo, med, qhi],
            0,
            hist.max(),
            linestyles=['dotted', 'dashed', 'dotted'],
            colors='red',
        )
        ax.set_xlim(bins.min(), bins.max())
        ax.set_xlabel(pmh.get_label(prop))
    ax.set_ylabel('PDF')
    plt.tight_layout(h_pad=0.5)
    save_figure(f'{source}_prop_pdf_stack', do_eps=False)
Example no. 4
def replace_outliers_iqr(pivot_outliers, k):
    '''Replace outliers using an IQR (interquartile-range) fence method'''

    pivot_no_outliers = pd.DataFrame(columns=pivot_outliers.columns,
                                     index=pivot_outliers.index)
    pivot_no_outliers.rename(columns={'with_outliers': 'without_outliers'},
                             level=0,
                             inplace=True)

    for x in pivot_outliers.index:
        values = pivot_outliers.loc[x, :].values
        if np.nanstd(values) != 0 and np.isnan(values).sum() != len(values):
            Q1 = np.nanquantile(values, 0.25)
            Q3 = np.nanquantile(values, 0.75)
            IQR = Q3 - Q1
            LB = Q1 - k * IQR
            UB = Q3 + k * IQR
            new_values = np.where((values < LB) | (values > UB),
                                  np.nanmedian(values), values)
        else:
            new_values = values

        pivot_no_outliers.iloc[
            pivot_outliers.index.get_loc(x), :] = new_values.astype('float')

    return pivot_no_outliers
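Stripped of the pivot-table bookkeeping, the rule applied per row is: values outside [Q1 - k*IQR, Q3 + k*IQR] are swapped for the median. A self-contained check:

import numpy as np

values = np.array([9.0, 10.0, 11.0, 10.5, 55.0])  # 55.0 is the outlier
k = 1.5
q1, q3 = np.nanquantile(values, 0.25), np.nanquantile(values, 0.75)
lb, ub = q1 - k * (q3 - q1), q3 + k * (q3 - q1)
cleaned = np.where((values < lb) | (values > ub), np.nanmedian(values), values)
print(cleaned)  # [9.0, 10.0, 11.0, 10.5, 10.5] -- 55.0 replaced by the median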
Example no. 5
File: dataset.py Project: rizac/sod
def _dfinfo(dataframe, asstring=True):
    '''Returns a dataframe with info about the given `dataframe`
    '''

    infocols = ['Min', 'Median', 'Max', '#NAs', '#<1Perc.', '#>99Perc.']
    sum_df = odict()
    # if _dfr.empty
    for col in floatingcols(dataframe):
        q01 = np.nanquantile(dataframe[col], 0.01)
        q99 = np.nanquantile(dataframe[col], 0.99)
        df1, df99 = dataframe[(dataframe[col] < q01)], dataframe[(dataframe[col] > q99)]
        # segs1 = len(pd.unique(df1[ID_COL]))
        # segs99 = len(pd.unique(df99[ID_COL]))
        # stas1 = len(pd.unique(df1['station_id']))
        # stas99 = len(pd.unique(df99['station_id']))

        sum_df[col] = {
            infocols[0]: np.nanmin(dataframe[col]),
            infocols[1]: np.nanquantile(dataframe[col], 0.5),
            infocols[2]: np.nanmax(dataframe[col]),
            infocols[3]: (~np.isfinite(dataframe[col])).sum(),
            infocols[4]: len(df1),
            infocols[5]: len(df99)
            # columns[5]: stas1 + stas99,
        }
    return pd.DataFrame(data=list(sum_df.values()),
                        columns=infocols,
                        index=list(sum_df.keys()))
Example no. 6
def search_assemble_mld_radavg(wod_dbase,
                               lon_arr,
                               lat_arr,
                               kmrad=1e2,
                               crit=0.0528e0):
    avg_mlds = []
    std_mlds = []
    min_mlds = []
    max_mlds = []
    for n, (lonc, latc) in enumerate(zip(lon_arr, lat_arr)):
        print(n, lonc, latc)
        locs = lonlat_inside_km_radius(wod_dbase['lon'], wod_dbase['lat'],
                                       (lonc, latc), kmrad)
        wod_loc_subset = wod_dbase[locs]
        wod_loc_subset = quik_quality_control(wod_loc_subset)
        print("found %s good profiles in area" % len(wod_loc_subset))
        if len(wod_loc_subset) > 0:
            vars_arr = derive_variables(wod_loc_subset, which_ones='all')
            mlds = [
                calc_mld(SA, CT, P, crit=crit) for SA, CT, P in zip(
                    vars_arr[0], vars_arr[1], wod_loc_subset['pres'])
            ]
            avg_mlds.append(np.nanmedian(mlds))
            std_mlds.append(np.nanstd(mlds))
            min_mlds.append(np.nanquantile(mlds, .05))
            max_mlds.append(np.nanquantile(mlds, .95))
        else:
            # Keep the outputs aligned with (lon_arr, lat_arr) when no
            # good profiles fall inside the search radius.
            avg_mlds.append(np.nan)
            std_mlds.append(np.nan)
            min_mlds.append(np.nan)
            max_mlds.append(np.nan)

    return np.asarray(avg_mlds), np.asarray(std_mlds), np.asarray(
        min_mlds), np.asarray(max_mlds)
Example no. 7
def stat_summarizer(figure):
    avg_perf = np.nanmean(figure)
    min_perf = np.nanmin(figure)
    q1_perf = np.nanquantile(figure, 0.25)
    med_perf = np.nanmedian(figure)
    q3_perf = np.nanquantile(figure, 0.75)
    max_perf = np.nanmax(figure)
    stdev = np.nanstd(figure)
    medianabdev = stats.median_absolute_deviation(figure, nan_policy='omit')
    sharpe = avg_perf / stdev
    sharpemad = med_perf / medianabdev

    finaldict = {
        'avg_perf': avg_perf,
        'med_perf': med_perf,
        'stdev': stdev,
        'medianabdev': medianabdev,
        'min_perf': min_perf,
        'max_perf': max_perf,
        'q1_perf': q1_perf,
        'q3_perf': q3_perf,
        'sharpe': sharpe,
        'sharpemad': sharpemad
    }

    return finaldict
Example no. 8
def _make_area_buffer(d_x, d_y, q=1):
    n_rows, n_cols = d_y.shape
    with np.errstate(divide='ignore'):
        y_min = np.nanquantile(d_y, 1 - q, 0)
        y_max = np.nanquantile(d_y, q, 0)
    # mean = np.nanmedian(d_y)
    # y_min[np.isnan(y_min)] = mean
    # y_max[np.isnan(y_max)] = mean
    masks = using_clump(y_min)
    # polygon = np.concatenate((np.dstack((d_x,y_max))[0],np.dstack((d_x[::-1],y_min[::-1]))[0]))
    mesh_vertice = []
    mesh_face = []
    last_index = 0
    for m in masks:
        _max = np.vstack((d_x[m], y_max[m])).T
        _min = np.vstack((d_x[m][::-1], y_min[m][::-1])).T
        # p = np.concatenate((_max,_min)).tolist()
        mv, mf = polygon2mesh(_max, _min)
        if len(mf) == 0:
            continue
        mf += last_index
        mesh_vertice.append(mv)
        mesh_face.append(mf)
        last_index = mf[-1, -1] + 1
    if len(mesh_vertice) == 0:
        return None, None
    return np.concatenate(mesh_vertice), np.concatenate(mesh_face)
Example no. 9
def bootstrap_ci(estimate,
                 straps,
                 alpha=0.05,
                 method='pivot',
                 axis=0,
                 stack=True):
    """
    Return pivot CIs
      This confidence interval returned is a pivotal CIs,
          C_l = 2 T - Q(1-α/2)
          C_u = 2 T - Q(α/2)
       where T is the estimator for the stastistic T, and α is the confidence level,
       and Q(x) is the empirical x percentile across the bootstraps.
    """
    qlower, qupper = (np.nanquantile(straps, alpha / 2, axis=axis),
                      np.nanquantile(straps, 1 - alpha / 2, axis=axis))
    if method == 'percentile':
        CIs = qlower, estimate, qupper
    elif method == 'pivot':
        CIs = 2 * estimate - qupper, estimate, 2 * estimate - qlower
    else:
        raise ValueError("method must be either 'pivot' or 'percentile'")
    if stack:
        return np.stack(CIs)
    return CIs
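A usage sketch on synthetic data (bootstrapping the sample mean); the data and replicate count are made up:

import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(loc=5.0, scale=2.0, size=500)
estimate = data.mean()

# Resample with replacement and recompute the statistic per replicate.
straps = np.array([rng.choice(data, size=data.size).mean()
                   for _ in range(2000)])

lower, est, upper = bootstrap_ci(estimate, straps, alpha=0.05, method='pivot')
print("95%% pivot CI: [%.3f, %.3f] around %.3f" % (lower, upper, est))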
Example no. 10
def stat_summarizer_old(figure):
    avg_perf = np.nanmean(figure)
    min_perf = np.nanmin(figure)
    q1_perf = np.nanquantile(figure, 0.25)
    med_perf = np.nanmedian(figure)
    q3_perf = np.nanquantile(figure, 0.75)
    max_perf = np.nanmax(figure)
    iqr_perf = q3_perf - q1_perf
    max_min = max_perf - min_perf
    maxq3 = max_perf - q3_perf
    q1min = q1_perf - min_perf
    stdev = np.nanstd(figure)
    medianabdev = stats.median_absolute_deviation(figure, nan_policy='omit')

    finaldict = {
        'avg_perf': avg_perf,
        'min_perf': min_perf,
        'q1_perf': q1_perf,
        'med_perf': med_perf,
        'q3_perf': q3_perf,
        'max_perf': max_perf,
        'iqr_perf': iqr_perf,
        'max_min': max_min,
        'maxq3': maxq3,
        'q1min': q1min,
        'stdev': stdev,
        'medianabdev': medianabdev
    }

    return finaldict
Example no. 11
def _inlier_range(series):
    low = np.nanquantile(series, 0.01)
    high = np.nanquantile(series, 0.99)
    assert low <= high
    # the factor of two below is a complete hack
    inner_range = (high - low) / 2
    return low - inner_range, high + inner_range
Example no. 12
def scatter_plot_by_wind(wind_low_threshold, wind_up_threshold, x, y,
                         raster_variable_name, station_name):
    sns.set(rc={'figure.figsize': (9, 5)})
    sns.set_theme(style="white")
    scatter_file = os.path.join(
        dir_comparison_plots,
        raster_variable_name + "_wind_%s_%s_station_%s_scatterplot.png" %
        (wind_low_threshold, wind_up_threshold, station_name))
    # scatterplot
    if len(x) == 0 or len(y) == 0:
        return
    try:
        m, b = np.polyfit(x, y, 1)
    except np.linalg.LinAlgError:  # only zeros (nans)
        return
    regress = linregress(x, y)
    plt.plot(x, m * x + b, color="#2b2b2b")
    sns.scatterplot(x=x, y=y, color="#c404ab")
    plt.gca().xaxis.set_tick_params(labelsize=8)
    plt.gca().yaxis.set_tick_params(labelsize=8)
    plt.text(np.nanquantile(x, [0.025])[0],
             np.nanquantile(y, [0.9])[0],
             "Lin. regression\nr-value: %s\nslope: %s" %
             (np.round(regress.rvalue, 2), np.round(regress.slope, 2)),
             fontsize=8)
    plt.ylabel("S2 trucks")
    plt.xlabel(raster_variable_name)
    plt.title("UBA station %s | Wind direction %s-%s" %
              (station_name, wind_low_threshold, wind_up_threshold))
    plt.savefig(scatter_file, dpi=300)
    plt.close()
Example no. 13
def thr_IQR(x, times=3, series=False, exclude_zero=True):
    """ 
    if series is True, the last axis should be series 
    """

    if series is False:
        x = x[..., None]

    if exclude_zero is True:
        qu = np.asarray([
            np.nanquantile(x[..., i][x[..., i] != 0], 0.75)
            for i in range(x.shape[-1])
        ])
        ql = np.asarray([
            np.nanquantile(x[..., i][x[..., i] != 0], 0.25)
            for i in range(x.shape[-1])
        ])
    else:
        qu = np.asarray(
            [np.nanquantile(x[..., i], 0.75) for i in range(x.shape[-1])])
        ql = np.asarray(
            [np.nanquantile(x[..., i], 0.25) for i in range(x.shape[-1])])

    x_post = copy.deepcopy(x)
    x_post[x_post > (qu + times * (qu - ql))] = np.nan
    x_post[x_post < (ql - times * (qu - ql))] = np.nan

    if series is False:
        return x_post[..., 0]
    else:
        return x_post
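A quick check of the 1-D path (default times=3, zeros excluded; it assumes the original's module-level `import copy` is in scope): one gross outlier becomes NaN, everything else passes through.

import numpy as np

x = np.array([1.0, 2.0, 3.0, 2.5, 100.0, 1.5])
print(thr_IQR(x))  # [1.0, 2.0, 3.0, 2.5, nan, 1.5]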
Example no. 14
def _dfinfo(dataframe):
    '''Returns a dataframe with statistical info about the given `dataframe`
    '''
    infocols = ['Min', 'Median', 'Max', '#NAs', '#<1Perc.', '#>99Perc.']
    defaultcolvalues = [np.nan, np.nan, np.nan, 0, 0, 0]
    sum_df = odict()
    # if _dfr.empty
    for col in floatingcols(dataframe):
        colvalues = defaultcolvalues
        if not dataframe.empty:
            q01 = np.nanquantile(dataframe[col], 0.01)
            q99 = np.nanquantile(dataframe[col], 0.99)
            df1, df99 = dataframe[(dataframe[col] <
                                   q01)], dataframe[(dataframe[col] > q99)]
            colvalues = [
                np.nanmin(dataframe[col]),  # Min
                np.nanquantile(dataframe[col], 0.5),  # Median
                np.nanmax(dataframe[col]),  # Max
                (~np.isfinite(dataframe[col])).sum(),  # #NAs
                len(df1),  # #<1Perc.
                len(df99)  # #>99Perc.
            ]

        sum_df[col] = {i: v for i, v in zip(infocols, colvalues)}

    return pd.DataFrame(data=list(sum_df.values()),
                        columns=infocols,
                        index=list(sum_df.keys()))
Example no. 15
    def _get_target_neighbors(self, df_sched_expected_, n_neighbors=2):
        agg_funcs = {'mean': lambda x: np.mean(x, axis=1),
                     'min': lambda x: np.min(x, axis=1),
                     'max': lambda x: np.max(x, axis=1),
                     'q25': lambda x: np.nanquantile(x, 0.25, axis=1),
                     'median': lambda x: np.nanquantile(x, 0.5, axis=1),
                     'q75': lambda x: np.nanquantile(x, 0.75, axis=1),
                     'std': lambda x: np.std(x, axis=1),
                     # 'mean_diff': lambda x: np.nanmean(np.diff(x.fillna(method='pad', axis=1), axis=1), axis=1),
                     'count': lambda x: np.sum(~np.isnan(x), axis=1),
                     'sum': lambda x: np.sum(x, axis=1)}

        df_sim = pd.DataFrame()

        print(f"Calculating aggregate statistics for {self.target_value} behavior.")
        for agg, func in agg_funcs.items():
            df_sim[agg] = func(df_sched_expected_)

        X_sim, _ = self._pca_reduction(df_sim)

        print(f"Computing {self.target_value} nearest neighbors by similarity in aggregate statistics.")
        neighbors = NearestNeighbors(n_neighbors=n_neighbors)
        neighbors.fit(X_sim)
        distances, indices = neighbors.kneighbors(X_sim)

        return distances, indices
Example no. 16
def scatterplot(xs, ys, xlabel, ylabel, id_line=False, linewidth=1, ax=None):
    """
    General scatterplot function

    :param xs:
    :type xs: array-like
    :param ys:
    :type ys: array-like
    :param xlabel:
    :param ylabel:
    :param id_line: boolean, whether or not to plot identity line
    :param linewidth:
    :param ax:
    :return: figure handle or Axes object
    """
    return_fig = False
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(4, 4))
        return_fig = True
    if id_line:
        lmin = np.nanmin([np.nanquantile(xs, 0.01), np.nanquantile(ys, 0.01)])
        lmax = np.nanmax([np.nanquantile(xs, 0.99), np.nanquantile(ys, 0.99)])
        ax.plot([lmin, lmax], [lmin, lmax], '-', color=[0.7, 0.7, 0.7], linewidth=linewidth)
    ax.scatter(xs, ys, marker='.', s=150, edgecolors=[1, 1, 1], alpha=1.0, color='k')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    if return_fig:
        plt.show()
        return fig
    else:
        return ax
Example no. 17
def quick_min_max(x, q=None):
    """Estimate the min/max values of input by down-sampling.

    :param numpy.ndarray x: data, 2D array for now.
    :param float/None q: quantile when calculating the min/max, which
        must be within [0, 1].

    :return tuple: (min, max)
    """
    if not isinstance(x, np.ndarray):
        raise TypeError("Input must be a numpy.ndarray!")

    if x.ndim != 2:
        raise ValueError("Input must be a 2D array!")

    while x.size > 1e5:
        sl = [slice(None)] * x.ndim
        sl[np.argmax(x.shape)] = slice(None, None, 2)
        x = x[tuple(sl)]

    if q is None:
        return np.nanmin(x), np.nanmax(x)

    if q < 0.5:
        q = 1 - q

    # Let np.nanquantile handle the case when q is outside [0, 1]
    # caveat: nanquantile is about 30 times slower than nanmin/nanmax
    return np.nanquantile(x, 1 - q, interpolation='nearest'), \
           np.nanquantile(x, q, interpolation='nearest')
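A usage sketch on a synthetic 2D array. Note that NumPy 1.22 renamed the `interpolation` keyword used above to `method`:

import numpy as np

rng = np.random.default_rng(0)
img = rng.normal(size=(2000, 2000))
img[::50, ::50] = np.nan

print(quick_min_max(img))          # plain nanmin/nanmax of the down-sampled data
print(quick_min_max(img, q=0.99))  # robust 1%/99% bounds instead of the extremes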
Example no. 18
def silverman_bw(x):
    import numpy as np
    s = np.nanstd(x)
    IQR = np.nanquantile(x, .75) - np.nanquantile(x, .25)
    A = np.min([s, IQR / 1.349])
    n = np.count_nonzero(~np.isnan(x))
    return 0.9 * A * (n**(-0.2))
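With the stray `df` reference fixed to `x` (above), a sanity check: for a standard-normal sample of n = 1000, Silverman's rule h = 0.9 * min(s, IQR/1.349) * n**(-1/5) should land near 0.9 * 1000**(-0.2), roughly 0.23.

import numpy as np

rng = np.random.default_rng(0)
sample = rng.normal(size=1000)
print(silverman_bw(sample))  # ~0.22 for a standard-normal sample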
Example no. 19
def getBounds(varName, data=[]):
    bounds = (None, None)
    if len(data) > 0:
        data = data.flatten()
        bounds = (np.nanquantile(data, 0.05), np.nanquantile(data, 0.95))
    else:
        bounds = getBoundsEx(varName)
    return bounds
Example no. 20
def measurement(predictions, signals, supervision_type="classification", **args):
    """
    Evaluate `predictions` against `signals` under the given supervision type.
    """
    if supervision_type == "classification":
        return np.mean(predictions - signals == 0)
    elif supervision_type == "l2":
        return np.mean(np.square(predictions - signals)) * 0.5
    elif supervision_type == "hit-MR":
        top_K = args["top_K"]
        info_df = args["info_df"]

        item_info_dict = dict(zip(list(info_df.take([0], axis = 1).values.flatten()), list(info_df.take([1], axis = 1).values.flatten())))
        
        n_usrs = signals.shape[0]
        n_results = len(top_K) + 1
        
        results = np.zeros((n_usrs, n_results))
        start_time = time.time()
        for idx, row in signals.iterrows():            
            try:
                retrieved_items = [item_info_dict[item_id] for item_id in predictions[row["usr_id"]][0].split(",") if item_id in item_info_dict]
                n_level, n_trigger_item = predictions[row["usr_id"]][1]

                test_items = set([item_info_dict[item_id] for item_id in row["test_item"].split(",") if item_id in item_info_dict])
                n_test = len(test_items)               
                
                assert n_trigger_item >= 1
                assert n_test >= 1
            except Exception:  # missing user/items or an empty test set
                results[idx] = np.array([np.nan] * n_results)                
                continue
            
            n_total_retrieved = n_level * n_trigger_item
    
            # compute hit scores
            for j, K in enumerate(top_K):
                bucket_size = int(np.ceil(int(K) * 1.0/n_trigger_item)) * n_trigger_item
                results[idx][j] = len(test_items.intersection(set(retrieved_items[: bucket_size])))/n_test
                
            # compute mean rank (divide by n_test once, after the loop)
            for itm in test_items:
                if itm in retrieved_items:
                    results[idx][n_results - 1] += (int(retrieved_items.index(itm)/n_trigger_item) + 0.5) * n_trigger_item
                else:
                    results[idx][n_results - 1] += n_total_retrieved
            results[idx][n_results - 1] = results[idx][n_results - 1]/n_test

            if idx % 10000 == 0:
                print("User id: {idx}; Elapsed time: {elapsed_time}s.".format(idx = idx, elapsed_time = time.time() - start_time))


        return {'mean': np.nanmean(results, axis=0),
                'std': np.nanstd(results, axis=0),
                'Q25': np.nanquantile(results, 0.25, axis=0),
                'Q50': np.nanquantile(results, 0.5, axis=0),
                'Q75': np.nanquantile(results, 0.75, axis=0),
                'Q90': np.nanquantile(results, 0.9, axis=0),
                'Q95': np.nanquantile(results, 0.95, axis=0)}
Example no. 21
def qq_correction(df_m, estacion):
    '''
    This function receives a dataframe of model values and corrects them
    using the quantile-quantile technique.
    The df_m DataFrame MUST contain two columns:
    Fecha: the date, as a pandas datetime
    Variable: the values of the variable

    This function retrieves the historical values of model and observation
    and generates the ECDF for both of them.
    After this, for each value in df_m, it adds a new column with the
    corrected value.

    Reference: Boe et al., 2007. 'Statistical and dynamical downscaling of the
    Seine basin climate for hydro-meteorological studies'
    '''
    # Check columns
    columnas = df_m.columns
    list2 = ['Fecha', 'tmax', 'tmin', 'radsup', 'velviento', 'hr']
    result = any(elem in columnas for elem in list2)
    if result and len(columnas) == 2:
        print(' ################# Q-Q correction ', columnas[-1],
              ' #################')
        tot_val = len(df_m)
        df_m = df_m.assign(month=pd.DatetimeIndex(df_m.loc[:, 'Fecha']).month)
        corrected_values = np.empty(tot_val)
        corrected_values[:] = np.nan
        # Limit for CDF
        cdf_limite = .99999999
        # Go for each row with data and make the correction according to month
        df_m.reset_index(drop=True, inplace=True)
        for index, row in df_m.iterrows():
            ecdf_m, datos_m, ecdf_o, datos_o = get_ecdf(
                columnas[-1], estacion, row.month)
            dato = row[columnas[-1]]  #Last column is data, first is Fecha
            p = ecdf_m(dato)
            if p > cdf_limite:
                p = cdf_limite
            corr_o = np.nanquantile(datos_o, p, interpolation='linear')
            corr_m = np.nanquantile(datos_m, p, interpolation='linear')
            corrected_values[index] = dato + (corr_o - corr_m)
        # End of Loop
        df_out = df_m.loc[:, [columnas[0], columnas[1]]].copy()
        df_out = df_out.assign(corregido=corrected_values)
        df_out.columns = ['Fecha', columnas[-1], columnas[-1] + '_corr']

        return df_out
    else:
        err_txt = '''
                     ########### ERROR ##############\n
        Not all the columns needed for the Q-Q data correction are present.\n
                                exit()\n
                     ################################

                  '''
        print(err_txt)
        exit()
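The heart of the correction is the quantile-mapping step: evaluate the model ECDF at the raw value, then shift by the observed-minus-model quantile difference at that probability. A standalone sketch, using statsmodels' ECDF as a stand-in for the unshown `get_ecdf` helper (an assumption about its return types):

import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF

rng = np.random.default_rng(0)
datos_m = rng.normal(loc=1.0, size=5000)  # model climatology (biased warm)
datos_o = rng.normal(loc=0.0, size=5000)  # observed climatology

dato = 1.5                                # raw model value to correct
p = min(ECDF(datos_m)(dato), .99999999)   # same cdf_limite guard as above
corr_o = np.nanquantile(datos_o, p)
corr_m = np.nanquantile(datos_m, p)
print(dato + (corr_o - corr_m))           # ~0.5: the +1.0 bias is removed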
Example no. 22
def main(args=None):
    """
    Main function to generate the polarization plot.
    """
    args = parse_arguments().parse_args(args)
    matplotlib.rcParams['pdf.fonttype'] = 42

    pc1 = pd.read_table(args.pca,
                        header=None,
                        sep="\t",
                        dtype={
                            0: "object",
                            1: "Int64",
                            2: "Int64",
                            3: "float32"
                        })

    pc1 = pc1.rename(columns={0: "chr", 1: "start", 2: "end", 3: "pc1"})

    if args.outliers != 0:
        quantile = [args.outliers / 100, (100 - args.outliers) / 100]
        boundaries = np.nanquantile(pc1['pc1'].values.astype(float), quantile)
        quantiled_bins = np.linspace(boundaries[0], boundaries[1],
                                     args.quantile)
    else:
        quantile = [j / (args.quantile - 1) for j in range(0, args.quantile)]
        quantiled_bins = np.nanquantile(pc1['pc1'].values.astype(float),
                                        quantile)

    pc1["quantile"] = np.searchsorted(quantiled_bins,
                                      pc1['pc1'].values.astype(float),
                                      side="right")
    pc1.loc[pc1["pc1"] == np.nan]["quantile"] = args.quantile + 1

    polarization_ratio = []
    output_matrices = []
    labels = []
    for matrix in args.obsexp_matrices:
        obs_exp = hm.hiCMatrix(matrix)
        pc1["bin_id"] = pc1.apply(lambda row: get_indices(obs_exp, row),
                                  axis=1)
        name = ".".join(matrix.split("/")[-1].split(".")[0:-1])
        labels.append(name)
        normalised_sum_per_quantile = count_interactions(
            obs_exp, pc1, args.quantile, args.offset)
        normalised_sum_per_quantile = np.nan_to_num(
            normalised_sum_per_quantile)
        if args.outputMatrix:
            output_matrices.append(normalised_sum_per_quantile)

        polarization_ratio.append(
            within_vs_between_compartments(normalised_sum_per_quantile,
                                           args.quantile))
    if args.outputMatrix:
        np.savez(args.outputMatrix, [matrix for matrix in output_matrices])
    plot_polarization_ratio(polarization_ratio, args.outputFileName, labels,
                            args.quantile)
Example no. 23
def _quantile(arr, q):
    if arr.ndim == 1:
        out = np.empty((q.size, ), dtype=arr.dtype)
        out[:] = np.nanquantile(arr, q)
    else:
        out = np.empty((arr.shape[0], q.size), dtype=arr.dtype)
        for index in range(out.shape[0]):
            out[index] = np.nanquantile(arr[index], q)
    return out
Example no. 24
def getmidr(traj, thr):
    coords = np.reshape(traj, (-1, 2))
    minx = np.nanquantile(coords[:, 0], thr)
    maxx = np.nanquantile(coords[:, 0], 1 - thr)
    miny = np.nanquantile(coords[:, 1], thr)
    maxy = np.nanquantile(coords[:, 1], 1 - thr)
    mid = np.array([(maxx + minx) / 2, (maxy + miny) / 2])
    r = np.sqrt((coords[:, 0] - mid[0])**2 + (coords[:, 1] - mid[1])**2)
    return mid, np.nanquantile(r, 1 - thr)
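A toy check: for 200 points on the unit circle, the recovered centre is near (0, 0) and the radius near 1 (`thr` trims that fraction from each coordinate tail):

import numpy as np

theta = np.linspace(0, 2 * np.pi, 200)
traj = np.stack([np.cos(theta), np.sin(theta)], axis=1)  # reshaped to (-1, 2) inside
mid, r = getmidr(traj, thr=0.05)
print(mid, r)  # mid ~ [0, 0], r ~ 1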
Example no. 25
    def fit(self, X: Union[np.ndarray, pd.DataFrame], y=None):
        assert self.factor >= 0
        X_ = np.asarray(X)
        self.mean_ = np.nanmedian(X_, axis=0)  # despite the name, this is the median
        self.high_q_ = np.nanquantile(X_, self.high_quantile, axis=0)
        self.low_q_ = np.nanquantile(X_, self.low_quantile, axis=0)
        self.high_ = (self.high_q_ - self.mean_) * self.factor + self.mean_
        self.low_ = (self.low_q_ - self.mean_) * self.factor + self.mean_
        return self
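The method stores a quantile band widened around the per-column median by `factor`. The same arithmetic as a standalone function, a sketch rather than this class's API:

import numpy as np

def robust_bounds(X, low_q=0.05, high_q=0.95, factor=1.5):
    # Widen the [low_q, high_q] quantile band around the per-column median.
    med = np.nanmedian(X, axis=0)
    high = (np.nanquantile(X, high_q, axis=0) - med) * factor + med
    low = (np.nanquantile(X, low_q, axis=0) - med) * factor + med
    return low, high

X = np.random.default_rng(0).normal(size=(200, 3))
print(robust_bounds(X))  # per-column (low, high) clipping bounds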
Example no. 26
def histme(x, y, color, **kwargs):
    rbins = (np.array([8, 5.7]) * 20).astype(int)
    # plotrange = [[np.nanquantile(x, .0005), np.nanquantile(x, .9995)],
    #              [np.nanquantile(y, .0005), np.nanquantile(y, .9995)]]
    plotrange = [[np.nanquantile(x, .0005),
                  np.nanquantile(x, .9995)], [0.01, 0.99]]

    # plt.hist2d(x, y, range=plotrange, bins=rbins, norm=colors.LogNorm(), cmap="Blues")
    plt.hist2d(x, y, range=plotrange, bins=rbins, cmap="Blues")
    plt.ylim(0, 1)
Example no. 27
    def __init__(self, *args, **kwargs):
        tkinter.Tk.__init__(self, *args, **kwargs)
        self.resizable(width=False, height=False)

        # UI labels kept in Polish because they are displayed verbatim; in
        # English: skewness index, positional skewness index, positional
        # coefficient of asymmetry, classical coefficient of asymmetry,
        # kurtosis coefficient, excess (kurtosis) coefficient.
        self.funkcje = ('Wskaźnik skośności', 'Pozycyjny wskaźnik skośności',
                        'Pozycyjny współczynnik asymetrii',
                        'Klasyczny współczynnik asymetrii',
                        'Współczynnik kurtozy', 'Współczynnik ekscesu')
        self.save = []

        for x in range(len(selected_header)):
            sko = stats.skew(selected_data.iloc[:, x])
            y = np.sort(selected_data.iloc[:, x])
            poz_sko = np.nanquantile(y, q=0.75) + np.nanquantile(
                y, q=0.25) - 2 * (np.nanmedian(y))
            poz_asy = poz_sko / (np.nanquantile(y, q=0.75) -
                                 np.nanquantile(y, q=0.25))
            mean = np.nanmean(selected_data.iloc[:, x])
            a = 0
            for i in range(selected_data.shape[0]):
                a = a + ((selected_data.iloc[i, x] - mean)**3)
            m3 = a / selected_data.shape[0]
            kla_asy = m3 / (np.nanstd(selected_data.iloc[:, x])**3)
            kurtoza = stats.kurtosis(selected_data.iloc[:, x],
                                     axis=0,
                                     fisher=False)
            k1 = (stats.kurtosis(
                selected_data.iloc[:, x], axis=0, fisher=False)) - 3

            self.Wyniki = []

            self.Wyniki.append(sko)
            self.Wyniki.append(poz_sko)
            self.Wyniki.append(poz_asy)
            self.Wyniki.append(kla_asy)
            self.Wyniki.append(kurtoza)
            self.Wyniki.append(k1)

            self.save.append(self.Wyniki)

            wypelanianie_tabeli_w_petli(len(self.funkcje), self, x)

        tworzenie_tabel_w_petli(selected_header, self, poziom='True')

        tworzenie_tabel_w_petli(self.funkcje, self, poziom='False')

        self.l1 = Button(self, text='Zapisz wyniki', command=self.zapisz)
        self.l1.grid(row=len(self.funkcje) + 3,
                     column=len(selected_header) + 1,
                     pady=10,
                     sticky=W)

        self.wolny = Label(self, text=' ', padx=10, pady=10)
        self.wolny.grid(row=len(self.funkcje) + 3,
                        column=len(selected_header) + 3)
Example no. 28
def list_aggregator(aggregatemethod, all_data):

    # AGGREGATE METHOD
    if aggregatemethod == 'mean':
        answer = np.nanmean(all_data)
    elif aggregatemethod == 'median':
        answer = np.nanmedian(all_data)
    elif aggregatemethod == 'minimum':
        answer = np.nanmin(all_data)
    elif aggregatemethod == 'q1':
        answer = np.nanquantile(all_data, 0.25)
    elif aggregatemethod == 'q3':
        answer = np.nanquantile(all_data, 0.75)
    elif aggregatemethod == 'maximum':
        answer = np.nanmax(all_data)
    elif aggregatemethod == 'stdev':
        answer = np.nanstd(all_data)
    elif aggregatemethod == 'medianabdev':
        answer = stats.median_absolute_deviation(all_data, nan_policy='omit')
    elif aggregatemethod == 'iqr':
        q1_perf = np.nanquantile(all_data, 0.25)
        q3_perf = np.nanquantile(all_data, 0.75)
        answer = q3_perf - q1_perf
    elif aggregatemethod == 'range':
        min_perf = np.nanmin(all_data)
        max_perf = np.nanmax(all_data)
        answer = max_perf - min_perf
    elif aggregatemethod == 'maxq3':
        max_perf = np.nanmax(all_data)
        q3_perf = np.nanquantile(all_data, 0.75)
        answer = max_perf - q3_perf
    elif aggregatemethod == 'q1min':
        q1_perf = np.nanquantile(all_data, 0.25)
        min_perf = np.nanmin(all_data)
        answer = q1_perf - min_perf
    elif aggregatemethod == 'q3q1avg':
        q1_perf = np.nanquantile(all_data, 0.25)
        q3_perf = np.nanquantile(all_data, 0.75)
        answer = (q3_perf + q1_perf) / 2
    elif aggregatemethod == 'q3q1avgoveriqr':
        q1_perf = np.nanquantile(all_data, 0.25)
        q3_perf = np.nanquantile(all_data, 0.75)
        iqr = q3_perf - q1_perf
        answer = ((q3_perf + q1_perf) / 2) / iqr
    elif aggregatemethod == 'maxminavg':
        min_perf = np.nanmin(all_data)
        max_perf = np.nanmax(all_data)
        answer = (max_perf + min_perf) / 2
    elif aggregatemethod == 'maxminavgoverrange':
        min_perf = np.nanmin(all_data)
        max_perf = np.nanmax(all_data)
        maxmin = max_perf - min_perf
        answer = ((max_perf + min_perf) / 2) / maxmin
    else:
        raise ValueError("unknown aggregatemethod: %s" % aggregatemethod)

    return answer
Example no. 29
def calc_weights(pop, urban, ntl, targets, more, access):
    """

    """

    # The calculated weights for each segment will go here

    if access["urban"] > 0.9:
        weights = np.ones_like(pop) * access["rural"]
        weights[urban >= 2] = access["urban"]
        return weights

    weights = np.zeros_like(pop)

    # Investigate each combination of urban/rural and four quartiles
    # of population density
    for loc in ["urban", "rural"]:
        for q in [0.25, 0.5, 0.75, 1]:

            # Values of 2 and 3 are considered urban
            if loc == "urban":
                condition_del = urban < 3
                access_level = access["urban"]
            else:
                condition_del = urban >= 3
                access_level = access["rural"]

            # Ignore errors from doing arr[arr < x] with nan values
            with np.errstate(invalid="ignore"):
                pop_temp = np.copy(pop)  # local copy of pop for this loop
                pop_temp[condition_del] = np.nan  # remove urban/rural
                pop_temp[targets == 0] = np.nan  # remove not electrified

                # Filter to only keep this quartile
                quant_below = np.nanquantile(pop_temp, q - 0.25)
                quant = np.nanquantile(pop_temp, q)
                pop_temp[pop_temp <= quant_below] = np.nan
                pop_temp[pop_temp > quant] = np.nan

                # Get the average brightness per person of the top x% for this quartile
                # Where x is the rural/urban access rate
                ntl_per_pop = ntl / pop_temp
                ntl_quant = min(max(1 - access_level - more[loc][q], 0), 1)
                ntl_cut = np.nanquantile(ntl_per_pop, ntl_quant)

                # Create a weights array and assign values according to the formula below
                w = 1 - (ntl_cut - ntl_per_pop) / ntl_cut
                w[w > 0.95] = 0.95  # cap values at 0.95
                w[np.isnan(w)] = 0

                # Add the successive weights to the main array
                weights += w

    return weights
Example no. 30
    def test_no_p_overwrite(self):
        # this is worth retesting, because quantile does not make a copy
        p0 = np.array([0, 0.75, 0.25, 0.5, 1.0])
        p = p0.copy()
        np.nanquantile(np.arange(100.), p, interpolation="midpoint")
        assert_array_equal(p, p0)

        p0 = p0.tolist()
        p = p.tolist()
        np.nanquantile(np.arange(100.), p, interpolation="midpoint")
        assert_array_equal(p, p0)
    def test_regression(self):
        ar = np.arange(24).reshape(2, 3, 4).astype(float)
        ar[0][1] = np.nan

        assert_equal(np.nanquantile(ar, q=0.5), np.nanpercentile(ar, q=50))
        assert_equal(np.nanquantile(ar, q=0.5, axis=0),
                     np.nanpercentile(ar, q=50, axis=0))
        assert_equal(np.nanquantile(ar, q=0.5, axis=1),
                     np.nanpercentile(ar, q=50, axis=1))
        assert_equal(np.nanquantile(ar, q=[0.5], axis=1),
                     np.nanpercentile(ar, q=[50], axis=1))
        assert_equal(np.nanquantile(ar, q=[0.25, 0.5, 0.75], axis=1),
                     np.nanpercentile(ar, q=[25, 50, 75], axis=1))
Example no. 33
def array_nanquantile_global(arr, q):
    return np.nanquantile(arr, q)
Example no. 34
    def time_nanquantile(self, array_size, percent_nans):
        np.nanquantile(self.arr, q=0.2)

    def test_basic(self):
        x = np.arange(8) * 0.5
        assert_equal(np.nanquantile(x, 0), 0.)
        assert_equal(np.nanquantile(x, 1), 3.5)
        assert_equal(np.nanquantile(x, 0.5), 1.75)