Example #1
    def __init__(self,
                 obj,
                 groupby_obj=None,
                 keys=None,
                 axis=0,
                 level=None,
                 grouper=None,
                 exclusions=None,
                 selection=None,
                 as_index=True,
                 sort=True,
                 group_keys=True,
                 squeeze=False,
                 observed=False,
                 mutated=False,
                 grouper_cache=None):
        def fill_value(v, key):
            return v if v is not None or groupby_obj is None else getattr(
                groupby_obj, key)

        self.obj = obj
        self.keys = fill_value(keys, 'keys')
        self.axis = fill_value(axis, 'axis')
        self.level = fill_value(level, 'level')
        self.exclusions = fill_value(exclusions, 'exclusions')
        self.selection = selection
        self.as_index = fill_value(as_index, 'as_index')
        self.sort = fill_value(sort, 'sort')
        self.group_keys = fill_value(group_keys, 'group_keys')
        self.squeeze = fill_value(squeeze, 'squeeze')
        self.observed = fill_value(observed, 'observed')
        self.mutated = fill_value(mutated, 'mutated')

        if groupby_obj is None:
            groupby_kw = dict(keys=keys,
                              axis=axis,
                              level=level,
                              grouper=grouper,
                              exclusions=exclusions,
                              as_index=as_index,
                              group_keys=group_keys,
                              squeeze=squeeze,
                              observed=observed,
                              mutated=mutated)
            if not _HAS_SQUEEZE:  # pragma: no branch
                groupby_kw.pop('squeeze')

            if obj.ndim == 2:
                self.groupby_obj = DataFrameGroupBy(obj, **groupby_kw)
            else:
                self.groupby_obj = SeriesGroupBy(obj, **groupby_kw)
        else:
            self.groupby_obj = groupby_obj

        if grouper_cache:
            self.groupby_obj.grouper._cache = grouper_cache
        if selection:
            self.groupby_obj = self.groupby_obj[selection]

        self.is_frame = isinstance(self.groupby_obj, DataFrameGroupBy)
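A minimal sketch (hypothetical data, plain pandas) of the kind of DataFrameGroupBy this wrapper builds or receives. Settings such as sort and keys are stored as attributes on the groupby object, which is what the fill_value helper reads back via getattr:

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
gb = df.groupby("key", as_index=True, sort=True, group_keys=True)
# fill_value(None, 'sort') above would fall back to this attribute:
print(gb.sort)  # True
print(gb.keys)  # 'key'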
Example #2
    def __init__(self, groupby):
        self.groupby = groupby
        self.grouper = groupby.grouper
        self.obj = groupby.obj
        # create the first df groupby so we can delegate from __getattr__
        f_ind = groupby.obj.items[0]
        self.first = DataFrameGroupBy(groupby.obj.ix[f_ind],
                                      grouper=self.grouper)
Example #3
    def _add_stats_to_summary(self,
                              groupedby: DataFrameGroupBy,
                              fieldname: str,
                              filter_by_ns: bool = False) -> None:
        """Takes grouped stats and adds min, max, and median to stats"""

        _ = {self.ns[i].update({fieldname: []}) for i in self.ns.keys()}
        if filter_by_ns:
            _ = {
                self.ns[i][fieldname].append(groupedby[i].min())
                if i in groupedby else self.ns[i][fieldname].append(0)
                for i in self.ns.keys()
            }
            # pylint: disable=expression-not-assigned
            _ = {
                self.ns[i][fieldname].append(groupedby[i].max())
                if i in groupedby else self.ns[i][fieldname].append(0)
                for i in self.ns.keys()
            }
            # pylint: disable=expression-not-assigned
            _ = {
                self.ns[i][fieldname].append(groupedby[i].median(
                    numeric_only=False))
                if i in groupedby else self.ns[i][fieldname].append(0)
                for i in self.ns.keys()
            }
        else:
            min_field = groupedby.min()
            max_field = groupedby.max()
            med_field = groupedby.median(numeric_only=False)

            _ = {
                self.ns[i][fieldname].append(min_field[i])
                if i in min_field else self.ns[i][fieldname].append(0)
                for i in self.ns.keys()
            }
            _ = {
                self.ns[i][fieldname].append(max_field[i])
                if i in max_field else self.ns[i][fieldname].append(0)
                for i in self.ns.keys()
            }
            _ = {
                self.ns[i][fieldname].append(med_field[i])
                if i in med_field else self.ns[i][fieldname].append(0)
                for i in self.ns.keys()
            }
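A minimal, self-contained sketch (hypothetical columns) of the grouped min/max/median lookups the method appends for each key:

import pandas as pd

df = pd.DataFrame({"namespace": ["ns1", "ns1", "ns2"],
                   "uptime": [10, 30, 20]})
grouped = df.groupby("namespace")["uptime"]
# the three aggregates collected per namespace above:
print(grouped.min())
print(grouped.max())
print(grouped.median(numeric_only=False))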
Example #4
def normalized_param_plots(
    param: str, dataframe_grouped: DataFrameGroupBy, example: bool = True
):
    """
    Plot radius or area normalized plots of param values.
    """
    param_normalized = f"{param}_norm"
    _, axes = plt.subplots(2, 1, figsize=utils.paper_figsize(0.8))

    target_datas = []
    for target, radius in utils.Utils.circle_names_with_diameter.items():
        if radius != 50:
            continue
        target_data = dataframe_grouped.get_group(target).copy()
        target_data[param_normalized] = target_data[param] / get_best_value(
            target_data, param, "radius"
        )
        target_data["radius_normalized"] = target_data["radius"] / max(
            target_data["radius"]
        )
        target_data["area_normalized"] = target_data["area"] / max(target_data["area"])
        # categorize based on radius
        target_data["radius Cat"] = [
            "full" if rad < max(target_data["radius"]) / 2 else "limited"
            for rad in target_data["radius"].values
        ]
        target_data["radius Cat"] = target_data["radius Cat"].astype("category")
        target_datas.append(target_data)

        # Plotting
        sns.scatterplot(
            data=target_data,
            x="radius_normalized",
            y=param_normalized,
            hue="radius Cat",
            ax=axes[0],
        )
        sns.scatterplot(
            data=target_data,
            x="area_normalized",
            y=param_normalized,
            hue="radius Cat",
            ax=axes[1],
        )
        if example:
            break

    for ax in axes:
        ax.legend().remove()

    target_datas_df = pd.concat(target_datas)
    g = sns.JointGrid(
        data=target_datas_df,
        x="area_normalized",
        y=param_normalized,
    )
    g.plot(sns.scatterplot, sns.histplot)
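The function relies on DataFrameGroupBy.get_group to pull one target's rows out of the grouped frame; a minimal sketch with hypothetical data:

import pandas as pd

df = pd.DataFrame({"name": ["t1", "t1", "t2"],
                   "radius": [10, 25, 50],
                   "area": [100.0, 400.0, 900.0]})
grouped = df.groupby("name")
target_data = grouped.get_group("t1").copy()  # per-target slice, as in the loop above
target_data["radius_normalized"] = target_data["radius"] / max(target_data["radius"])
print(target_data)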
Example #5
    def apply(self, func, *args, **kwargs):
        result = {}
        for key, df in self.obj.iteritems():
            grp = DataFrameGroupBy(df, grouper=self.grouper)
            if not callable(func):
                f = getattr(grp, func)
                res = f(*args, **kwargs)
            else:
                # assumed branch: apply callables group-wise, so res is
                # always bound (the excerpt otherwise leaves it undefined)
                res = grp.apply(func, *args, **kwargs)
            result[key] = res
        return Panel.from_dict(result)
Example #6
def count_neighbors_within_distance_groups(
    grouped_distances: DataFrameGroupBy,
) -> DataFrame:
    """Count number of neighbors within each group of same-distance site-index pairs.

    :param grouped_distances: A data frame grouped over site-index pairs, subspecies
        pairs, and bin intervals.
    :return: A pandas ``DataFrame`` of neighbor counts aggregated over site-index pairs
        and separation distances.
    """
    return (grouped_distances.apply(
        lambda x: pd.to_numeric(arg=x["distance_ij"].count(),
                                downcast="integer")).rename("n").reset_index())
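A self-contained sketch (hypothetical columns) of the same apply/rename/reset_index pipeline:

import pandas as pd

df = pd.DataFrame({"pair": ["i-j", "i-j", "i-k"],
                   "distance_ij": [1.0, 1.1, 2.0]})
counts = (df.groupby("pair")
            .apply(lambda x: pd.to_numeric(arg=x["distance_ij"].count(),
                                           downcast="integer"))
            .rename("n")
            .reset_index())
print(counts)  # one neighbor count per group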
Example #7
    def apply(self, func, *args, **kwargs):
        result = OrderedDict()
        for key, df in self.obj.items():
            grp = DataFrameGroupBy(df, grouper=self.grouper)
            f = func
            if not isinstance(func, collections.abc.Callable):
                f = getattr(grp, func)
                res = f(*args, **kwargs)
            else:
                # call grouper.apply because we will box our own data
                keys, data, mutated = grp.grouper.apply(f, df, grp.axis)
                res = box_data(keys, data)
            result[key] = res
        return box_data(result)
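A minimal sketch (hypothetical data) of the string-versus-callable dispatch that both apply variants above use: a string names a bound groupby method, while a callable is applied per group:

import pandas as pd

df = pd.DataFrame({"k": ["a", "a", "b"], "v": [1, 2, 3]})
grp = df.groupby("k")
print(getattr(grp, "sum")())              # string -> bound method, as in getattr(grp, func)
print(grp.apply(lambda g: g["v"].max()))  # callable -> applied group-wise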
Example #8
def radius_constrained_param(
    dataframe_grouped: DataFrameGroupBy, param: str, target: str
):
    """
    Visualize effect of radius.
    """
    target_data = dataframe_grouped.get_group(target).copy()
    target_data["Number of Traces Cat"] = pd.cut(target_data["Number of Traces"], 5)
    target_data["radius Cat"] = [
        "full" if rad < max(target_data["radius"]) / 2 else "limited"
        for rad in target_data["radius"].values
    ]
    target_data["radius Cat"] = target_data["radius Cat"].astype("category")
    sns.lmplot(data=target_data, x="radius", y=param, hue="radius Cat")
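A minimal sketch (hypothetical values) of the pd.cut call used above to bucket a numeric column into five equal-width categories:

import pandas as pd

s = pd.Series([1, 5, 12, 40, 77])
print(pd.cut(s, 5))  # five Interval categories, like "Number of Traces Cat" above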
Example #9
def roc_est_calculator(
        df_groups: DataFrameGroupBy,
        cfg: RocEstCalculatorCfg) -> Tuple[np.ndarray, np.ndarray]:
    """

    Args:
        df_groups:
        cfg:

    Returns:

    """
    if isinstance(cfg.extract_metric_func, str):
        base_metric = df_groups[cfg.extract_metric_func].mean()
    else:
        raise NotImplementedError(
            'Implemented, but not tested. Test before using it and remove this raise'
        )
        base_metric = df_groups.apply(cfg.extract_metric_func)

    corona_from_datetime = pd.to_datetime(cfg.corona_from_date_str)

    fp_for_tp_list = []
    cur_rand_seed = cfg.rand_seed
    for i_epoch in range(cfg.epochs_num):
        for i_grp, (group_id, df_grp) in enumerate(df_groups):
            cur_rand_seed = cur_rand_seed + 1
            df_grp_plus_corona = add_rand_infected_simulation(
                df_grp, df_grp.datetime >= corona_from_datetime, cur_rand_seed,
                cfg.corona_sim_cfg)
            if isinstance(cfg.extract_metric_func, str):
                cur_metric = df_grp_plus_corona[cfg.extract_metric_func].mean()
            else:
                cur_metric = cfg.extract_metric_func(df_grp_plus_corona)
            # We added corona, hence expect the metric to increase or be the same
            assert cur_metric >= base_metric.iloc[i_grp] - 1e-7
            if cfg.corona_sim_cfg.p_infection == 0.:
                # For probability 0 infection, we don't expect any change:
                assert np.isclose(cur_metric, base_metric.iloc[i_grp])
            tmp = base_metric.copy().drop(index=base_metric.index[i_grp])
            assert tmp.shape[0] == base_metric.shape[0] - 1
            # The following is the minimal FP threshold such that cur_metric will be above threshold
            cur_detected_for_fp_th = np.sum(
                tmp >= cur_metric) / (base_metric.shape[0] - 1)
            fp_for_tp_list.append(cur_detected_for_fp_th)

    fp_vec = np.asarray(sorted(fp_for_tp_list))
    tp_vec = np.linspace(0.0, 1.0, fp_vec.size)  # evenly spaced TP fractions

    return fp_vec, tp_vec
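A minimal sketch (hypothetical leave-one-out thresholds) of the final ROC assembly: sort the per-group false-positive thresholds and pair them with evenly spaced true-positive fractions:

import numpy as np

fp_for_tp_list = [0.2, 0.05, 0.5, 0.1]
fp_vec = np.asarray(sorted(fp_for_tp_list))
tp_vec = np.linspace(0.0, 1.0, fp_vec.size)  # TP fraction from 0 to 1
print(fp_vec, tp_vec, sep="\n")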
Example #10
def respect_event_resolution(grouper: DataFrameGroupBy, resolution):
    """Resample to make sure the df slice contains events with the same frequency as the given resolution.
    The input BeliefsDataFrame (see below) should represent beliefs about sequential sub-events formed by a single source
    at a single unique belief time.
    Extra beliefs are added with nan values.

    :Example:

    >>> df = df.groupby([pd.Grouper(freq="1D", level="event_start"), "belief_time", "source"]).pipe(respect_event_resolution, timedelta(hours=1))

    So don't pass a BeliefsDataFrame directly, but pipe it so that we receive a DataFrameGroupBy object, which we can
    iterate over to obtain a BeliefsDataFrame slice for a unique belief time, source and (in our example) day of
    events. We then make sure an event is stated explicitly for (in our example) each hour.
    """

    # We need to loop over each belief time in this slice, and reindex such that each subslice has rows for each event. Then recombine.

    # Get a list of n groups, one group for each belief_time with info about how we sliced and the actual slice
    groups = list(grouper.__iter__())

    # Describe the event_start bin for the slices (we take the first, because the slices share the same event_start bin)
    bin_size = grouper.keys[0].freq
    bin_start = groups[0][0][0]
    bin_end = bin_start + bin_size

    # Build up our new BeliefsDataFrame (by copying over and emptying the rows, the metadata should be copied over)
    df = groups[0][1].copy().iloc[0:0]
    # Loop over the groups (we grouped by unique belief time and unique source)
    for group in groups:

        # Get the BeliefsDataFrame for a unique belief time and source
        df_slice = group[1]
        if not df_slice.empty:
            lvl0 = pd.date_range(
                start=bin_start,
                end=bin_end,
                freq=to_offset(resolution).freqstr,
                closed="left",
                name="event_start",
            )
            df = df.append(
                tb_utils.replace_multi_index_level(df_slice,
                                                   level="event_start",
                                                   index=lvl0,
                                                   intersection=True))

    return df
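A minimal sketch (hypothetical data) of materializing a DataFrameGroupBy as (key, slice) pairs, which is how the function reads the first bin and iterates over the slices:

import pandas as pd

df = pd.DataFrame({"belief_time": [1, 1, 2], "v": [10, 20, 30]})
groups = list(df.groupby("belief_time"))
key, df_slice = groups[0]
print(key)       # the group label, used above for bin_start
print(df_slice)  # the per-group slice (a BeliefsDataFrame in the real code)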
Example #11
def set_truth(
        grouped: DataFrameGroupBy,
        right_source: "classes.BeliefSource") -> "classes.BeliefsDataFrame":
    """Overwrite the beliefs of each source by those of the given source.
    Terminology-wise, we say the given source is considered to be right,
    so its beliefs contain the truth to be used as a reference for accuracy calculations.
    """

    # Pick out the group that is considered to contain the true observations
    gr_dict = dict(grouped.__iter__())
    if right_source in gr_dict:
        truth_group = gr_dict[right_source]
    else:
        raise KeyError("Source %s not found in BeliefsDataFrame." %
                       right_source)

    # Replace each original group with the truth group, while adding back the source for each original group
    gr_list = [
        tb_utils.replace_multi_index_level(truth_group, "source",
                                           pd.Index([key] * len(truth_group)))
        for key, group in grouped
    ]

    return pd.concat(gr_list)
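A minimal sketch (hypothetical data) of the dict-of-groups lookup used above to pick out the slice belonging to the source considered to be right:

import pandas as pd

df = pd.DataFrame({"source": ["A", "A", "B"], "v": [1, 2, 3]})
gr_dict = dict(iter(df.groupby("source")))
print(gr_dict["B"])  # the truth group, if "B" is the right source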