Example #1
def normalize_numeric_feature(
    col: pd.core.series.Series,
    excluded_obs: Union[None, pd.core.series.Series] = None
) -> Tuple[pd.core.series.Series, List[float]]:
    """Scale numeric values to their empirical range.

    Args:
        col: A numeric Series.
        excluded_obs: True for observations to exclude when computing min/max.

    Returns:
        - A Series of floats.
        - A list containing the minimum and maximum values among the included
          observations.
    """
    if excluded_obs is None:
        col_subset = col[col.notnull()]
    else:
        col_subset = col[(~excluded_obs) & col.notnull()]
    if len(col_subset) > 0:
        minimum = np.nanmin(col_subset)
        maximum = np.nanmax(col_subset)
    else:
        minimum = np.nan
        maximum = np.nan
    normalized_col = process_numeric_feature(col, minimum, maximum)
    numeric_range = [minimum, maximum]
    return normalized_col, numeric_range
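
The helper process_numeric_feature is not included in this listing. Below is a small usage sketch paired with a hypothetical stand-in for that helper (plain min-max scaling with the supplied bounds), assuming numpy and pandas are imported as np and pd.

def process_numeric_feature(col, minimum, maximum):
    # Hypothetical stand-in for the helper called above: min-max scale using
    # the supplied bounds; degenerate bounds yield an all-NaN column.
    if np.isnan(minimum) or np.isnan(maximum) or maximum == minimum:
        return col * np.nan
    return (col - minimum) / (maximum - minimum)

ages = pd.Series([18.0, 35.0, np.nan, 62.0])
holdout = pd.Series([False, False, False, True])
scaled, (lo, hi) = normalize_numeric_feature(ages, excluded_obs=holdout)
# lo == 18.0, hi == 35.0: the held-out observation does not affect the range,
# so scaled is [0.0, 1.0, NaN, ~2.59].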
Example #2
def plot_stats(df: pd.core.series.Series,
               figsize: tuple,
               title: str,
               xlabel: str,
               ylabel: str,
               output_path: str,
               kind: str = 'bar') -> None:
    """Plot a Series as a bar chart (or another pandas plot kind) and save it as a PNG."""
    fig, ax = plt.subplots(figsize=figsize)
    if kind == 'bar':
        df.plot(
            kind=kind,
            rot=90,
            color='g',
            ec='black',
            alpha=0.5,
        )
    else:
        df.plot(
            kind=kind,
            color='g',
            alpha=0.5,
        )
    plt.title(title, size=14)
    plt.xlabel(xlabel, size=10)
    plt.ylabel(ylabel, size=10)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    fig.tight_layout()
    plt.savefig(output_path, format='png')
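
A minimal usage sketch, assuming matplotlib.pyplot is imported as plt and pandas as pd (as the function above already requires); the data and file name are illustrative.

counts = pd.Series({'cat A': 120, 'cat B': 75, 'cat C': 30})
plot_stats(counts,
           figsize=(6, 4),
           title='Observations per category',
           xlabel='Category',
           ylabel='Count',
           output_path='category_counts.png')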
Example #3
def dist_plot(series: pd.core.series.Series,
              dropna: bool = True,
              sig: Optional[int] = None) -> None:
    """
        Given a pandas Series, generate a descriptive visualisation
        with a boxplot and a histogram with a KDE.
        By default, this function drops `nan` values. If you want to
        handle them differently, do so beforehand and/or pass
        dropna=False. The `sig` argument sets the number of decimals
        shown in the title.
    """

    if dropna:
        series = series.dropna()
    sig = sig or 0

    quarts = scipy.stats.mstats.mquantiles(series,
                                           [0.001, 0.25, 0.5, 0.75, 0.975])

    f, (ax_box,
        ax_hist) = plt.subplots(2,
                                sharex=True,
                                gridspec_kw={"height_ratios": (.25, .75)})
    sns.boxplot(series, ax=ax_box)
    sns.stripplot(series, color="orange", jitter=0.2, size=2.5, ax=ax_box)
    sns.distplot(series, ax=ax_hist, kde=True)
    ax_hist.axvline(series.mean())
    ax_hist.set_xticks(quarts)
    #ax_box.set(xlabel=f'Mean value : {int(series.mean())}')
    plt.title(
        f"Glycaemic Distribution μ = {round(series.mean(), sig)}, σ = {round(series.std(), sig)}"
    )
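
A usage sketch, assuming numpy, pandas, scipy, seaborn and matplotlib.pyplot are imported under their usual aliases, and that the installed seaborn still provides sns.distplot (it has been removed in recent releases).

glucose = pd.Series(np.random.default_rng(0).normal(110, 25, size=500))
dist_plot(glucose, sig=1)
plt.show()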
Example #4
def new_hybrid_interpolator(
    data: pd.core.series.Series,
    methods: Dict[str, float] = {
        "linear": 0.65,
        "spline": 0.35
    },
    direction: str = "forward",
    limit: int = 120,
    limit_area: Optional[str] = None,
    order: int = 2,
    **kw,
) -> pd.core.series.Series:
    """"""

    limit_area = limit_area or "inside"

    weight_sum = sum(weight for weight in methods.values())
    if not np.isclose(weight_sum, 1):
        raise Exception(f"Sum of weights {weight_sum} != 1")

    resampled: Dict[str, pd.core.series.Series] = {}

    for key, weight in methods.items():
        resampled.update({
            key:
            weight * data.interpolate(
                method=key, order=order, limit_area=limit_area, limit=limit)
        })

    return reduce(lambda x, y: x + y, resampled.values())
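
A usage sketch, assuming numpy/pandas are imported as np/pd, functools.reduce is imported, and scipy is available for the spline method (which also needs a numeric index).

s = pd.Series([1.0, np.nan, np.nan, 4.0, 5.0], index=range(5))
filled = new_hybrid_interpolator(s, methods={'linear': 0.5, 'spline': 0.5})
# Each interior gap is filled with 0.5 * linear + 0.5 * spline interpolation.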
Example #5
def process_categorical_feature(col: pd.core.series.Series,
                                cat_map: dict) -> pd.core.series.Series:
    """Map categorical values to unsigned integers.

    Args:
        col: A pandas Series.
        cat_map: A dict containing a key for each unique value in col.

    Returns:
        A pandas Series of unsigned integers.
    """
    col = col.map(cat_map).fillna(0)
    n_bits = 8
    while col.max() >= 2**n_bits:
        n_bits *= 2
    col = col.astype("uint" + str(n_bits))
    return col
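
A minimal usage sketch, assuming pandas is imported as pd.

colours = pd.Series(['red', 'green', 'blue', 'purple'])
cat_map = {'red': 1, 'green': 2, 'blue': 3}   # 'purple' is unmapped and becomes 0
encoded = process_categorical_feature(colours, cat_map)
# encoded has dtype uint8 because the largest code fits in 8 bits.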
Example #6
def compute_metrics_for_binary_outcome(
        actuals: pd.core.series.Series,
        predictions: pd.core.series.Series,
        threshold_positive: Union[None, float] = 0.5,
        share_positive: Union[None, float] = None) -> OrderedDict:
    """Evaluate predicted probabilities against actual binary outcome values.

    Args:
        actuals: A Series representing actual Boolean outcome values.
        predictions: A Series of predicted probabilities of the respective
            outcome values.
        threshold_positive: None or a value in [0, 1] representing the minimum
            predicted probability considered to be a positive prediction;
            overridden by share_positive.
        share_positive: None or a value in [0, 1] representing the share of
            observations with the highest predicted probabilities considered to
            have positive predictions; probability ties may cause share of
            positives in output to exceed given value; overrides
            threshold_positive.

    Returns:
        An ordered dictionary containing key-value pairs for area under the
        receiver operating characteristic curve (AUROC), predicted and
        actual shares of observations with an outcome of True, and all
        elements of the confusion matrix.
    """
    metrics = OrderedDict()
    if actuals.any() and not actuals.all():
        metrics['AUROC'] = roc_auc_score(actuals, predictions)
    else:
        metrics['AUROC'] = np.nan
    metrics['Predicted Share'] = predictions.mean()
    metrics['Actual Share'] = actuals.mean()
    if actuals.empty:
        metrics['True Positives'], metrics['False Negatives'], \
            metrics['False Positives'], metrics['True Negatives'] = \
            [0, 0, 0, 0]
    else:
        if share_positive is not None:
            threshold_positive = np.quantile(predictions, 1 - share_positive)
        metrics['True Positives'], metrics['False Negatives'], \
            metrics['False Positives'], metrics['True Negatives'] = \
            confusion_matrix(actuals, predictions >= threshold_positive,
                             labels=[True, False]).ravel().tolist()
    return metrics
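
A usage sketch, assuming pandas/numpy are imported as pd/np, collections.OrderedDict is imported, and sklearn.metrics provides roc_auc_score and confusion_matrix as above.

actuals = pd.Series([True, False, True, False])
predictions = pd.Series([0.9, 0.2, 0.6, 0.4])
metrics = compute_metrics_for_binary_outcome(actuals, predictions)
# metrics['AUROC'] == 1.0, metrics['Predicted Share'] == 0.525, and the
# confusion-matrix entries use the default 0.5 probability threshold.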
Example #7
    def clean_simple(cls, text_series: pd.core.series.Series, stopwords: List):
        """
        Clean each text :
        - puts each word in lower case,
        - does not keep words belonging to the stopwords list,
        - does not keep words with less than 2 characters,
        - keeps only letters and deletes numbers and special characters.
        """
        # create a list of words
        text_series = text_series.map(lambda text: word_tokenize(text))

        # lower-case, drop stopwords and single-character tokens, and keep
        # only letters (digits and special characters are stripped out).
        text_series = text_series.map(lambda tok: [
            word.lower() for word in re.split(
                " ", re.sub(r"(\W+|_|\d+)", " ", " ".join(tok)))
            if word.lower() not in stopwords and len(word) > 1
        ])
        return text_series
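
A usage sketch, assuming re and pandas are imported, nltk's word_tokenize (with the punkt data) is available, and clean_simple is declared as a @classmethod on a text-cleaning class, here called TextCleaner purely for illustration.

texts = pd.Series(["The 2 cats sat on the mat!", "Dogs bark loudly."])
tokens = TextCleaner.clean_simple(texts, stopwords=['the', 'on'])
# tokens[0] -> ['cats', 'sat', 'mat']; digits, punctuation, stopwords and
# single-character tokens are removed, and everything is lower-cased.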
Example #8
def transform_pctchange_clean_standardize_pd(
        x_all_raw: pd.core.series.Series) -> pd.core.series.Series:
    ''' Stationarize -> Clean -> Standardize '''
    # 0/ Stationarize
    x_all = x_all_raw.pct_change()
    # 1/ Clean
    x_all = clean_nan_inf_pd(x_all)
    # 2/ Standardize method 0/
    x_all = (x_all - x_all.mean()) / x_all.std()

    return x_all
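
The helper clean_nan_inf_pd is not shown in this listing. A plausible, purely hypothetical stand-in might treat infinities as missing and fill the gaps, assuming numpy/pandas are imported as np/pd.

def clean_nan_inf_pd(x: pd.core.series.Series) -> pd.core.series.Series:
    # Hypothetical stand-in for the helper used above: the first pct_change()
    # value is NaN and divisions by zero give +/-inf, so map both to 0.
    return x.replace([np.inf, -np.inf], np.nan).fillna(0.0)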
Example #9
def extract_selection_number(c12n_row: pd.core.series.Series,
                             column_name: str
                             ) -> int:
    """Extract the numeric selection ID from a path such as
    "data/sequences/M256/M/01971.seq"; return -1 if the value does not match.
    """
    col_value = c12n_row.get(column_name)
    match = re.search(r'([^/]+)\.seq$', col_value)
    if match:
        return int(match.group(1))
    else:
        return -1
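
A minimal usage sketch, assuming re and pandas are imported.

row = pd.Series({'file': 'data/sequences/M256/M/01971.seq'})
extract_selection_number(row, 'file')        # -> 1971
row2 = pd.Series({'file': 'data/sequences/M256/M/readme.txt'})
extract_selection_number(row2, 'file')       # -> -1 (no trailing ".seq" match)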
Example #10
def process_categorical_feature(col: pd.core.series.Series,
                                cat_map: dict) -> pd.core.series.Series:
    """Map categorical values to unsigned integers.

    Args:
        col: A pandas Series.
        cat_map: A dict containing a key for each unique value in col.

    Returns:
        A pandas Series of unsigned integers.
    """
    warn(
        "Warning: process_categorical_feature is deprecated. "
        "Processors now convert categorical features to pandas Categorical type.",
        DeprecationWarning,
    )
    col = col.map(cat_map).fillna(0)
    n_bits = 8
    while col.max() >= 2**n_bits:
        n_bits *= 2
    col = col.astype("uint" + str(n_bits))
    return col
Example #11
def transform_pctchange_clean_standardize_sample_pd(
        x_all_raw: pd.core.series.Series, y_all_raw: pd.core.series.Series,
        idx_per_beg, idx_per_end) -> tuple:
    # Must be done in this order:

    #x_all = df_pair[sym_0_col].pct_change()
    #y_all = df_pair[sym_1_col].pct_change()
    #x_per = df_pair.loc[dtrange[0]:dtrange[1],sym_0_col].pct_change()#[1:]
    #y_per = df_pair.loc[dtrange[0]:dtrange[1],sym_1_col].pct_change()#[1:]

    # 0/ Stationarize
    x_all = x_all_raw.pct_change()
    y_all = y_all_raw.pct_change()

    # 1/ Clean
    #x_per, y_per = clean_nan_inf(x_per, y_per)
    x_all = clean_nan_inf_pd(x_all)
    y_all = clean_nan_inf_pd(y_all)

    # 2/ Standardize method 0/
    x_per = x_all.loc[idx_per_beg:idx_per_end]
    y_per = y_all.loc[idx_per_beg:idx_per_end]

    x_all = (x_all - x_all.mean()) / x_all.std()
    y_all = (y_all - y_all.mean()) / y_all.std()
    x_per = (x_per - x_per.mean()) / x_per.std()
    y_per = (y_per - y_per.mean()) / y_per.std()
    """
    # Standardize method 1/
    x_all = (x_all - x_all.mean())/x_all.std()
    y_all = (y_all - y_all.mean())/y_all.std()

    x_per = x_all.loc[idx_per_beg:idx_per_end]
    y_per = y_all.loc[idx_per_beg:idx_per_end]
    """

    return x_all, y_all, x_per, y_per
Example #12
    def __correct_with_filter(
        self,
        target: pd.core.series.Series,
        inplace: bool = False,
    ) -> np.ndarray:
        if not inplace:
            target = target.copy()
        group = target.name[0]
        target = target.values
        if not self.__check_should_correct(self.gmms[group]):
            return target
        label = self.gmms[group].predict(target)
        selector = (label == 0)
        target[selector] = 0

        return target
Example #13
def dist_plot(series: pd.core.series.Series, dropna: bool = True) -> None:
    """
        Given a pandas Series, generate a descriptive visualisation
        with a boxplot and a histogram with a KDE.
        By default, this function drops `nan` values. If you want to
        handle them differently, do so beforehand and/or pass
        dropna=False.
    """
    
    if dropna:
        series = series.dropna()
    
    f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.25, .75)})
    sns.boxplot(series, ax=ax_box)
    sns.stripplot(series, color="orange", jitter=0.2, size=2.5, ax=ax_box)
    sns.distplot(series, ax=ax_hist, kde=True)
    ax_box.set(xlabel='')
Example #14
def two_columns_display__classes_distribution(
        dataset_labels: pd.core.series.Series) -> None:
    """
    Displays (in an IPython cell output) 2 columns :
        - on the left, a table of classes distribution
          (incl. percentage column)
        - on the right, a barchart plot of classes distribution

    Parameters :
        - dataset_labels (pd.core.series.Series) :
            a collection of dataset records labels.
    """

    df = pd.DataFrame(dataset_labels.copy())
    df.reset_index(level=0, inplace=True)  # index-to-column
    df = df.groupby('sentiment').count() \
        .rename(columns = {'index':'count'})
    df['percentage'] = (df / len(dataset_labels) * 100)['count']

    df_html = df.reset_index(level=0).style.hide_index() \
        .set_properties(**{'width':'20em', 'text-align':'center'}) \
        .format({"count": "{:,}", "percentage": "{0:.2f}%"}) \
        .set_table_styles([dict(selector="th", props=[('text-align', 'center')])]).render()

    if not os.path.isdir('tmp'): os.makedirs('tmp')
    title = "Imbalanced_dataset_downsampled"
    fig_url = os.path.join('tmp', title + ".jpg")

    plt.ioff()
    fig = plt.figure()
    ax = plt.gca()
    ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
    df['count'].plot(kind='bar', figsize=(6, 3), ax=ax)
    #plt.show()
    plt.savefig(fig_url, bbox_inches='tight')
    plt.close(fig)

    # use base64 encoding to circumvent image (external file) caching (webbrowser)
    with open(fig_url, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
    encoded_string = "data:image/jpeg;charset=utf-8;base64, " + encoded_string.decode(
    )
    os.remove(fig_url)

    two_columns_display(df_html, '<img src="' + encoded_string + '" />', .5)
Example #15
    def list_validation(
        self, val_rule: str, manifest_col: pd.core.series.Series
    ) -> (List[List[str]], List[List[str]], pd.core.series.Series):
        """
        Purpose:
            Determine if values for a particular attribute are comma separated.
        Input:
            - val_rule: str, Validation rule
            - manifest_col: pd.core.series.Series, column for a given attribute
        Returns:
            - manifest_col: Input values in the manifest are re-formatted to a list
            - Error log, error list
        """

        # For each 'list' (input as a string with a , delimiter) entered,
        # convert to a real list of strings, with leading and trailing
        # white spaces removed.
        errors = []
        warnings = []
        manifest_col = manifest_col.astype(str)
        csv_re = comma_separated_list_regex()

        rule_parts = val_rule.lower().split(" ")
        if len(rule_parts) > 1:
            list_robustness = rule_parts[1]
        else:
            list_robustness = 'strict'

        if list_robustness == 'strict':
            # This will capture entries that are not formatted properly; applied only to strict lists.
            for i, list_string in enumerate(manifest_col):
                if not re.fullmatch(csv_re, list_string):
                    list_error = "not_comma_delimited"
                    errors.append(
                        GenerateError.generate_list_error(
                            list_string,
                            row_num=str(i + 2),
                            attribute_name=manifest_col.name,
                            list_error=list_error,
                            invalid_entry=manifest_col[i]))

        # Convert string to list.
        manifest_col = parse_str_series_to_list(manifest_col)

        return errors, warnings, manifest_col
Example #16
def create_new_parameter_file(parameter_file: dict,
                              row: pd.core.series.Series) -> dict:
    """
    Create a new parameter file from the old one, changing the data
    to be the stuff from `row`, which is a pandas dataframe row.
    """

    new_parameter_file = parameter_file.copy()

    for parameter, value in row.iteritems():
        if parameter in ["Run ID", "Comment"]:
            continue

        section, param = parameter.split(":")
        new_parameter_file[section][param] = value

    new_parameter_file["MetaData"]["run_name"] = row["Run ID"]

    return new_parameter_file
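
A usage sketch, assuming pandas is imported as pd and the installed pandas still provides Series.iteritems (removed in pandas 2.0); the parameter names and sections are illustrative.

params = {'MetaData': {'run_name': 'template'}, 'Solver': {'dt': 0.1}}
row = pd.Series({'Run ID': 'run_042', 'Comment': 'baseline', 'Solver:dt': 0.05})
new_params = create_new_parameter_file(params, row)
# new_params['Solver']['dt'] == 0.05 and
# new_params['MetaData']['run_name'] == 'run_042'.
# Note: dict.copy() is shallow, so the nested sections of `params` are mutated too.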
Example #17
def time_arc_plot(start_dates: pd.core.series.Series,
                  end_dates: Iterable[datetime.datetime],
                  width: int = 500,
                  height: int = 300) -> bokeh.plotting.Figure:
    radius = (end_dates - start_dates) / 2
    x = start_dates + radius
    p = figure(x_axis_label="Date",
               x_axis_type='datetime',
               y_range=(0, radius.max()),
               x_range=(start_dates.min(), end_dates.max()),
               width=width,
               height=height,
               tools="")
    p.yaxis.major_tick_line_color = None
    p.yaxis.minor_tick_line_color = None
    p.yaxis.major_label_text_font_size = '0pt'
    p.toolbar.logo = None
    p.toolbar_location = None
    p.arc(x=x, y=0, radius=radius, start_angle=0, end_angle=np.pi)
    return p
Example #18
def _row_to_cte(idx: int, row: pd.core.series.Series, schema_sql: dict) -> str:
    """
    Convert a DataFrame row to its string equivalent for a cte.

    Args:
        idx: row number, used to determine whether this is the first row
        row: DataFrame row object
        schema_sql: column name-sql type as key-value pairs
            e.g. {'uid': 'STRING', 'clicks': 'INTEGER'}

    Returns:
        line_cte: row in sql cte format as str
    """
    line = ", ".join(
        [_field_to_cte(idx, r, schema_sql[r[0]]) for r in row.iteritems()])
    if idx == 0:
        line_cte = f"\tSELECT {line}"
    else:
        line_cte = f"\tUNION ALL\n\tSELECT {line}"

    return line_cte
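
The helper _field_to_cte is not included in this listing. The sketch below pairs a hypothetical stand-in (casting each literal to its SQL type and aliasing it only on the first row) with a usage example, assuming pandas is imported as pd and still provides Series.iteritems.

def _field_to_cte(idx, item, sql_type):
    # Hypothetical stand-in for the helper used above.
    name, value = item
    literal = f"'{value}'" if sql_type == 'STRING' else str(value)
    cast = f"CAST({literal} AS {sql_type})"
    return f"{cast} AS {name}" if idx == 0 else cast

row = pd.Series({'uid': 'a1', 'clicks': 3})
_row_to_cte(0, row, {'uid': 'STRING', 'clicks': 'INTEGER'})
# -> "\tSELECT CAST('a1' AS STRING) AS uid, CAST(3 AS INTEGER) AS clicks"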
Example #19
def compute_metrics_for_numeric_outcome(
    actuals: pd.core.series.Series,
    predictions: np.ndarray,
    weights: Union[None, np.ndarray] = None,
) -> OrderedDict:
    """Evaluate predicted numeric values against actual outcome values.

    Args:
        actuals: A Series representing actual outcome values.
        predictions: An array of predictions for the respective outcome values.
        weights: Optional array of weights for the respective observations.

    Returns:
        An ordered dictionary containing a key-value pair for R-squared.
    """
    metrics = OrderedDict()
    non_null_obs = actuals.notnull()
    metrics["R-squared"] = r2_score(
        actuals[non_null_obs],
        predictions[non_null_obs],
        sample_weight=weights[non_null_obs] if weights is not None else None,
    )
    return metrics
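
A usage sketch, assuming numpy/pandas are imported as np/pd, collections.OrderedDict is imported, and sklearn.metrics.r2_score is available; boolean indexing drops the row whose actual value is missing.

actuals = pd.Series([1.0, 2.0, np.nan, 4.0])
predictions = np.array([1.1, 1.9, 0.0, 3.8])
compute_metrics_for_numeric_outcome(actuals, predictions)
# Returns an OrderedDict with 'R-squared' computed on the three non-null rows only.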
Example #20
    def __correct_with_noise(self,
                             target: pd.core.series.Series,
                             inplace: bool = False,
                             seed: Optional[int] = False) -> np.ndarray:
        if not inplace:
            target = target.copy()
        group = target.name[0]
        target = target.values
        gmm = self.gmms[group]
        if not self.__check_should_correct(gmm):
            return target
        label = gmm.predict(target)
        selector = (label == 0)
        rng = get_random_state(seed)
        noise = rng.normal(0, gmm.get_stds()[0], selector.sum())
        target[selector] = np.log2(target[selector] + 1)
        target[selector] += noise
        target[selector] = 2**target[selector] - 1
        selector = target < 0
        target[selector] = 0

        return target
Example #21
def hybrid_interpolator(
    data: pd.core.series.Series,
    mean: float = None,
    limit: float = None,
    methods: List[str] = ["linear", "spline"],
    weights: List[float] = [0.65, 0.35],
    direction: str = "forward",
    order: int = 2,
) -> pd.core.series.Series:
    """
    Return a pandas.core.series.Series instance resulting of the weighted average
    of two interpolation methods.

    Model:
        φ = β1*method1 + β2*method2

    Default:
        β1, β2 = 0.6, 0.4
        method1, method2 = linear, spline

    Weights are meant to be numbers from the interval (0, 1)
    which add up to one, to keep the weighted sum consistent.

    limit_direction : {‘forward’, ‘backward’, ‘both’}, default ‘forward’
    If limit is specified, consecutive NaNs will be filled in this direction.

    If the predicted φ_i value is outside of the the interval
    ( (mean - limit), (mean + limit) )
    it will be replaced by the linear interpolation approximation.

    If not set, mean and limit will default to:
        mean = data.mean()
        limit = 2 * data.std()

    This function should have support for keyword arguments, but is yet to be implemented.
    """
    predictions: List[float] = []

    if not np.isclose(sum(weight for weight in weights), 1):
        raise Exception("Sum of weights must be equal to one!")

    for met in methods:
        if (met == "spline") or (met == "polynomial"):
            predictions.append(
                data.interpolate(method=met,
                                 order=order,
                                 limit_direction=direction))
        else:
            predictions.append(
                data.interpolate(method=met, limit_direction=direction))

    linear: pd.core.series.Series = predictions[0]
    spline: pd.core.series.Series = predictions[1]
    hybrid: pd.core.series.Series = (weights[0] * predictions[0] +
                                     weights[1] * predictions[1])

    corrected: pd.core.series.Series = copy.deepcopy(hybrid)

    if not mean:
        mean = data.mean()
    if not limit:
        limit = 2 * data.std()

    for idx, val in zip(hybrid[np.isnan(data)].index, hybrid[np.isnan(data)]):
        if (val > mean + limit) or (val < mean - limit):
            corrected[idx] = linear[idx]

    # df = copy.deepcopy(interpolated)
    # print(df.isnull().astype(int).groupby(df.notnull().astype(int).cumsum()).sum())

    return corrected
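
A usage sketch, assuming numpy/pandas are imported as np/pd, copy is imported, and scipy is available for the spline method (which needs a numeric index).

s = pd.Series([10.0, np.nan, np.nan, 13.0, 11.0, np.nan, 12.0])
filled = hybrid_interpolator(s)
# Gaps are filled with 0.65 * linear + 0.35 * spline; any filled value farther
# than 2 standard deviations from the raw mean falls back to the linear estimate.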
Example #22
    def cross_validation(
        self,
        val_rule: str,
        manifest_col: pd.core.series.Series,
        project_scope: List,
    ) -> List[List[str]]:
        """
        Purpose:
            Do cross validation between the current manifest and all other manifests a user has access to on Synapse.
            Check if values in this manifest are present fully in others.
        Input:
            - val_rule: str, Validation rule
            - manifest_col: pd.core.series.Series, column for a given
                attribute in the manifest
        Output:
            This function will return errors when values in the current manifest's attribute
            are not fully present in the required number of other manifests.
        """
        errors = []
        warnings = []
        missing_values = {}
        missing_manifest_log = {}
        present_manifest_log = []
        target_column = pd.Series(dtype=object)
        #parse sources and targets
        source_attribute = manifest_col.name
        [target_component,
         target_attribute] = val_rule.lower().split(" ")[1].split(".")
        scope = val_rule.lower().split(" ")[2]
        target_column.name = target_attribute

        #Get IDs of manifests with target component
        synStore, target_manifest_IDs, target_dataset_IDs = ValidateAttribute.get_target_manifests(
            target_component, project_scope)

        #Read each manifest
        for target_manifest_ID, target_dataset_ID in zip(
                target_manifest_IDs, target_dataset_IDs):
            entity = synStore.getDatasetManifest(datasetId=target_dataset_ID,
                                                 downloadFile=True)
            target_manifest = pd.read_csv(entity.path)

            #convert manifest column names into validation rule input format -
            column_names = {}
            for name in target_manifest.columns:
                column_names[name.replace(" ", "").lower()] = name

            if scope.__contains__('set'):
                #If the manifest has the target attribute for the component do the cross validation
                if target_attribute in column_names:
                    target_column = target_manifest[
                        column_names[target_attribute]]

                    #Do the validation on both columns
                    missing_values = manifest_col[~manifest_col.
                                                  isin(target_column)]

                    if missing_values.empty:
                        present_manifest_log.append(target_manifest_ID)
                    else:
                        missing_manifest_log[
                            target_manifest_ID] = missing_values

            elif scope.__contains__('value'):
                if target_attribute in column_names:
                    target_manifest.rename(columns={
                        column_names[target_attribute]:
                        target_attribute
                    },
                                           inplace=True)

                    target_column = pd.concat(
                        objs=[
                            target_column, target_manifest[target_attribute]
                        ],
                        join='outer',
                        ignore_index=True,
                    )
                    target_column = target_column.astype('object')
                    #print(target_column)

        missing_rows = []
        missing_values = []

        if scope.__contains__('value'):
            missing_values = manifest_col[~manifest_col.isin(target_column)]
            duplicated_values = manifest_col[manifest_col.isin(
                target_column[target_column.duplicated()])]

            if val_rule.__contains__(
                    'matchAtLeastOne') and not missing_values.empty:
                missing_rows = missing_values.index.to_numpy() + 2
                warnings.append(
                    GenerateError.generate_cross_warning(
                        val_rule=val_rule,
                        row_num=str(list(missing_rows)),
                        attribute_name=source_attribute,
                        invalid_entry=str(missing_values.values.tolist()),
                    ))
            elif val_rule.__contains__('matchExactlyOne') and (
                    duplicated_values.any() or missing_values.any()):
                invalid_values = pd.merge(duplicated_values,
                                          missing_values,
                                          how='outer')
                invalid_rows = pd.merge(duplicated_values,
                                        missing_values,
                                        how='outer',
                                        left_index=True,
                                        right_index=True).index.to_numpy() + 2
                warnings.append(
                    GenerateError.generate_cross_warning(
                        val_rule=val_rule,
                        row_num=str(list(invalid_rows)),
                        attribute_name=source_attribute,
                        invalid_entry=str(
                            invalid_values.squeeze().values.tolist())))

        #generate warnings if necessary
        elif scope.__contains__('set'):
            if val_rule.__contains__(
                    'matchAtLeastOne') and len(present_manifest_log) < 1:
                missing_entries = list(missing_manifest_log.values())
                missing_manifest_IDs = list(missing_manifest_log.keys())
                for missing_entry in missing_entries:
                    missing_rows.append(missing_entry.index[0] + 2)
                    missing_values.append(missing_entry.values[0])

                missing_rows = list(set(missing_rows))
                missing_values = list(set(missing_values))
                #print(missing_rows,missing_values)

                warnings.append(
                    GenerateError.generate_cross_warning(
                        val_rule=val_rule,
                        row_num=str(missing_rows),
                        attribute_name=source_attribute,
                        invalid_entry=str(missing_values),
                        missing_manifest_ID=missing_manifest_IDs,
                    ))
            elif val_rule.__contains__(
                    'matchExactlyOne') and len(present_manifest_log) != 1:
                warnings.append(
                    GenerateError.generate_cross_warning(
                        val_rule=val_rule,
                        attribute_name=source_attribute,
                        matching_manifests=present_manifest_log,
                    ))

        return errors, warnings
Example #23
def sda(data: pd.core.series.Series):
    """Seven-day average: the sum of the last seven observations divided by 7."""
    return data.tail(7).sum() / 7
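
A minimal usage sketch, assuming pandas is imported as pd.

daily = pd.Series(range(1, 15))   # two weeks of observations
sda(daily)                        # (8 + 9 + ... + 14) / 7 == 11.0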
Example #24
def summarize_posterior_inferences(post_preds_means: pd.core.series.Series,
                                   post_data: pd.DataFrame,
                                   simulated_ys: Union[np.array, tf.Tensor],
                                   alpha: float = 0.05) -> pd.DataFrame:
    """
    After running the posterior inferences compilation, this function aggregates the
    results and gets the final interpretation for the Causal Impact results, such as
    the expected absolute impact of the given intervention and its confidence interval.

    Args
    ----
      post_preds_means: pd.core.series.Series
          Forecast means of the post-intervention data.
      post_data: pd.DataFrame
      simulated_ys: Union[np.array, tf.Tensor]
          Array of simulated forecasts for the response `y`, extracted from MCMC samples
          drawn from the posterior `P(z | y)`.
      alpha: float

    Returns
    -------
      summary: pd.DataFrame
          Summary data which is used in the `summary` functionality.
    """
    lower_percen, upper_percen = get_lower_upper_percentiles(alpha)
    # Compute the mean of metrics
    mean_post_y = post_data.iloc[:, 0].mean()
    mean_post_pred = post_preds_means.mean()
    mean_post_pred_lower, mean_post_pred_upper = np.percentile(
        simulated_ys.mean(axis=1), [lower_percen, upper_percen])
    # Compute the sum of metrics
    sum_post_y = post_data.iloc[:, 0].sum()
    sum_post_pred = post_preds_means.sum()
    sum_post_pred_lower, sum_post_pred_upper = np.percentile(
        simulated_ys.sum(axis=1), [lower_percen, upper_percen])
    # Causal Impact analysis metrics
    abs_effect = mean_post_y - mean_post_pred
    abs_effect_lower = mean_post_y - mean_post_pred_upper
    abs_effect_upper = mean_post_y - mean_post_pred_lower
    # Sum
    sum_abs_effect = sum_post_y - sum_post_pred
    sum_abs_effect_lower = sum_post_y - sum_post_pred_upper
    sum_abs_effect_upper = sum_post_y - sum_post_pred_lower
    # Relative
    rel_effect = abs_effect / mean_post_pred
    rel_effect_lower = abs_effect_lower / mean_post_pred
    rel_effect_upper = abs_effect_upper / mean_post_pred
    # Sum relative
    sum_rel_effect = sum_abs_effect / sum_post_pred
    sum_rel_effect_lower = sum_abs_effect_lower / sum_post_pred
    sum_rel_effect_upper = sum_abs_effect_upper / sum_post_pred

    summary = [[mean_post_y, sum_post_y], [mean_post_pred, sum_post_pred],
               [mean_post_pred_lower, sum_post_pred_lower],
               [mean_post_pred_upper, sum_post_pred_upper],
               [abs_effect, sum_abs_effect],
               [abs_effect_lower, sum_abs_effect_lower],
               [abs_effect_upper, sum_abs_effect_upper],
               [rel_effect, sum_rel_effect],
               [rel_effect_lower, sum_rel_effect_lower],
               [rel_effect_upper, sum_rel_effect_upper]]
    summary = pd.DataFrame(summary,
                           columns=['average', 'cumulative'],
                           index=[
                               'actual', 'predicted', 'predicted_lower',
                               'predicted_upper', 'abs_effect',
                               'abs_effect_lower', 'abs_effect_upper',
                               'rel_effect', 'rel_effect_lower',
                               'rel_effect_upper'
                           ])
    return summary
Example #25
    def regex_validation(
        self, val_rule: str, manifest_col: pd.core.series.Series
    ) -> (List[List[str]], List[List[str]]):
        """
        Purpose:
            Check if values for a given manifest attribue conform to the reguar expression,
            provided in val_rule.
        Input:
            - val_rule: str, Validation rule
            - manifest_col: pd.core.series.Series, column for a given
                attribute in the manifest
            Using this module requres validation rules written in the following manner:
                'regex module regular expression'
                - regex: is an exact string specifying that the input is to be validated as a 
                regular expression.
                - module: is the name of the module within re to run ie. search. 
                - regular_expression: is the regular expression with which to validate
                the user input.
        Returns:
            - This function will return errors when the user input value
            does not match schema specifications.
            Logging.error.
            Errors: List[str] Error details for further storage.
        TODO: 
            move validation to convert step.
        """

        reg_exp_rules = val_rule.split(" ")

        try:
            module_to_call = getattr(re, reg_exp_rules[1])
            reg_expression = reg_exp_rules[2]
        except (AttributeError, IndexError):
            raise ValidationError(
                f"The regex rules were not provided properly for attribute {manifest_col.name}."
                f" They should be provided as follows ['regex', 'module name', 'regular expression']"
            )

        errors = []
        warnings = []
        validation_rules = self.sg.se.get_class_validation_rules(
            self.sg.se.get_class_label_from_display_name(manifest_col.name))
        # Handle case where validating re's within a list.
        if re.search('list', "|".join(validation_rules)):
            if type(manifest_col[0]) == str:
                # Convert string to list.
                manifest_col = parse_str_series_to_list(manifest_col)

            for i, row_values in enumerate(manifest_col):
                for j, re_to_check in enumerate(row_values):
                    re_to_check = str(re_to_check)
                    if not bool(
                            module_to_call(reg_expression,
                                           re_to_check)) and bool(re_to_check):
                        errors.append(
                            GenerateError.generate_regex_error(
                                val_rule,
                                reg_expression,
                                row_num=str(i + 2),
                                module_to_call=reg_exp_rules[1],
                                attribute_name=manifest_col.name,
                                invalid_entry=manifest_col[i]))

        # Validating single re's
        else:
            manifest_col = manifest_col.astype(str)
            for i, re_to_check in enumerate(manifest_col):
                if not bool(module_to_call(reg_expression,
                                           re_to_check)) and bool(re_to_check):
                    errors.append(
                        GenerateError.generate_regex_error(
                            val_rule,
                            reg_expression,
                            row_num=str(i + 2),
                            module_to_call=reg_exp_rules[1],
                            attribute_name=manifest_col.name,
                            invalid_entry=manifest_col[i]))

        return errors, warnings