def normalize_numeric_feature(
    col: pd.core.series.Series,
    excluded_obs: Union[None, pd.core.series.Series] = None
) -> Tuple[pd.core.series.Series, List[float]]:
    """Scale a numeric column using the range of its included observations.

    Args:
        col: A numeric Series.
        excluded_obs: Optional Boolean Series; True marks observations that
            must be left out when the minimum and maximum are computed.

    Returns:
        A pair ``(normalized_col, [minimum, maximum])``. ``normalized_col`` is
        whatever ``process_numeric_feature(col, minimum, maximum)`` returns;
        the bounds are both NaN when no non-null observation is included.
    """
    # Null values never contribute to the range, excluded rows neither.
    usable = col.notnull()
    if excluded_obs is not None:
        usable = (~excluded_obs) & usable
    included = col[usable]

    if len(included) == 0:
        minimum, maximum = np.nan, np.nan
    else:
        minimum, maximum = np.nanmin(included), np.nanmax(included)

    return process_numeric_feature(col, minimum, maximum), [minimum, maximum]
def plot_stats(df: pd.core.series.Series,
               figsize: tuple,
               title: str,
               xlabel: str,
               ylabel: str,
               output_path: str,
               kind: str = 'bar') -> None:
    """Plot ``df`` with pandas' plotting API and save the figure as a PNG.

    Args:
        df: Data to plot (its ``.plot`` method is used).
        figsize: Figure size handed to ``plt.subplots``.
        title: Figure title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        output_path: Destination of the PNG file.
        kind: Plot kind forwarded to ``df.plot``; ``'bar'`` additionally gets
            rotated tick labels and black bar edges.
    """
    fig, ax = plt.subplots(figsize=figsize)

    plot_options = {'kind': kind, 'color': 'g', 'alpha': 0.5}
    if kind == 'bar':
        plot_options.update(rot=90, ec='black')
    df.plot(**plot_options)

    plt.title(title, size=14)
    plt.xlabel(xlabel, size=10)
    plt.ylabel(ylabel, size=10)

    # Drop the top and right frame lines.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    fig.tight_layout()
    plt.savefig(output_path, format='png')
def dist_plot(series: pd.core.series.Series,
              dropna: bool = True,
              sig: Optional[int] = None) -> None:
    """Draw a boxplot/stripplot above a histogram with a KDE for ``series``.

    The histogram panel gets a vertical line at the mean, x ticks at the
    0.001, 0.25, 0.5, 0.75 and 0.975 quantiles, and a title that reports the
    mean and standard deviation.

    By default, this function drops `nan` values. If you desire to handle
    them differently, you should do so beforehand and/or specify
    dropna=False.

    Args:
        series: Values to visualise.
        dropna: Drop missing values before plotting.
        sig: Number of decimals used when rounding the mean and standard
            deviation in the title; ``None`` is treated as 0.
    """
    if dropna:
        series = series.dropna()
    sig = sig or 0

    quarts = scipy.stats.mstats.mquantiles(series,
                                           [0.001, 0.25, 0.5, 0.75, 0.975])

    # Two stacked panels sharing the x axis: narrow box on top, histogram below.
    _, (ax_box, ax_hist) = plt.subplots(
        2, sharex=True, gridspec_kw={"height_ratios": (.25, .75)})

    sns.boxplot(series, ax=ax_box)
    sns.stripplot(series, color="orange", jitter=0.2, size=2.5, ax=ax_box)
    # NOTE(review): seaborn deprecates `distplot`; kept so the rendered output
    # does not change.
    sns.distplot(series, ax=ax_hist, kde=True)

    mean = series.mean()
    ax_hist.axvline(mean)
    ax_hist.set_xticks(quarts)
    plt.title(
        f"Glycaemic Distribution μ = {round(mean, sig)}, σ = {round(series.std(), sig)}"
    )
def new_hybrid_interpolator(
    data: pd.core.series.Series,
    methods: Optional[Dict[str, float]] = None,
    direction: str = "forward",
    limit: int = 120,
    limit_area: Optional[str] = None,
    order: int = 2,
    **kw,
) -> pd.core.series.Series:
    """Fill gaps in ``data`` with a weighted blend of interpolation methods.

    Each method named in ``methods`` is run through ``Series.interpolate``
    and the results are summed after scaling by the method's weight.

    Args:
        data: Series containing the gaps (NaN) to fill.
        methods: Mapping of interpolation method name to weight. The weights
            must add up to one. Defaults to
            ``{"linear": 0.65, "spline": 0.35}``.
        direction: Forwarded as ``limit_direction``: the direction in which
            consecutive NaNs are filled when ``limit`` applies.
        limit: Maximum number of consecutive NaNs to fill.
        limit_area: Forwarded to ``Series.interpolate``; ``None`` (or any
            falsy value) is treated as ``"inside"``.
        order: Order forwarded to ``Series.interpolate`` for every method.
        **kw: Accepted for call compatibility; currently unused.

    Returns:
        The weighted sum of the interpolated Series.

    Raises:
        ValueError: If the weights do not add up to one.
    """
    if methods is None:
        # Built per call so the default mapping is never shared between calls.
        methods = {"linear": 0.65, "spline": 0.35}
    limit_area = limit_area or "inside"

    weight_sum = sum(methods.values())
    if not np.isclose(weight_sum, 1):
        raise ValueError(f"Sum of weights {weight_sum} != 1")

    blended = None
    for method, weight in methods.items():
        weighted = weight * data.interpolate(
            method=method,
            order=order,
            limit_area=limit_area,
            limit=limit,
            limit_direction=direction,
        )
        blended = weighted if blended is None else blended + weighted
    return blended
def process_categorical_feature(col: pd.core.series.Series,
                                cat_map: dict) -> pd.core.series.Series:
    """Map categorical values to unsigned integers.

    Args:
        col: A pandas Series.
        cat_map: A dict containing a key for each unique value in col. Values
            of ``col`` without a key are encoded as 0. The mapped codes are
            cast to an unsigned dtype, so they are expected to be
            non-negative integers.

    Returns:
        A pandas Series of unsigned integers. The width starts at 8 bits and
        is doubled until the largest code fits.
    """
    col = col.map(cat_map).fillna(0)

    # Pick the narrowest uint width whose range covers the largest code.
    n_bits = 8
    while col.max() >= 2**n_bits:
        n_bits *= 2

    return col.astype(f"uint{n_bits}")
def compute_metrics_for_binary_outcome(
        actuals: pd.core.series.Series,
        predictions: pd.core.series.Series,
        threshold_positive: Union[None, float] = 0.5,
        share_positive: Union[None, float] = None) -> OrderedDict:
    """Score predicted probabilities against actual binary outcomes.

    Args:
        actuals: A Series of actual Boolean outcome values.
        predictions: A Series of predicted probabilities for the respective
            outcome values.
        threshold_positive: None or a value in [0, 1]; the minimum predicted
            probability that counts as a positive prediction. Overridden by
            share_positive.
        share_positive: None or a value in [0, 1]; the share of observations
            with the highest predicted probabilities that count as positive
            predictions. Ties in probability may push the resulting share
            above this value. Overrides threshold_positive.

    Returns:
        An ordered dictionary with, in order: 'AUROC' (NaN unless both
        outcome values occur), 'Predicted Share', 'Actual Share', and the
        confusion-matrix cells 'True Positives', 'False Negatives',
        'False Positives', 'True Negatives' (all 0 for empty input).
    """
    metrics = OrderedDict()

    # AUROC is only computed when both outcome values are present.
    has_both_outcomes = actuals.any() and not actuals.all()
    metrics['AUROC'] = (roc_auc_score(actuals, predictions)
                        if has_both_outcomes else np.nan)
    metrics['Predicted Share'] = predictions.mean()
    metrics['Actual Share'] = actuals.mean()

    if actuals.empty:
        cells = [0, 0, 0, 0]
    else:
        if share_positive is not None:
            # Turn the requested share into a probability cut-off.
            threshold_positive = np.quantile(predictions, 1 - share_positive)
        cells = confusion_matrix(actuals,
                                 predictions >= threshold_positive,
                                 labels=[True, False]).ravel().tolist()

    true_pos, false_neg, false_pos, true_neg = cells
    metrics['True Positives'] = true_pos
    metrics['False Negatives'] = false_neg
    metrics['False Positives'] = false_pos
    metrics['True Negatives'] = true_neg
    return metrics
def clean_simple(cls, text_series: pd.core.series.Series, stopwords: List):
    """Tokenise and clean every text of ``text_series``.

    For each text:
      - words are lower-cased,
      - words found in ``stopwords`` are dropped,
      - words shorter than 2 characters are dropped,
      - non-word characters, underscores and digits are replaced by spaces,
        so only letters remain.

    Returns the Series with each text replaced by its list of cleaned words.
    """

    def _clean_tokens(tokens):
        # Re-join the tokens, blank out everything that is not a letter,
        # then split again on single spaces.
        letters_only = re.sub(r"(\W+|_|\d+)", " ", " ".join(tokens))
        kept = []
        for word in re.split(" ", letters_only):
            if word.lower() not in stopwords and len(word) > 1:
                kept.append(word.lower())
        return kept

    tokenized = text_series.map(lambda text: word_tokenize(text))
    return tokenized.map(_clean_tokens)
def transform_pctchange_clean_standardize_pd(
        x_all_raw: pd.core.series.Series) -> pd.core.series.Series:
    """Stationarize -> Clean -> Standardize.

    The raw series is turned into percentage changes, passed through
    ``clean_nan_inf_pd``, then centred on its mean and divided by its
    standard deviation.
    """
    # 0/ Stationarize, 1/ Clean
    changes = clean_nan_inf_pd(x_all_raw.pct_change())

    # 2/ Standardize
    centred = changes - changes.mean()
    return centred / changes.std()
def extract_selection_number(c12n_row: pd.core.series.Series,
                             column_name: str) -> int:
    """Return the number encoded in the ``.seq`` file name held by a row cell.

    Example: ``"data/sequences/M256/M/01971.seq"`` yields ``1971``.

    Args:
        c12n_row: Row (or any mapping with ``.get``) holding the path.
        column_name: Key of the cell that contains the path.

    Returns:
        The file stem converted to ``int``, or ``-1`` when the cell is
        missing or not a string, the value does not end in ``<name>.seq``,
        or the stem is not a valid integer.
    """
    col_value = c12n_row.get(column_name)
    # Missing columns give None and empty cells are typically NaN (a float);
    # neither can be searched with a str pattern.
    if not isinstance(col_value, str):
        return -1

    match = re.search(r'([^/]+)\.seq$', col_value)
    if match is None:
        return -1

    try:
        return int(match.group(1))
    except ValueError:
        # A stem such as "abc" matches the pattern but is not a number.
        return -1
def process_categorical_feature(col: pd.core.series.Series,
                                cat_map: dict) -> pd.core.series.Series:
    """Map categorical values to unsigned integers (deprecated).

    Emits a ``DeprecationWarning`` on every call.

    Args:
        col: A pandas Series.
        cat_map: A dict containing a key for each unique value in col. Values
            of ``col`` without a key are encoded as 0. The mapped codes are
            cast to an unsigned dtype, so they are expected to be
            non-negative integers.

    Returns:
        A pandas Series of unsigned integers. The width starts at 8 bits and
        is doubled until the largest code fits.
    """
    warn(
        "Warning: process_categorical_feature is deprecated. "
        "Processors now convert categorical features to pandas Categorical type.",
        DeprecationWarning,
        # Attribute the warning to the caller rather than to this line.
        stacklevel=2,
    )
    col = col.map(cat_map).fillna(0)

    # Pick the narrowest uint width whose range covers the largest code.
    n_bits = 8
    while col.max() >= 2**n_bits:
        n_bits *= 2

    return col.astype(f"uint{n_bits}")
def transform_pctchange_clean_standardize_sample_pd(
        x_all_raw: pd.core.series.Series,
        y_all_raw: pd.core.series.Series,
        idx_per_beg,
        idx_per_end) -> tuple:
    """Stationarize, clean and standardize two series and a sub-period of each.

    Both series are converted to percentage changes and passed through
    ``clean_nan_inf_pd``. The sub-period ``idx_per_beg:idx_per_end`` is then
    sliced (label-based, via ``.loc``) from the cleaned series. The full
    series and the sub-period are each standardized with their OWN mean and
    standard deviation.

    Args:
        x_all_raw: First raw series.
        y_all_raw: Second raw series.
        idx_per_beg: First index label of the sub-period.
        idx_per_end: Last index label of the sub-period.

    Returns:
        The 4-tuple ``(x_all, y_all, x_per, y_per)`` of standardized series.
    """
    # 0/ Stationarize
    x_all = x_all_raw.pct_change()
    y_all = y_all_raw.pct_change()

    # 1/ Clean
    x_all = clean_nan_inf_pd(x_all)
    y_all = clean_nan_inf_pd(y_all)

    # 2/ Standardize. The sub-period must be sliced BEFORE the full series is
    # standardized so that it is scaled by its own statistics.
    x_per = x_all.loc[idx_per_beg:idx_per_end]
    y_per = y_all.loc[idx_per_beg:idx_per_end]

    x_all = (x_all - x_all.mean()) / x_all.std()
    y_all = (y_all - y_all.mean()) / y_all.std()
    x_per = (x_per - x_per.mean()) / x_per.std()
    y_per = (y_per - y_per.mean()) / y_per.std()

    return x_all, y_all, x_per, y_per
def __correct_with_filter(
    self,
    target: pd.core.series.Series,
    inplace: bool = False,
) -> np.ndarray:
    """Zero out the values of ``target`` that its group's GMM labels 0.

    The group is the first element of ``target.name`` and selects the model
    in ``self.gmms``. When ``self.__check_should_correct`` rejects that
    model, the values are returned untouched.

    Args:
        target: Series whose ``name`` starts with the group key.
        inplace: When False the Series is copied first, so the returned
            array does not share data with the caller's Series.

    Returns:
        The (possibly corrected) underlying values array.
    """
    series = target if inplace else target.copy()
    group = series.name[0]
    values = series.values

    gmm = self.gmms[group]
    if self.__check_should_correct(gmm):
        labels = gmm.predict(values)
        values[labels == 0] = 0
    return values
def dist_plot(series: pd.core.series.Series, dropna: bool = True) -> None:
    """Draw a boxplot/stripplot above a histogram with a KDE for ``series``.

    By default, this function drops `nan` values. If you desire to handle
    them differently, you should do so beforehand and/or specify
    dropna=False.

    Args:
        series: Values to visualise.
        dropna: Drop missing values before plotting.
    """
    if dropna:
        series = series.dropna()

    # Two stacked panels sharing the x axis: narrow box on top, histogram below.
    _, (ax_box, ax_hist) = plt.subplots(
        2, sharex=True, gridspec_kw={"height_ratios": (.25, .75)})

    sns.boxplot(series, ax=ax_box)
    sns.stripplot(series, color="orange", jitter=0.2, size=2.5, ax=ax_box)
    # NOTE(review): seaborn deprecates `distplot`; kept so the rendered output
    # does not change.
    sns.distplot(series, ax=ax_hist, kde=True)

    # The shared x axis is labelled on the lower panel only.
    ax_box.set(xlabel='')
def two_columns_display__classes_distribution(
        dataset_labels: pd.core.series.Series) -> None:
    """
    Displays (in an IPython cell output) 2 columns :
        - on the left, a table of classes distribution
          (incl. percentage column)
        - on the right, a barchart plot of classes distribution

    The chart is written to a temporary JPEG under ``tmp/``, embedded as a
    base64 data URI and the file is removed again.

    Parameters :
        - dataset_labels (pd.core.series.Series) :
            a collection of dataset records labels.
    """
    # Count the records per class; the former index column carries the count.
    labels_frame = pd.DataFrame(dataset_labels.copy()).reset_index(level=0)
    df = labels_frame.groupby('sentiment').count()
    df = df.rename(columns={'index': 'count'})
    shares = df / len(dataset_labels) * 100
    df['percentage'] = shares['count']

    # Left column: HTML table.
    styler = df.reset_index(level=0).style.hide_index()
    styler = styler.set_properties(**{'width': '20em', 'text-align': 'center'})
    styler = styler.format({"count": "{:,}", "percentage": "{0:.2f}%"})
    styler = styler.set_table_styles(
        [{'selector': "th", 'props': [('text-align', 'center')]}])
    df_html = styler.render()

    # Right column: bar chart rendered to a temporary file.
    if not os.path.isdir('tmp'):
        os.makedirs('tmp')
    title = "Imbalanced_dataset_downsampled"
    fig_url = os.path.join('tmp', title + ".jpg")

    plt.ioff()
    fig = plt.figure()
    ax = plt.gca()
    ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
    df['count'].plot(kind='bar', figsize=(6, 3), ax=ax)
    plt.savefig(fig_url, bbox_inches='tight')
    plt.close(fig)

    # use base64 encoding to circumvent image (external file) caching
    # (webbrowser)
    with open(fig_url, "rb") as image_file:
        raw_image = image_file.read()
    encoded_string = ("data:image/jpeg;charset=utf-8;base64, " +
                      base64.b64encode(raw_image).decode())
    os.remove(fig_url)

    two_columns_display(df_html, '<img src="' + encoded_string + '" />', .5)
def list_validation(
    self, val_rule: str, manifest_col: pd.core.series.Series
) -> (List[List[str]], List[List[str]], pd.core.series.Series):
    """
    Purpose:
        Determine if values for a particular attribute are comma separated,
        and convert each value to a list.
    Input:
        - val_rule: str, Validation rule. An optional second word selects the
            list robustness; when it is absent, 'strict' is used.
        - manifest_col: pd.core.series.Series, column for a given attribute
    Returns:
        - errors: one entry per value that is not a comma-separated list
            (only produced for 'strict' lists)
        - warnings: always empty here
        - manifest_col: input values re-formatted to lists by
            parse_str_series_to_list
    """
    errors = []
    warnings = []

    manifest_col = manifest_col.astype(str)
    csv_re = comma_separated_list_regex()

    rule_parts = val_rule.lower().split(" ")
    list_robustness = rule_parts[1] if len(rule_parts) > 1 else 'strict'

    if list_robustness == 'strict':
        # Report every entry that is not formatted as a comma-separated list.
        for i, list_string in enumerate(manifest_col):
            if not re.fullmatch(csv_re, list_string):
                errors.append(
                    GenerateError.generate_list_error(
                        list_string,
                        # `i` is positional; the +2 offset presumably maps it
                        # to a spreadsheet row (header + 1-based) - confirm.
                        row_num=str(i + 2),
                        attribute_name=manifest_col.name,
                        list_error="not_comma_delimited",
                        # Use the iterated value: `manifest_col[i]` would be a
                        # label lookup and is wrong on a non-default index.
                        invalid_entry=list_string))

    # Convert string to list.
    manifest_col = parse_str_series_to_list(manifest_col)

    return errors, warnings, manifest_col
def create_new_parameter_file(parameter_file: dict,
                              row: pd.core.series.Series) -> dict:
    """
    Create a new parameter file from the old one, changing the data to
    be the stuff from `row`, which is a pandas dataframe row.

    Every entry of `row` other than "Run ID" and "Comment" must be keyed as
    "<section>:<parameter>" and overwrites that parameter in the copy.
    "Run ID" is stored as ``["MetaData"]["run_name"]``.

    The input `parameter_file` is left untouched.

    Raises:
        ValueError: If a key does not contain exactly one ":".
        KeyError: If a section (or "MetaData") is missing from the file.
    """
    import copy

    # A deep copy is required: the sections are nested dicts, and a shallow
    # copy would write the new values into the caller's parameter file too.
    new_parameter_file = copy.deepcopy(parameter_file)

    for parameter, value in row.items():
        if parameter in ("Run ID", "Comment"):
            continue

        section, param = parameter.split(":")
        new_parameter_file[section][param] = value

    new_parameter_file["MetaData"]["run_name"] = row["Run ID"]

    return new_parameter_file
def time_arc_plot(start_dates: pd.core.series.Series,
                  end_dates: Iterable[datetime.datetime],
                  width: int = 500,
                  height: int = 300) -> bokeh.plotting.Figure:
    """Draw one half-circle arc per (start, end) pair on a date axis.

    Each arc sits on y=0, is centred halfway between its start and end date,
    has a radius of half that span and runs from angle 0 to pi. The y axis
    ticks and labels, the toolbar and the logo are hidden.

    Args:
        start_dates: Start of each interval.
        end_dates: End of each interval; must support subtraction with
            ``start_dates`` and ``.max()``.
        width: Plot width.
        height: Plot height.

    Returns:
        The bokeh figure holding the arcs.
    """
    half_span = (end_dates - start_dates) / 2
    centers = start_dates + half_span

    plot = figure(x_axis_label="Date",
                  x_axis_type='datetime',
                  y_range=(0, half_span.max()),
                  x_range=(start_dates.min(), end_dates.max()),
                  width=width,
                  height=height,
                  tools="")

    # No toolbar or logo.
    plot.toolbar.logo = None
    plot.toolbar_location = None

    # The y axis carries no information: hide ticks and labels.
    plot.yaxis.major_tick_line_color = None
    plot.yaxis.minor_tick_line_color = None
    plot.yaxis.major_label_text_font_size = '0pt'

    plot.arc(x=centers, y=0, radius=half_span, start_angle=0, end_angle=np.pi)
    return plot
def _row_to_cte(idx: int, row: pd.core.series.Series, schema_sql: dict) -> str:
    """
    Convert a DataFrame row to its string equivalent for a cte.

    Args:
        idx: row number, used to define if it's a first line
        row: DataFrame row object
        schema_sql: column name-sql type as key-value pairs
            e.g. {'uid': 'STRING', 'clicks': 'INTEGER'}

    Returns:
        line_cte: row in sql cte format as str; every row but the first is
            prefixed with ``UNION ALL``

    Raises:
        KeyError: if a column of ``row`` is missing from ``schema_sql``
    """
    # `Series.items()` yields (column name, value) pairs; `iteritems()` was
    # removed in pandas 2.0.
    line = ", ".join(
        _field_to_cte(idx, field, schema_sql[field[0]])
        for field in row.items())

    if idx == 0:
        return f"\tSELECT {line}"
    return f"\tUNION ALL\n\tSELECT {line}"
def compute_metrics_for_numeric_outcome(
    actuals: pd.core.series.Series,
    predictions: np.ndarray,
    weights: Union[None, np.ndarray] = None,
) -> OrderedDict:
    """Evaluate predicted numeric values against actual outcome values.

    Observations whose actual value is null are left out of the evaluation.

    Args:
        actuals: A Series representing actual outcome values.
        predictions: Predictions for the respective outcome values.
        weights: None, or one sample weight per observation; passed to
            ``r2_score`` as ``sample_weight``.

    Returns:
        An ordered dictionary containing a key-value pair for R-squared.
    """
    metrics = OrderedDict()
    non_null_obs = actuals.notnull()

    # Test against None explicitly: the truth value of a multi-element array
    # is ambiguous and raises ValueError.
    sample_weight = weights[non_null_obs] if weights is not None else None

    metrics["R-squared"] = r2_score(
        actuals[non_null_obs],
        predictions[non_null_obs],
        sample_weight=sample_weight,
    )
    return metrics
def __correct_with_noise(self,
                         target: pd.core.series.Series,
                         inplace: bool = False,
                         seed: Optional[int] = False) -> np.ndarray:
    """Perturb the values of ``target`` that its group's GMM labels 0.

    The group is the first element of ``target.name`` and selects the model
    in ``self.gmms``. When ``self.__check_should_correct`` rejects that
    model, the values are returned untouched. Otherwise the selected values
    are moved to log2(x + 1) space, normal noise with standard deviation
    ``gmm.get_stds()[0]`` is added, the values are moved back, and any
    negative value in the whole array is clipped to 0.

    Args:
        target: Series whose ``name`` starts with the group key.
        inplace: When False the Series is copied first, so the returned
            array does not share data with the caller's Series.
        seed: Passed to ``get_random_state`` to build the noise generator.

    Returns:
        The (possibly corrected) underlying values array.
    """
    series = target if inplace else target.copy()
    group = series.name[0]
    values = series.values

    gmm = self.gmms[group]
    if not self.__check_should_correct(gmm):
        return values

    labelled_zero = gmm.predict(values) == 0

    rng = get_random_state(seed)
    noise = rng.normal(0, gmm.get_stds()[0], labelled_zero.sum())

    # Each step is written back into the array before the next one, so the
    # array's own dtype applies at every stage.
    values[labelled_zero] = np.log2(values[labelled_zero] + 1)
    values[labelled_zero] += noise
    values[labelled_zero] = 2**values[labelled_zero] - 1

    values[values < 0] = 0
    return values
def hybrid_interpolator(
    data: pd.core.series.Series,
    mean: float = None,
    limit: float = None,
    methods: List[str] = ["linear", "spline"],
    weights: List[float] = [0.65, 0.35],
    direction: str = "forward",
    order: int = 2,
) -> pd.core.series.Series:
    """
    Return a pandas.core.series.Series instance resulting of the weighted
    average of two interpolation methods.

    Model:
        φ = β1*method1 + β2*method2

    Default:
        β1, β2 = 0.65, 0.35
        method1, method2 = linear, spline

    Weights are meant to be numbers from the interval (0, 1)
    which add up to one, to keep the weighted sum consistent.

    direction : {'forward', 'backward', 'both'}, default 'forward'
        Passed to ``Series.interpolate`` as ``limit_direction``.

    order is only forwarded for the "spline" and "polynomial" methods.

    If a predicted φ_i value (at a position where `data` is NaN) is outside
    of the interval
        ( (mean - limit), (mean + limit) )
    it will be replaced by the approximation of the FIRST method
    (linear by default).

    If not set (None), mean and limit will default to:
        mean  = data.mean()
        limit = 2 * data.std()
    An explicit 0 is honoured.

    Raises ValueError if the weights do not add up to one.

    This function should have support for keyword arguments, but is yet to
    be implemented.
    """
    # NOTE: the list defaults are only read, never mutated.
    if not np.isclose(sum(weights), 1):
        raise ValueError("Sum of weights must be equal to one!")

    predictions: List[pd.core.series.Series] = []
    for met in methods:
        if met in ("spline", "polynomial"):
            predictions.append(
                data.interpolate(method=met,
                                 order=order,
                                 limit_direction=direction))
        else:
            predictions.append(
                data.interpolate(method=met, limit_direction=direction))

    fallback: pd.core.series.Series = predictions[0]
    hybrid: pd.core.series.Series = (weights[0] * predictions[0] +
                                     weights[1] * predictions[1])

    # `is None` rather than truthiness, so that 0 is a usable value.
    if mean is None:
        mean = data.mean()
    if limit is None:
        limit = 2 * data.std()

    # Only interpolated positions are candidates for correction. Comparisons
    # with NaN are False, so still-unfilled positions are left alone.
    was_missing = np.isnan(data)
    out_of_band = was_missing & ((hybrid > mean + limit) |
                                 (hybrid < mean - limit))

    corrected: pd.core.series.Series = hybrid.copy()
    corrected[out_of_band] = fallback[out_of_band]
    return corrected
def cross_validation(
    self,
    val_rule: str,
    manifest_col: pd.core.series.Series,
    project_scope: List,
) -> List[List[str]]:
    """
    Purpose:
        Do cross validation between the current manifest and all other
        manifests a user has access to on Synapse. Check if values in this
        manifest are present fully in others.
    Input:
        - val_rule: str, Validation rule. Parsed as space-separated words:
            the second word is "<component>.<attribute>" of the target and
            the third word is the scope (tested for 'set' or 'value').
            The rule text is also searched for 'matchAtLeastOne' /
            'matchExactlyOne'.
        - manifest_col: pd.core.series.Series, column for a given
            attribute in the manifest
        - project_scope: List, forwarded to
            ValidateAttribute.get_target_manifests
    Output:
        A pair (errors, warnings). Findings are appended to `warnings` only;
        `errors` is returned empty by this function.
        Warnings are raised when values in the current manifest's attribute
        are not fully present in the correct amount of other manifests.
    """
    errors = []
    warnings = []
    missing_values = {}
    missing_manifest_log = {}
    present_manifest_log = []
    target_column = pd.Series(dtype=object)
    #parse sources and targets
    source_attribute = manifest_col.name
    [target_component,
     target_attribute] = val_rule.lower().split(" ")[1].split(".")
    scope = val_rule.lower().split(" ")[2]
    target_column.name = target_attribute

    #Get IDs of manifests with target component
    synStore, target_manifest_IDs, target_dataset_IDs = ValidateAttribute.get_target_manifests(
        target_component, project_scope)

    #Read each manifest
    for target_manifest_ID, target_dataset_ID in zip(
            target_manifest_IDs, target_dataset_IDs):
        entity = synStore.getDatasetManifest(datasetId=target_dataset_ID,
                                             downloadFile=True)
        target_manifest = pd.read_csv(entity.path)

        #convert manifest column names into validation rule input format -
        # (spaces removed, lower-cased), keeping a map back to the real name
        column_names = {}
        for name in target_manifest.columns:
            column_names[name.replace(" ", "").lower()] = name

        if scope.__contains__('set'):
            #If the manifest has the target attribute for the component do the cross validation
            if target_attribute in column_names:
                target_column = target_manifest[
                    column_names[target_attribute]]

                #Do the validation on both columns
                missing_values = manifest_col[~manifest_col.
                                              isin(target_column)]

                # Per-manifest bookkeeping: either every value was found, or
                # the missing values are logged under the manifest's ID.
                if missing_values.empty:
                    present_manifest_log.append(target_manifest_ID)
                else:
                    missing_manifest_log[
                        target_manifest_ID] = missing_values

        elif scope.__contains__('value'):
            if target_attribute in column_names:
                target_manifest.rename(columns={
                    column_names[target_attribute]:
                    target_attribute
                },
                                       inplace=True)

                # Accumulate the target values of ALL manifests in one column.
                target_column = pd.concat(
                    objs=[
                        target_column, target_manifest[target_attribute]
                    ],
                    join='outer',
                    ignore_index=True,
                )
                target_column = target_column.astype('object')

    # Reset after the loop: from here on these are plain lists, unless the
    # 'value' branch below rebinds them.
    missing_rows = []
    missing_values = []

    if scope.__contains__('value'):
        missing_values = manifest_col[~manifest_col.isin(target_column)]
        duplicated_values = manifest_col[manifest_col.isin(
            target_column[target_column.duplicated()])]

        if val_rule.__contains__(
                'matchAtLeastOne') and not missing_values.empty:
            # NOTE(review): the +2 offset presumably converts the index to a
            # spreadsheet row (header + 1-based) - confirm.
            missing_rows = missing_values.index.to_numpy() + 2
            warnings.append(
                GenerateError.generate_cross_warning(
                    val_rule=val_rule,
                    row_num=str(list(missing_rows)),
                    attribute_name=source_attribute,
                    invalid_entry=str(missing_values.values.tolist()),
                ))
        elif val_rule.__contains__('matchExactlyOne') and (
                duplicated_values.any() or missing_values.any()):
            # NOTE(review): `.any()` here tests the truthiness of the VALUES
            # in each Series, not whether the Series is non-empty - confirm
            # this is intended.
            invalid_values = pd.merge(duplicated_values,
                                      missing_values,
                                      how='outer')
            invalid_rows = pd.merge(duplicated_values,
                                    missing_values,
                                    how='outer',
                                    left_index=True,
                                    right_index=True).index.to_numpy() + 2
            warnings.append(
                GenerateError.generate_cross_warning(
                    val_rule=val_rule,
                    row_num=str(list(invalid_rows)),
                    attribute_name=source_attribute,
                    invalid_entry=str(
                        invalid_values.squeeze().values.tolist())))

    #generate warnings if necessary
    elif scope.__contains__('set'):
        if val_rule.__contains__(
                'matchAtLeastOne') and len(present_manifest_log) < 1:
            missing_entries = list(missing_manifest_log.values())
            missing_manifest_IDs = list(missing_manifest_log.keys())
            # Only the FIRST missing row/value of each manifest is reported.
            for missing_entry in missing_entries:
                missing_rows.append(missing_entry.index[0] + 2)
                missing_values.append(missing_entry.values[0])

            # De-duplicate across manifests (ordering is not preserved).
            missing_rows = list(set(missing_rows))
            missing_values = list(set(missing_values))
            warnings.append(
                GenerateError.generate_cross_warning(
                    val_rule=val_rule,
                    row_num=str(missing_rows),
                    attribute_name=source_attribute,
                    invalid_entry=str(missing_values),
                    missing_manifest_ID=missing_manifest_IDs,
                ))
        elif val_rule.__contains__(
                'matchExactlyOne') and len(present_manifest_log) != 1:
            warnings.append(
                GenerateError.generate_cross_warning(
                    val_rule=val_rule,
                    attribute_name=source_attribute,
                    matching_manifests=present_manifest_log,
                ))

    return errors, warnings
def sda(data: pd.core.series.Series):
    """Return the sum of the last (up to) seven values of ``data`` over 7.

    The divisor is always 7, even when ``data`` holds fewer than seven values.
    """
    total = 0
    for value in data.tail(7):
        total += value
    return total / 7
def summarize_posterior_inferences(post_preds_means: pd.core.series.Series,
                                   post_data: pd.DataFrame,
                                   simulated_ys: Union[np.array, tf.Tensor],
                                   alpha: float = 0.05) -> pd.DataFrame:
    """
    Aggregate the posterior inferences into the final Causal Impact summary:
    actual and predicted response, absolute and relative effect, each with
    its interval bounds, both averaged and cumulated over the post period.

    Args
    ----
      post_preds_means: pd.core.series.Series
          Forecast means of post intervention data.
      post_data: pd.DataFrame
          Post intervention data; the first column is used as the response.
      simulated_ys: Union[np.array, tf.tensor]
          Array of simulated forecasts for response `y`; aggregated along
          axis 1 before the percentiles are taken.
      alpha: float
          Passed to `get_lower_upper_percentiles` to obtain the lower and
          upper percentiles.

    Returns
    -------
      summary: pd.DataFrame
          Ten rows ('actual', 'predicted', 'predicted_lower',
          'predicted_upper', 'abs_effect', 'abs_effect_lower',
          'abs_effect_upper', 'rel_effect', 'rel_effect_lower',
          'rel_effect_upper') by two columns ('average', 'cumulative').
    """
    lower_percen, upper_percen = get_lower_upper_percentiles(alpha)

    def _metrics(aggregate: str) -> list:
        """Compute the ten summary metrics with `mean` or `sum` aggregation."""
        actual = getattr(post_data.iloc[:, 0], aggregate)()
        predicted = getattr(post_preds_means, aggregate)()
        predicted_lower, predicted_upper = np.percentile(
            getattr(simulated_ys, aggregate)(axis=1),
            [lower_percen, upper_percen])

        # The effect bounds mirror the prediction bounds: a high prediction
        # means a low effect.
        effect = actual - predicted
        effect_lower = actual - predicted_upper
        effect_upper = actual - predicted_lower

        return [
            actual,
            predicted,
            predicted_lower,
            predicted_upper,
            effect,
            effect_lower,
            effect_upper,
            effect / predicted,
            effect_lower / predicted,
            effect_upper / predicted,
        ]

    average = _metrics('mean')
    cumulative = _metrics('sum')

    rows = [[avg, cum] for avg, cum in zip(average, cumulative)]
    row_labels = [
        'actual', 'predicted', 'predicted_lower', 'predicted_upper',
        'abs_effect', 'abs_effect_lower', 'abs_effect_upper', 'rel_effect',
        'rel_effect_lower', 'rel_effect_upper'
    ]
    return pd.DataFrame(rows,
                        columns=['average', 'cumulative'],
                        index=row_labels)
def regex_validation(
    self, val_rule: str, manifest_col: pd.core.series.Series
) -> (List[List[str]], List[List[str]]):
    """
    Purpose:
        Check if values for a given manifest attribute conform to the
        regular expression, provided in val_rule.
    Input:
        - val_rule: str, Validation rule
        - manifest_col: pd.core.series.Series, column for a given
            attribute in the manifest
        Using this module requires validation rules written in the
        following manner: 'regex module regular_expression'
            - regex: is an exact string specifying that the input is to be
                validated as a regular expression.
            - module: is the name of the module within re to run ie. search.
            - regular_expression: is the regular expression with which to
                validate the user input. The rule is split on single spaces,
                so the expression itself cannot contain a space.
    Returns:
        - (errors, warnings): `errors` holds one entry per non-empty value
            that does not match; `warnings` is always empty here.
    Raises:
        - ValidationError: if the rule lacks a module or an expression, or
            the module is not an attribute of `re`.
    TODO:
        move validation to convert step.
    """
    reg_exp_rules = val_rule.split(" ")

    try:
        module_to_call = getattr(re, reg_exp_rules[1])
        reg_expression = reg_exp_rules[2]
    except (IndexError, AttributeError) as err:
        # IndexError: rule too short; AttributeError: unknown `re` function.
        raise ValidationError(
            f"The regex rules were not provided properly for attribute {manifest_col.name}."
            f" They should be provided as follows ['regex', 'module name', 'regular expression']"
        ) from err

    errors = []
    warnings = []

    validation_rules = self.sg.se.get_class_validation_rules(
        self.sg.se.get_class_label_from_display_name(manifest_col.name))

    # Handle case where validating re's within a list.
    if re.search('list', "|".join(validation_rules)):
        # Positional access: the first element decides whether the column
        # still holds raw strings that need converting to lists.
        if not manifest_col.empty and isinstance(manifest_col.iloc[0], str):
            # Convert string to list.
            manifest_col = parse_str_series_to_list(manifest_col)

        for i, row_values in enumerate(manifest_col):
            for re_to_check in row_values:
                re_to_check = str(re_to_check)
                # Empty strings are never reported.
                if not bool(module_to_call(
                        reg_expression, re_to_check)) and bool(re_to_check):
                    errors.append(
                        GenerateError.generate_regex_error(
                            val_rule,
                            reg_expression,
                            row_num=str(i + 2),
                            module_to_call=reg_exp_rules[1],
                            attribute_name=manifest_col.name,
                            # The whole row is reported; the iterated value
                            # is used because `i` is positional, not a label.
                            invalid_entry=row_values))

    # Validating single re's
    else:
        manifest_col = manifest_col.astype(str)
        for i, re_to_check in enumerate(manifest_col):
            if not bool(module_to_call(reg_expression,
                                       re_to_check)) and bool(re_to_check):
                errors.append(
                    GenerateError.generate_regex_error(
                        val_rule,
                        reg_expression,
                        row_num=str(i + 2),
                        module_to_call=reg_exp_rules[1],
                        attribute_name=manifest_col.name,
                        invalid_entry=re_to_check))

    return errors, warnings