def __init__(self,
             source: Union[pd.DataFrame, Tuple[pd.DataFrame, str]],
             target_feature_name: str = None,
             compare: Union[pd.DataFrame, Tuple[pd.DataFrame, str]] = None,
             pairwise_analysis: str = 'auto',
             fc: FeatureConfig = None):
    pairwise_analysis = pairwise_analysis.lower()
    if pairwise_analysis not in ["on", "auto", "off"]:
        raise ValueError('"pairwise_analysis" parameter should be one of: "on", "auto", "off"')

    sv_html.load_layout_globals_from_config()
    self._jupyter_html = ""
    self._page_html = ""
    self._features = dict()
    self.compare_name = None
    self._target = None
    self.test_mode = False
    if fc is None:
        fc = FeatureConfig()

    # Associations: _associations[FEATURE][GIVES INFORMATION ABOUT THIS FEATURE]
    self._associations = dict()
    self._associations_compare = dict()
    self._association_graphs = dict()
    self._association_graphs_compare = dict()

    # Handle source and compare dataframes and names
    if type(source) == pd.DataFrame:
        source_df = source
        self.source_name = "DataFrame"
    elif type(source) == list or type(source) == tuple:
        if len(source) != 2:
            raise ValueError('"source" parameter should either be a DataFrame or a list/tuple of 2 elements: [dataframe, "Name"].')
        source_df = source[0]
        self.source_name = source[1]
    else:
        raise ValueError('"source" parameter should either be a DataFrame or a list/tuple of 2 elements: [dataframe, "Name"].')

    if len(su.get_duplicate_cols(source_df)) > 0:
        raise ValueError('Duplicate column names detected in "source"; this is not supported.')

    # NEW (12-14-2020): Rename indices that use the reserved name "index"
    # From pandas-profiling: if the DataFrame contains a column or index named `index`,
    # this will produce errors. We rename the {index,column} to be `df_index`.
    if 'index' in source_df.columns:
        source_df = source_df.rename(columns={"index": "df_index"})
        if target_feature_name == 'index':
            target_feature_name = 'df_index'
    all_source_names = [cur_name for cur_name, cur_series in source_df.iteritems()]

    if compare is None:
        compare_df = None
        self.compare_name = None
        all_compare_names = list()
    elif type(compare) == pd.DataFrame:
        compare_df = compare
        if 'index' in compare_df.columns:
            compare_df = compare_df.rename(columns={"index": "df_index"})
        self.compare_name = "Compared"
        all_compare_names = [cur_name for cur_name, cur_series in compare_df.iteritems()]
    elif type(compare) == list or type(compare) == tuple:
        if len(compare) != 2:
            raise ValueError('"compare" parameter should either be a DataFrame or a list/tuple of 2 elements: [dataframe, "Name"].')
        compare_df = compare[0]
        if 'index' in compare_df.columns:
            compare_df = compare_df.rename(columns={"index": "df_index"})
        self.compare_name = compare[1]
        all_compare_names = [cur_name for cur_name, cur_series in compare_df.iteritems()]
    else:
        raise ValueError('"compare" parameter should either be a DataFrame or a list/tuple of 2 elements: [dataframe, "Name"].')

    # Validate some params
    if compare_df is not None and len(su.get_duplicate_cols(compare_df)) > 0:
        raise ValueError('Duplicate column names detected in "compare"; this is not supported.')
    if target_feature_name in fc.skip:
        raise ValueError(f'"{target_feature_name}" was also specified as "skip". Target cannot be skipped.')
    for key in fc.get_all_mentioned_features():
        if key not in all_source_names:
            raise ValueError(f'"{key}" was specified in "feature_config" but is not found in source dataframe (watch case-sensitivity?).')

    # Find Features and Target (FILTER SKIPPED)
    filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.iteritems()
                                       if cur_name not in fc.skip]
    for skipped in fc.skip:
        if skipped not in all_source_names and skipped not in all_compare_names:
            raise ValueError(f'"{skipped}" was marked as "skip" but is not in any provided dataframe (watch case-sensitivity?).')

    # Progress bar setup
    ratio_progress_of_df_summary_vs_feature = 1.0
    number_features = len(filtered_series_names_in_source)
    exponential_checks = number_features * number_features
    progress_chunks = ratio_progress_of_df_summary_vs_feature \
                      + number_features \
                      + (1 if target_feature_name is not None else 0)
    self.progress_bar = tqdm(total=progress_chunks,
                             bar_format='{desc:45}|{bar}| [{percentage:3.0f}%] {elapsed} -> ({remaining} left)',
                             ascii=False, dynamic_ncols=True, position=0, leave=True)

    # Summarize dataframe
    self.progress_bar.set_description_str("[Summarizing dataframe]")
    self.summary_source = dict()
    self.summarize_dataframe(source_df, self.source_name, self.summary_source, fc.skip)
    # UPDATE 2021-02-05: Count the target as an actual feature (it is one!)
    # if target_feature_name:
    #     self.summary_source["num_columns"] = self.summary_source["num_columns"] - 1

    if compare_df is not None:
        self.summary_compare = dict()
        self.summarize_dataframe(compare_df, self.compare_name, self.summary_compare, fc.skip)
        cmp_not_in_src = [name for name in all_compare_names if name not in all_source_names]
        self.summary_compare["num_cmp_not_in_source"] = len(cmp_not_in_src)
        # UPDATE 2021-02-05: Count the target as an actual feature (it is one!)
        # if target_feature_name:
        #     if target_feature_name in compare_df.columns:
        #         self.summary_compare["num_columns"] = self.summary_compare["num_columns"] - 1
    else:
        self.summary_compare = None

    self.progress_bar.update(ratio_progress_of_df_summary_vs_feature)
    self.num_summaries = number_features

    # Association check
    if pairwise_analysis == 'auto' and \
            number_features > config["Processing"].getint("association_auto_threshold"):
        print(f"PAIRWISE CALCULATION LENGTH WARNING: There are {number_features} features in "
              f"this dataframe and the 'pairwise_analysis' parameter is set to 'auto'.\n"
              f"Pairwise analysis is quadratic in the number of features: {number_features} features "
              f"will cause ~{number_features * number_features} pairs to be evaluated, "
              f"which could take a long time.\n\n"
              f"You must call the function with the parameter pairwise_analysis='on' or 'off' "
              f"to explicitly select desired behavior.")
        self.progress_bar.close()
        return

    # Validate and process TARGET
    target_to_process = None
    target_type = None
    if target_feature_name:
        # Make sure target exists
        self.progress_bar.set_description_str(f"Feature: {target_feature_name} (TARGET)")
        targets_found = [item for item in filtered_series_names_in_source
                         if item == target_feature_name]
        if len(targets_found) == 0:
            self.progress_bar.close()
            raise KeyError(f"Feature '{target_feature_name}' was "
                           f"specified as TARGET, but is NOT FOUND in "
                           f"the dataframe (watch case-sensitivity?).")

        # Make sure target has no NaN's
        if source_df[targets_found[0]].isnull().values.any():
            self.progress_bar.close()
            raise ValueError(f"\nTarget feature '{targets_found[0]}' contains NaN (missing) values.\n"
                             f"To avoid confusion in interpreting target distribution,\n"
                             f"target features MUST NOT have any missing values at this time.\n")

        # Find Target in compared dataframe, if present
        compare_target_series = None
        if compare_df is not None:
            if target_feature_name in compare_df.columns:
                if compare_df[target_feature_name].isnull().values.any():
                    self.progress_bar.close()
                    raise ValueError(
                        f"\nTarget feature '{target_feature_name}' in COMPARED data contains NaN (missing) values.\n"
                        f"To avoid confusion in interpreting target distribution,\n"
                        f"target features MUST NOT have any missing values at this time.\n")
                compare_target_series = compare_df[target_feature_name]

        # TARGET processed HERE with COMPARE if present
        target_to_process = FeatureToProcess(-1, source_df[targets_found[0]], compare_target_series,
                                             None, None, fc.get_predetermined_type(targets_found[0]))
        self._target = sa.analyze_feature_to_dictionary(target_to_process)
        filtered_series_names_in_source.remove(targets_found[0])
        target_type = self._target["type"]
        self.progress_bar.update(1)

    # Set final target series and sanitize targets (e.g. bool -> truly bool)
    source_target_series = None
    compare_target_series = None
    if target_feature_name:
        if target_feature_name not in source_df.columns:
            raise ValueError(f"Target feature '{target_feature_name}' not found in source dataframe.")
        if self._target["type"] == sa.FeatureType.TYPE_BOOL:
            source_target_series = self.get_sanitized_bool_series(source_df[target_feature_name])
        else:
            source_target_series = source_df[target_feature_name]
        if compare_df is not None:
            if target_feature_name in compare_df.columns:
                if self._target["type"] == sa.FeatureType.TYPE_BOOL:
                    compare_target_series = self.get_sanitized_bool_series(compare_df[target_feature_name])
                else:
                    compare_target_series = compare_df[target_feature_name]

    # Create list of features to process
    features_to_process = []
    for cur_series_name, cur_order_index in zip(filtered_series_names_in_source,
                                                range(0, len(filtered_series_names_in_source))):
        # TODO: BETTER HANDLING OF DIFFERENT COLUMNS IN SOURCE/COMPARE
        if compare_df is not None and cur_series_name in compare_df.columns:
            this_feat = FeatureToProcess(cur_order_index,
                                         source_df[cur_series_name],
                                         compare_df[cur_series_name],
                                         source_target_series,
                                         compare_target_series,
                                         fc.get_predetermined_type(cur_series_name),
                                         target_type)
        else:
            this_feat = FeatureToProcess(cur_order_index,
                                         source_df[cur_series_name],
                                         None,
                                         source_target_series,
                                         None,
                                         fc.get_predetermined_type(cur_series_name),
                                         target_type)
        features_to_process.append(this_feat)

    # Process columns -> features
    self.run_id = hex(int(time.time()))[2:] + "_"  # removes the decimals
    # self.temp_folder = config["Files"].get("temp_folder")
    # os.makedirs(os.path.normpath(self.temp_folder), exist_ok=True)
    for f in features_to_process:
        # start = time.perf_counter()
        self.progress_bar.set_description_str(f"Feature: {f.source.name}")
        self._features[f.source.name] = sa.analyze_feature_to_dictionary(f)
        self.progress_bar.update(1)
        # print(f"DONE FEATURE------> {f.source.name}"
        #       f" {(time.perf_counter() - start):.2f} {self._features[f.source.name]['type']}")
    # self.progress_bar.set_description_str('[FEATURES DONE]')
    # self.progress_bar.close()

    # Wrap up summary
    self.summarize_category_types(source_df, self.summary_source, fc.skip, self._target)
    if compare is not None:
        self.summarize_category_types(compare_df, self.summary_compare, fc.skip, self._target)
    self.dataframe_summary_html = sv_html.generate_html_dataframe_summary(self)
    self.graph_legend = GraphLegend(self)

    # Process all associations
    # ----------------------------------------------------
    # Put target first
    if target_to_process is not None:
        features_to_process.insert(0, target_to_process)
    if pairwise_analysis.lower() != 'off':
        self.progress_bar.reset(total=len(features_to_process))
        self.progress_bar.set_description_str("[Step 2/3] Processing Pairwise Features")
        self.process_associations(features_to_process, source_target_series, compare_target_series)

        self.progress_bar.reset(total=1)
        self.progress_bar.set_description_str("[Step 3/3] Generating associations graph")
        self.associations_html_source = True  # Generated later in the process
        self.associations_html_compare = True  # Generated later in the process
        self._association_graphs["all"] = GraphAssoc(self, "all", self._associations)
        self._association_graphs_compare["all"] = GraphAssoc(self, "all", self._associations_compare)
        self.progress_bar.set_description_str("Done! Use 'show' commands to display/save. ")
        self.progress_bar.update(1)
    else:
        self._associations = None
        self._associations_compare = None
        self.associations_html_source = None
        self.associations_html_compare = None
    self.progress_bar.close()
    return
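# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the class): a minimal example of how the
# constructor above might be invoked directly, assuming the enclosing class is the
# report object that sweetviz's public helpers build. The class name `DataframeReport`,
# the dataframes `train_df`/`test_df`, the "Survived"/"PassengerId" columns, and the
# `FeatureConfig(skip=...)` keyword (consistent with `fc.skip` used above) are all
# assumptions made for illustration, not taken from this file:
#
#     report = DataframeReport([train_df, "Training"],
#                              target_feature_name="Survived",
#                              compare=[test_df, "Test"],
#                              pairwise_analysis="auto",
#                              fc=FeatureConfig(skip=["PassengerId"]))
#     report.show_html("report.html")   # a "show" command as hinted at by the progress bar text
# ---------------------------------------------------------------------------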
# Standalone worker: load a pickled FeatureToProcess from the path given in argv[1],
# analyze it, and write the resulting dictionary alongside the input as "<name>_out.pkl".
import os
import pickle
import sys
import time

import sweetviz.series_analyzer as sa
# from sweetviz.config import config
# temp_folder = config["Files"].get("temp_folder")
# full_path_to_pickled = "../sweetviz-temp/5e52a452__click_id.pkl"

full_path_to_pickled = sys.argv[1]
with open(full_path_to_pickled, 'rb') as handle:
    feature_to_process = pickle.load(handle)

# start = time.perf_counter()
analysis_dictionary = sa.analyze_feature_to_dictionary(feature_to_process)

split_source_path = os.path.split(full_path_to_pickled)
full_path_to_pickled_out = os.path.join(split_source_path[0],
                                        os.path.splitext(split_source_path[1])[0] + "_out.pkl")
with open(full_path_to_pickled_out, 'wb') as handle:
    pickle.dump(analysis_dictionary, handle)

# print(f"PROCESS------> {feature_to_process.source.name}"
#       f" {time.perf_counter() - start}")
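# ---------------------------------------------------------------------------
# Invocation sketch (illustrative): the commented-out temp_folder/pickle lines in
# __init__ above suggest this script was intended to run as a per-feature worker
# subprocess. A hypothetical driver is shown below; the script file name
# "feature_worker.py" and the temp path are assumptions, only the "<name>_out.pkl"
# output naming is taken from the code above:
#
#     import pickle, subprocess, sys
#
#     in_path = "../sweetviz-temp/5e52a452__click_id.pkl"
#     with open(in_path, "wb") as handle:
#         pickle.dump(feature_to_process, handle)          # a FeatureToProcess instance
#     subprocess.run([sys.executable, "feature_worker.py", in_path], check=True)
#     out_path = in_path[:-len(".pkl")] + "_out.pkl"       # matches the naming used above
#     with open(out_path, "rb") as handle:
#         analysis_dictionary = pickle.load(handle)
# ---------------------------------------------------------------------------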