def __init__(self,
             compare_shape=True,
             compare_feature_names=True,
             compare_random_values=True):
    """
    Args:
        compare_shape: bool
            Determines whether or not to create/compare the dataframe's
            shape for the snapshot.

        compare_feature_names: bool
            Determines whether or not to create/compare the dataframe's
            feature names for the snapshot.

        compare_random_values: bool
            Determines whether or not to create/compare 10 'random'
            values found on each feature of the dataframe. As long as the
            same dataframe is passed, the random values will be the same.

            Note:
                Float features are ignored because of the trailing
                precision problem that all floats have.
    """
    # Copy values
    self.__compare_shape = copy.deepcopy(compare_shape)
    self.__compare_feature_names = copy.deepcopy(compare_feature_names)
    self.__compare_random_values = copy.deepcopy(compare_random_values)

    # Error check; at least one compare boolean must be True
    if not self.__compare_shape and \
            not self.__compare_feature_names and \
            not self.__compare_random_values:
        raise UnsatisfiedRequirments("At least one compare boolean must be "
                                     "set to True for the snapshot check to "
                                     "work properly.")
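# --- Illustrative sketch (not part of the eflow source) ---
# A minimal, self-contained example of the snapshot idea described above:
# compare a dataframe's shape, feature names, and a handful of seeded
# "random" values per non-float feature. The helper names (make_snapshot,
# snapshots_match) are hypothetical and only demonstrate the concept.
import numpy as np
import pandas as pd

def make_snapshot(df, values_per_feature=10):
    snapshot = {"shape": df.shape,
                "feature_names": sorted(df.columns)}
    sampled = {}
    for feature in df.columns:
        # Skip floats because of their trailing-precision problem
        if pd.api.types.is_float_dtype(df[feature]):
            continue
        # Seed with the row count so the same dataframe gives the same picks
        rng = np.random.RandomState(len(df))
        indexes = rng.choice(len(df),
                             size=min(values_per_feature, len(df)),
                             replace=False)
        sampled[feature] = df[feature].iloc[indexes].tolist()
    snapshot["random_values"] = sampled
    return snapshot

def snapshots_match(df, saved_snapshot):
    return make_snapshot(df) == saved_snapshot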
def create_elbow_models(self,
                        model_names=["K-Means",
                                     "K-Medians",
                                     "K-Medoids",
                                     "Somsc",
                                     "Cure",
                                     "Fuzzy C-means"],
                        repeat_operation=3,
                        max_k_value=15,
                        display_visuals=True):

    model_names = set(model_names)

    names_model_dict = {"K-Means": kmeans,
                        "K-Medians": kmedians,
                        "K-Medoids": kmedoids,
                        "Somsc": somsc,
                        "Cure": cure,
                        "Fuzzy C-means": fcm}

    # Iterate through the passed model names
    for name in model_names:

        if name in names_model_dict.keys():

            # Only requires one elbow sequence
            if name == "Somsc" or name == "Cure":
                best_clusters = self.__create_elbow_seq(name,
                                                        names_model_dict[name],
                                                        repeat_operation=1,
                                                        max_k_value=max_k_value,
                                                        display_visuals=display_visuals)
            else:
                best_clusters = self.__create_elbow_seq(name,
                                                        names_model_dict[name],
                                                        repeat_operation=repeat_operation,
                                                        max_k_value=max_k_value,
                                                        display_visuals=display_visuals)

            # Save cluster results
            best_clusters.sort()
            self.__models_suggested_clusters[name] = best_clusters
            self.__save_update_best_model_clusters()

        else:
            raise UnsatisfiedRequirments(f"Unknown model name passed: \"{name}\"")

    # Note: only the suggested clusters of the last model processed are returned
    return best_clusters
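# --- Illustrative sketch (not part of the eflow source) ---
# The elbow sequence itself (__create_elbow_seq) is private, so this sketch
# shows the general elbow idea using scikit-learn's KMeans instead of the
# pyclustering models referenced above; 'suggest_k_by_elbow' is a
# hypothetical name and the selection rule is a simple heuristic.
import numpy as np
from sklearn.cluster import KMeans

def suggest_k_by_elbow(scaled_data, max_k_value=15):
    inertias = []
    for k in range(1, max_k_value + 1):
        model = KMeans(n_clusters=k, n_init=10, random_state=0)
        model.fit(scaled_data)
        inertias.append(model.inertia_)

    # Pick the k where the drop in inertia flattens out the most
    # (largest second difference along the inertia curve)
    second_diff = np.diff(inertias, n=2)
    return int(np.argmax(second_diff)) + 2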
def __init__(self,
             dataset_name,
             overwrite_full_path=None):
    """
    Args:
        dataset_name: string
            Sub directory to create on top of the directory
            'PARENT_OUTPUT_FOLDER_NAME'.

        overwrite_full_path: string
            The passed directory path must already exist. Will completely
            ignore the project name and attempt to point to this already
            created directory.
    """
    # Setup project structure
    if not overwrite_full_path:
        parent_structure = "/" + SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME \
                           + "/" + dataset_name + "/"

        create_dir_structure(os.getcwd(),
                             parent_structure)
        tmp_path = correct_directory_path(os.getcwd() + parent_structure)

    # Trust that the user-provided path already exists
    else:
        overwrite_full_path = correct_directory_path(overwrite_full_path)

        # Path doesn't contain eflow's main output directory
        if f"/{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME}/" not in overwrite_full_path:
            raise UnsatisfiedRequirments(f"Directory path must have "
                                         f"{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME} "
                                         f"as a directory name or this program "
                                         f"will not work correctly.")

        # Unknown path found
        if not os.path.exists(overwrite_full_path):
            raise SystemError("The path must already exist in full on "
                              "your system to use a different directory "
                              "structure than originally intended.")

        tmp_path = overwrite_full_path

    from eflow._hidden.general_objects import enum
    self.__PROJECT = enum(PATH_TO_OUTPUT_FOLDER=tmp_path,
                          RELATIVE_PATH_TO_OUTPUT_FOLDER=tmp_path.split(
                              f"/{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME}/")[1])
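# --- Illustrative sketch (not part of the eflow source) ---
# A dependency-free version of the path handling above: build
# '<cwd>/<parent folder>/<dataset name>/' when no override is given,
# otherwise validate the user-supplied path. The constant value below is
# an assumption for this sketch only, and 'resolve_output_folder' is a
# hypothetical helper name.
import os

PARENT_OUTPUT_FOLDER_NAME = "eflow Data"   # assumed value; not from the source

def resolve_output_folder(dataset_name, overwrite_full_path=None):
    if not overwrite_full_path:
        path = os.path.join(os.getcwd(), PARENT_OUTPUT_FOLDER_NAME, dataset_name)
        os.makedirs(path, exist_ok=True)
    else:
        if f"/{PARENT_OUTPUT_FOLDER_NAME}/" not in overwrite_full_path:
            raise ValueError(f"Path must contain '{PARENT_OUTPUT_FOLDER_NAME}'.")
        if not os.path.exists(overwrite_full_path):
            raise SystemError("The overwrite path must already exist.")
        path = overwrite_full_path

    # Always return with a trailing separator, like correct_directory_path
    return path if path.endswith("/") else path + "/"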
def __init__(self,
             object_type,
             segment_id=None,
             create_file=True):
    """
    Args:
        object_type: string
            The child type of any object that inherits from
            DataPipelineSegment.

        segment_id: string
            If initialized with a string instead of None, the object will
            attempt to find the matching json file in the provided
            directory.

            Note:
                Essentially we are serializing the object with json files.
    """
    self.__json_file_name = None
    self.__object_type = copy.deepcopy(object_type)

    if segment_id and not isinstance(segment_id, str):
        raise UnsatisfiedRequirments("Segment id must be a string or set "
                                     "to 'None'!")

    if segment_id and not create_file:
        raise PipelineSegmentError("Parameter conflict: segment_id refers "
                                   "to a saved file but create_file is set "
                                   "to False.")

    # Remove the file extension (if any) from the segment id
    if isinstance(segment_id, str):
        segment_id = segment_id.split(".")[0]

    self.__segment_id = copy.deepcopy(segment_id)

    # Holds each function's info in the order the functions are called
    self.__function_pipe = deque()
    self.__create_file = create_file
    self.__lock_interaction = False

    # Attempt to load the json file into the object's attributes
    if self.__segment_id:
        self.__configure_pipeline_segment_with_existing_file()
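# --- Illustrative sketch (not part of the eflow source) ---
# Shows the two segment_id rules enforced above with plain stdlib code:
# strip a trailing file extension and only attempt to load when an id was
# actually given. 'load_segment_json' is a hypothetical helper name.
import json
import os

def load_segment_json(segment_dir, segment_id=None):
    if segment_id is None:
        return None                               # nothing to restore; start fresh
    if not isinstance(segment_id, str):
        raise ValueError("segment_id must be a string or None")

    segment_id = segment_id.split(".")[0]         # drop '.json' etc.
    file_path = os.path.join(segment_dir, segment_id + ".json")
    with open(file_path, "r") as f:
        return json.load(f)                       # the serialized function pipeline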
def __init__(self,
             df,
             feature_names=[],
             dataset_sub_dir="",
             dataset_name="Default Dataset Name",
             overwrite_full_path=None,
             notebook_mode=False,
             pca_perc=1.00):
    """
    Args:
        df: pd.DataFrame
            Pandas dataframe of the dataset.

        feature_names: collection of strings
            Feature names; required when passing a matrix-like object
            instead of a pd.DataFrame.

        dataset_sub_dir: string
            Sub directory to write data to.

        dataset_name: string
            Main project directory.

        overwrite_full_path: string
            Overwrite the full directory path to a given output folder.

        notebook_mode: bool
            Display and show visuals in a notebook if set to True.

        pca_perc: float
            Cutoff percentage of cumulative explained variance for the
            PCA components to keep. If falsy, PCA is skipped and the data
            is assumed to already be processed.
    """
    if isinstance(df, pd.DataFrame):
        self.__feature_names = copy.deepcopy(list(df.columns))
    else:
        if not feature_names:
            raise UnsatisfiedRequirments("If passing in a matrix-like "
                                         "object, you must init feature "
                                         "names!")
        else:
            self.__feature_names = copy.deepcopy(feature_names)

    AutoModeler.__init__(self,
                         f'{dataset_name}/{dataset_sub_dir}',
                         overwrite_full_path)

    # Define model
    self.__cluster_models_paths = dict()

    self.__notebook_mode = copy.deepcopy(notebook_mode)

    self.__models_suggested_clusters = dict()

    self.__pca = None
    self.__first_scaler = None
    self.__second_scaler = None
    self.__cutoff_index = None
    self.__ordered_dp_indexes = None

    self.__pca_perc = pca_perc

    # --- Apply pca ---
    if pca_perc:

        # Create scaler object
        scaler = StandardScaler()
        scaled = scaler.fit_transform(df)

        self.__first_scaler = copy.deepcopy(scaler)

        print("\nInspecting scaled results!")
        self.__inspect_feature_matrix(matrix=scaled,
                                      feature_names=self.__feature_names,
                                      sub_dir="PCA",
                                      filename="Applied scaler results")

        pca, scaled = self.__visualize_pca_variance(scaled)

        self.__pca = pca

        # Generate "dummy" feature names
        pca_feature_names = ["PCA_Feature_" + str(i)
                             for i in range(1,
                                            len(self.__feature_names) + 1)]

        print("\nInspecting applied scaler and pca results!")
        self.__inspect_feature_matrix(matrix=scaled,
                                      feature_names=pca_feature_names,
                                      sub_dir="PCA",
                                      filename="Applied scaler and PCA results")

        if pca_perc < 1.0:
            # Find the cutoff point on the cumulative sum
            cutoff_index = np.where(
                pca.explained_variance_ratio_.cumsum() > pca_perc)[0][0]
        else:
            cutoff_index = scaled.shape[1] - 1

        print("After applying pca with a cutoff percentage of {0}% "
              "for the cumulative sum, using features 1 to {1}.".format(
                  pca_perc, cutoff_index + 1))

        print("Old shape {0}".format(scaled.shape))

        scaled = scaled[:, :cutoff_index + 1]
        pca_feature_names = pca_feature_names[0: cutoff_index + 1]

        print("New shape {0}".format(scaled.shape))

        scaled = scaler.fit_transform(scaled)

        print("\nInspecting data after the final scaler is applied!")
        self.__inspect_feature_matrix(matrix=scaled,
                                      feature_names=pca_feature_names,
                                      sub_dir="PCA",
                                      filename="Applied final scaler to process")

        self.__second_scaler = copy.deepcopy(scaler)

        self.__scaled = scaled
        self.__cutoff_index = cutoff_index

    # Assume PCA has already been applied; pass the data as a matrix
    else:
        self.__scaled = df.values

    # Save objects to the directory structure
    if self.__pca:
        pipeline_path = create_dir_structure(self.folder_path,
                                             "Data Cluster Pipeline")

        # Pickle data pipeline objects
        pickle_object_to_file(self.__pca,
                              pipeline_path,
                              "PCA")

        pickle_object_to_file(self.__first_scaler,
                              pipeline_path,
                              "First Scaler")

        pickle_object_to_file(self.__second_scaler,
                              pipeline_path,
                              "Second Scaler")

        pickle_object_to_file(self.__pca_perc,
                              pipeline_path,
                              "PCA Percentage")

        # Save dimensions and cutoff index
        write_object_text_to_file(self.__cutoff_index,
                                  pipeline_path,
                                  "Cutoff Index")

        write_object_text_to_file(self.__cutoff_index + 1,
                                  pipeline_path,
                                  "Dimensions")
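# --- Illustrative sketch (not part of the eflow source) ---
# Condensed version of the scale -> PCA -> cumulative-variance cutoff ->
# rescale flow used above, written directly against scikit-learn/numpy.
# 'reduce_with_pca' is a hypothetical name.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def reduce_with_pca(df, pca_perc=0.95):
    first_scaler = StandardScaler()
    scaled = first_scaler.fit_transform(df)

    pca = PCA()
    transformed = pca.fit_transform(scaled)

    if pca_perc < 1.0:
        # First component index where the cumulative explained variance
        # exceeds the requested percentage
        cutoff_index = np.where(
            pca.explained_variance_ratio_.cumsum() > pca_perc)[0][0]
    else:
        cutoff_index = transformed.shape[1] - 1

    transformed = transformed[:, :cutoff_index + 1]

    # Re-scale the kept components, mirroring the second scaler above
    second_scaler = StandardScaler()
    reduced = second_scaler.fit_transform(transformed)
    return reduced, pca, first_scaler, second_scaler, cutoff_index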
def add(self,
        segment_name,
        pipeline_segment_obj):
    """
    Args:
        segment_name (str):
            An alias used to refer to this segment.

        pipeline_segment_obj (child of DataPipelineSegment):
            A child object of type DataPipelineSegment.

    Returns:
        Attempts to add a pipeline segment object to the object's queue
        and update its related json object.
    """
    # pipeline_segment_obj = copy.deepcopy(pipeline_segment_obj)

    # Type check
    if not isinstance(pipeline_segment_obj, DataPipelineSegment):
        raise UnsatisfiedRequirments(f"Expected a 'DataPipelineSegment' "
                                     f"object; received "
                                     f"'{type(pipeline_segment_obj)}'")

    # Check if the alias has already been used
    if segment_name in self.__pipeline_segment_names:
        raise PipelineError(f"The '{segment_name}' pipeline segment is "
                            f"already in this pipeline. Please choose a "
                            f"different segment name.")

    try:
        # Check if the pipeline segment has already been used
        segment_path_id = pipeline_segment_obj.relative_folder_path + \
                          pipeline_segment_obj.file_name
    except AttributeError:
        raise UnsatisfiedRequirments("The given pipeline segment didn't "
                                     "perform any functions.")

    if segment_path_id in self.__pipeline_segment_path_id:
        raise PipelineError("The segment has already been found in this "
                            "pipeline. Segment path id: "
                            f"'{segment_path_id}'.\n"
                            "This can be resolved by:"
                            "\n\t* Creating a completely new segment object "
                            "and adding it to the pipeline with the 'add' "
                            "method."
                            "\n\t* Referring to a different segment path id.")
    else:
        # The queue has yet to have data pushed; set up the output
        # directory for the new data
        if len(self.__pipeline_segment_deque) == 0:
            FileOutput.__init__(self,
                                f'_Extras/Pipeline Structure/Data Pipeline/'
                                f'{self.__pipeline_name}')

            if os.path.exists(self.folder_path + "df_features.json"):
                self.__df_features.init_on_json_file(
                    self.folder_path + "df_features.json")
            else:
                self.__df_features.create_json_file_representation(
                    self.folder_path,
                    "df_features.json")

        # Update data types used for error checking
        self.__pipeline_segment_names.add(segment_name)
        self.__pipeline_segment_path_id.add(segment_path_id)

        # Main component of the project
        self.__pipeline_segment_deque.append((segment_name,
                                              segment_path_id,
                                              pipeline_segment_obj))

        # Lock down the object to prevent users from continuing to
        # interact with it after adding it to the pipeline
        pipeline_segment_obj._DataPipelineSegment__lock_interaction = True

        # Update/Create the json file
        self.__create_json_pipeline_file()
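# --- Illustrative sketch (not part of the eflow source) ---
# The bookkeeping above in miniature: a deque preserves segment order while
# two sets reject duplicate aliases and duplicate segment path ids.
# 'MiniPipeline' is a hypothetical class used only for this example.
from collections import deque

class MiniPipeline:
    def __init__(self):
        self._segments = deque()
        self._names = set()
        self._path_ids = set()

    def add(self, segment_name, segment_path_id, segment_obj):
        if segment_name in self._names:
            raise ValueError(f"Segment name '{segment_name}' already used.")
        if segment_path_id in self._path_ids:
            raise ValueError(f"Segment path id '{segment_path_id}' already used.")

        self._names.add(segment_name)
        self._path_ids.add(segment_path_id)
        self._segments.append((segment_name, segment_path_id, segment_obj))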
def make_dummies(self,
                 df,
                 df_features,
                 qualitative_features=[],
                 _feature_values_dict=None,
                 _add_to_que=True):
    """
    Creates dummy features based on qualitative feature data and removes
    the original feature.

    Note:
        _feature_values_dict does not need to be initialized. It is used
        as a backend resource.

    Args:
        df: pd.DataFrame
            Pandas dataframe.

        df_features: DataFrameTypes from eflow
            DataFrameTypes object.

        qualitative_features: collection of strings
            Feature names whose data should be converted into dummy
            features.

        _add_to_que: bool
            Hidden variable to determine if the function should be pushed
            to the pipeline segment.
    """
    # Convert to the correct types
    if isinstance(qualitative_features, str):
        qualitative_features = [qualitative_features]

    if not _feature_values_dict:
        _feature_values_dict = dict()

    pd.set_option('mode.chained_assignment', None)

    for cat_feature in qualitative_features:

        if cat_feature not in df_features.string_features() | \
                df_features.categorical_features():
            raise UnsatisfiedRequirments(f"No feature named '{cat_feature}' "
                                         f"in categorical or string "
                                         f"features.")

        # Collect the sorted unique values of the feature (ignoring NaNs)
        if cat_feature not in _feature_values_dict:
            _feature_values_dict[cat_feature] = df[cat_feature].dropna().unique()
            _feature_values_dict[cat_feature].sort()
            _feature_values_dict[cat_feature] = \
                _feature_values_dict[cat_feature].tolist()

        # Create one boolean dummy feature per unique value
        dummy_features = []
        for feature_value in _feature_values_dict[cat_feature]:
            new_feature = cat_feature + f"_{feature_value}"

            bool_array = df[cat_feature] == feature_value
            df[new_feature] = copy.deepcopy(bool_array)

            dummy_features.append(new_feature)

        # Remove the original feature and record the encoding
        df.drop(columns=[cat_feature],
                inplace=True)
        df_features.remove_feature(cat_feature)
        df_features.set_feature_to_dummy_encoded(cat_feature,
                                                 dummy_features)

    if _add_to_que:
        params_dict = locals()
        parameters = get_parameters(self.make_dummies)
        self._DataPipelineSegment__add_function_to_que("make_dummies",
                                                       parameters,
                                                       params_dict)
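# --- Illustrative sketch (not part of the eflow source) ---
# The manual dummy encoding above, reduced to plain pandas: one boolean
# column per observed value, then the original column is dropped.
# 'encode_dummies' is a hypothetical name; pd.get_dummies is the idiomatic
# shortcut for the same transformation.
import pandas as pd

def encode_dummies(df, feature):
    values = sorted(df[feature].dropna().unique().tolist())
    for value in values:
        df[f"{feature}_{value}"] = (df[feature] == value)
    return df.drop(columns=[feature])

# Example usage
frame = pd.DataFrame({"color": ["red", "blue", "red"], "size": [1, 2, 3]})
frame = encode_dummies(frame, "color")   # adds color_blue / color_red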