import logging
import os
import pickle
from inspect import isclass
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

import featuretools as ft
import pandas as pd

import cardea
# NOTE: these import paths are assumed from the names used below; adjust them
# to wherever EntitySetLoader, load_mimic_data, Featurization, Modeler and the
# problem definition classes live in your package layout.
from cardea.data_loader import EntitySetLoader, load_mimic_data
from cardea.featurization import Featurization
from cardea.modeling import Modeler
from cardea.problem_definition import (
    DiagnosisPrediction, LengthOfStay, MissedAppointment, MortalityPrediction,
    ProlongedLengthOfStay, Readmission)

LOGGER = logging.getLogger(__name__)

# The bucket name comes from the demo CSV URLs; DATA_PATH is an assumed
# default location for downloaded demo data.
BUCKET = 'dai-cardea'
S3_URL = 'https://{}.s3.amazonaws.com/{}'
DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')


class Cardea():
    """An interface class that ties the end-to-end system together.

    Attributes:
        es_loader (EntitySetLoader): An entityset loader.
        featurization (Featurization): A featurization class.
        modeler (Modeler): A modeling class.
        chosen_problem (ProblemDefinition): The selected prediction problem.
        es (featuretools.EntitySet): The loaded entityset.
        target_entity (str): The target entity for featurization.
    """

    def __init__(self):
        self.es_loader = EntitySetLoader()
        self.featurization = Featurization()

        self.es = None
        self.chosen_problem = None
        self.target_entity = None
        self.modeler = None

    def load_entityset(self, data, fhir=True):
        """Load the given dataset into an entityset.

        The dataset must be in FHIR or MIMIC structure format. The loaded
        entityset is stored in ``self.es``.

        Args:
            data (str):
                A directory of all .csv files that should be loaded. To load a demo
                dataset, pass the name of the dataset, "kaggle" or "mimic".
            fhir (bool):
                An indicator of whether to use the FHIR or the MIMIC schema.
        """
        demo = ['kaggle', 'mimic']
        if not os.path.exists(data) and data in demo:
            data = self.download_demo(data)

        if fhir:
            self.es = self.es_loader.load_data_entityset(data)
        else:
            self.es = load_mimic_data(data)

    @staticmethod
    def download_demo(name, data_path=DATA_PATH):
        """Download a demo dataset and store it as .csv files.

        Args:
            name (str):
                Name of the demo dataset, "kaggle" or "mimic".
            data_path (str):
                Directory in which the downloaded files will be stored.

        Returns:
            str:
                Path to the directory holding the downloaded dataset.
        """
        data_path = os.path.join(data_path, name)
        os.makedirs(data_path, exist_ok=True)

        url = S3_URL.format(BUCKET, '{}.zip'.format(name))
        LOGGER.info('Downloading dataset %s from %s', name, url)

        compressed = ZipFile(BytesIO(urlopen(url).read()))
        for file in compressed.namelist():
            filename = os.path.join(data_path, file)
            csv_file = compressed.open(file)
            data = pd.read_csv(csv_file, dtype=str)
            data.to_csv(filename, index=False)

        return data_path

    def list_problems(self):
        """Returns a list of the currently available problems.

        Returns:
            list:
                A sorted list of the available problem names.
        """
        problems = set()
        for attribute_string in dir(cardea.problem_definition):
            attribute = getattr(cardea.problem_definition, attribute_string)
            if isclass(attribute) and attribute.__name__ != 'ProblemDefinition':
                problems.add(attribute.__name__)

        return sorted(problems)

    def select_problem(self, selection, parameter=None):
        """Select a prediction problem and extract information.

        Update the ``chosen_problem`` attribute, generate the cutoff times and
        the target entity, and update the entityset accordingly.

        Args:
            selection (str):
                Name of the chosen prediction problem.
            parameter:
                A variable to change the default parameters, if any.

        Returns:
            pandas.DataFrame:
                A dataframe of cutoff times and their target labels. The updated
                entityset and the target entity are stored on the instance.
        """
        LOGGER.info("Selecting %s prediction problem", selection)

        # problem selection
        if selection == 'LengthOfStay':
            self.chosen_problem = LengthOfStay()
        elif selection == 'MortalityPrediction':
            self.chosen_problem = MortalityPrediction()
        elif selection == 'MissedAppointment':
            self.chosen_problem = MissedAppointment()
        elif selection == 'ProlongedLengthOfStay' and parameter:
            self.chosen_problem = ProlongedLengthOfStay(parameter)
        elif selection == 'ProlongedLengthOfStay':
            self.chosen_problem = ProlongedLengthOfStay()
        elif selection == 'Readmission' and parameter:
            self.chosen_problem = Readmission(parameter)
        elif selection == 'Readmission':
            self.chosen_problem = Readmission()
        elif selection == 'DiagnosisPrediction' and parameter:
            self.chosen_problem = DiagnosisPrediction(parameter)
        elif selection == 'DiagnosisPrediction':
            raise ValueError('unspecified diagnosis code')
        else:
            raise ValueError('{} is not a defined problem'.format(selection))

        # target label calculation
        self.es, self.target_entity, cutoff = self.chosen_problem.generate_cutoff_times(self.es)

        # set default pipeline
        if self.chosen_problem.prediction_type == "classification":
            pipeline = "Random Forest"
        else:
            pipeline = "Random Forest Regressor"

        self.modeler = Modeler(pipeline, self.chosen_problem.prediction_type)

        return cutoff

    def list_feature_primitives(self):
        """Returns the built-in primitives in Featuretools.

        Returns:
            pandas.DataFrame:
                A dataframe that lists and describes each built-in primitive.
        """
        return ft.list_primitives()

    def generate_features(self, cutoff):
        """Returns the calculated feature matrix.

        Args:
            cutoff (pandas.DataFrame):
                A dataframe that indicates the cutoff time for each instance.

        Returns:
            pandas.DataFrame:
                The generated feature matrix, with categorical features encoded.
        """
        fm_encoded, _ = self.featurization.generate_feature_matrix(
            self.es, self.target_entity, cutoff)
        fm_encoded = fm_encoded.reset_index(drop=True)
        return fm_encoded

    def select_pipeline(self, pipeline):
        """Select a pipeline.

        Args:
            pipeline (MLPipeline or str):
                A pipeline instance or the name/path of a pipeline.
        """
        LOGGER.info("Selecting %s pipeline", pipeline)
        self.modeler = Modeler(pipeline, self.chosen_problem.prediction_type)

    def train_test_split(self, X, y, test_size, shuffle):
        """Split the dataset into a training set and a testing set.

        Args:
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.
            y (pandas.Series or ndarray):
                Target values.
            test_size (float):
                The proportion of the dataset to include in the test dataset.
            shuffle (bool):
                Whether or not to shuffle the data before splitting.

        Returns:
            list:
                List containing the train-test split of the inputs and targets.
        """
        return self.modeler.train_test_split(X, y, test_size, shuffle)

    def fit(self, X, y, tune=False, max_evals=10, scoring=None, verbose=False):
        """Train the cardea pipeline.

        Args:
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.
            y (pandas.Series or ndarray):
                Target values.
            tune (bool):
                Whether to optimize hyper-parameters of the pipelines.
            max_evals (int):
                Maximum number of hyper-parameter optimization iterations.
            scoring (str):
                The name of the scoring function used in the hyper-parameter optimization.
            verbose (bool):
                Whether to log information during processing.
        """
        self.modeler.fit(X, y, tune, max_evals, scoring, verbose)

    def predict(self, X):
        """Get predictions from the cardea pipeline.

        Args:
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.

        Returns:
            ndarray:
                Predictions to the input data.
        """
        return self.modeler.predict(X)

    def fit_predict(self, X, y, tune=False, max_evals=10, scoring=None, verbose=False):
        """Train a cardea pipeline then make predictions.

        Args:
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.
            y (pandas.Series or ndarray):
                Target values.
            tune (bool):
                Whether to optimize hyper-parameters of the pipelines.
            max_evals (int):
                Maximum number of hyper-parameter optimization iterations.
            scoring (str):
                The name of the scoring function used in the hyper-parameter optimization.
            verbose (bool):
                Whether to log information during processing.

        Returns:
            ndarray:
                Predictions to the input data.
        """
        return self.modeler.fit_predict(X, y, tune, max_evals, scoring, verbose)

    def evaluate(self, X, y, test_size=0.2, shuffle=True, tune=False, max_evals=10,
                 scoring=None, metrics=None, verbose=False):
        """Evaluate the cardea pipeline.

        Args:
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.
            y (pandas.Series or ndarray):
                Target values.
            test_size (float):
                The proportion of the dataset to include in the test dataset.
            shuffle (bool):
                Whether or not to shuffle the data before splitting.
            tune (bool):
                Whether to optimize hyper-parameters of the pipelines.
            max_evals (int):
                Maximum number of hyper-parameter optimization iterations.
            scoring (str):
                The name of the scoring function used in the hyper-parameter optimization.
            metrics (list):
                A list of scoring function names. The scoring functions should be
                consistent with the problem type.
            verbose (bool):
                Whether to log information during processing.
        """
        return self.modeler.evaluate(
            X, y, test_size, shuffle, tune, max_evals, scoring, metrics, verbose)

    def save(self, path):
        """Save this object using pickle.

        Args:
            path (str):
                Path to the file where the serialization of this object will be stored.
        """
        # Only create parent directories when the path actually contains them,
        # so that bare filenames such as 'cardea.pkl' also work.
        dirname = os.path.dirname(path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)

        with open(path, 'wb') as pickle_file:
            pickle.dump(self, pickle_file)

    @classmethod
    def load(cls, path: str):
        """Load a Cardea instance from a pickle file.

        Args:
            path (str):
                Path to the file where the instance has been previously serialized.

        Returns:
            Cardea:
                A Cardea instance.

        Raises:
            ValueError:
                If the serialized object is not a Cardea instance.
        """
        with open(path, 'rb') as pickle_file:
            instance = pickle.load(pickle_file)
            if not isinstance(instance, cls):
                raise ValueError('Serialized object is not a Cardea instance')

        return instance