Example #1
    def __init__(self, directory):
        self.config = Config()
        self.logger = Logger(__name__)
        self.logger.info(f"Loading data from {directory}...")
        self.target_column_name = self.config.target_column
        self.feature_column_names = None
        self.is_single_file = self.config.csv_name is not None

        self.raw_data = self.read_all_data(directory)
        if self.raw_data is None: return
        self.logger.info(f"Done read [{len(self.raw_data)}] points.")
Example #2
    def __init__(self, directory):
        self.config = Config()
        self.logger = Logger(__name__)
        self.logger.info(f"Loading data from {directory}...")
        self.target_column_name = self.config.target_column
        self.feature_column_names = None
        self._label_encoder = self.load_labelencoder()
        self.is_single_file = self.config.csv_name is not None

        self.raw_data = self.read_all_data(directory)
        if self.raw_data is None: return

        nclasses = self.num_classes()
        self.logger.info(
            f"Done read [{len(self.raw_data)}] points with [{nclasses}] classes."
        )

        if nclasses < 2:
            self.logger.info("Data must have at least 2 classes.")
            self.raw_data = None
Example #3
class DataManager():
    def __init__(self, directory):
        self.config = Config()
        self.logger = Logger(__name__)
        self.logger.info(f"Loading data from {directory}...")
        self.target_column_name = self.config.target_column
        self.feature_column_names = None
        self._label_encoder = self.load_labelencoder()
        self.is_single_file = self.config.csv_name is not None

        self.raw_data = self.read_all_data(directory)
        if self.raw_data is None: return

        nclasses = self.num_classes()
        self.logger.info(
            f"Done read [{len(self.raw_data)}] points with [{nclasses}] classes."
        )

        if nclasses < 2:
            self.logger.info("Data must have at least 2 classes.")
            self.raw_data = None

    def read_all_data(self, directory):
        dataframe_from_csv = self.read_all_csv(directory)

        if self.is_single_file:
            if dataframe_from_csv is None:
                self.logger.info(
                    f"Not able to read any valid csv data in [{os.path.join(directory, self.config.csv_name)}]"
                )
                return None
            return dataframe_from_csv
        else:
            if dataframe_from_csv is None:
                self.logger.info(
                    f"Unable to read any valid data from *.csv files in [{directory}]"
                )
                return None

        if dataframe_from_csv[self.target_column_name].dtype == 'object':
            label = dataframe_from_csv.loc[:, self.target_column_name]
            dataframe_from_csv.loc[:, self.target_column_name] = \
                self._label_encoder.fit_transform(label)

            joblib.dump(
                self._label_encoder,
                os.path.join(self.config.cur_dir, "model", 'LabelEncoder.sav'))
            self.logger.info(
                f"Encoded label column ['{self.target_column_name}']")

        return dataframe_from_csv

    def read_all_csv(self, directory):

        help_string = " The csv file must contain a header, a target column and and at least one feature column." \
            " The target column name is set by the <input_column> variable of this run. The default value is 'target'."

        paths = []
        if self.is_single_file:
            paths = [os.path.join(directory, self.config.csv_name)]
        else:
            paths = glob.glob(os.path.join(directory, "*.csv"), recursive=True)

        frames = []
        for path in paths:
            try:
                self.logger.verbose(
                    f"Attempting to read data from csv [{path}]"
                    f" with delimiter [{self.config.delimiter}]")
                frame = pd.read_csv(path,
                                    error_bad_lines=False,
                                    delimiter=self.config.delimiter,
                                    encoding=self.config.encoding)
            except Exception as e:
                self.logger.info(
                    f"Failed to read csv [{path}] exception:\n{e}")
                continue

            if self.target_column_name not in frame.columns:
                self.logger.info(
                    f"File [{path}] does not have name [{self.target_column_name}] in header"
                    f"{list(frame.columns)}', skipping this file." +
                    help_string)
                continue

            frames.append(frame)
            self.logger.verbose(
                f"Read [{len(frame)}] data points from [{path}]\n")

        if len(frames) == 0: return None
        coalesced = reduce(lambda a, b: a.append(b), frames[1:], frames[0])

        return coalesced

    def validate(self, for_train=True):
        if self.raw_data is None: return False
        if not for_train: return True

        # Validate that every class has at least a minimum number of training examples
        min_count_per_class = int(
            len(self.get_data()) * self.config.class_percentage_validation)

        target_value_counts = self.get_data()[
            self.target_column_name].value_counts()
        not_enough_examples = target_value_counts[
            target_value_counts < min_count_per_class]

        if len(not_enough_examples) == 0: return True

        help_string = (
            f"Provided data does not have enough training examples to train,"
            f"you must provide at least {min_count_per_class} training examples of each class"
        )

        for class_name, number_examples in not_enough_examples.iteritems():
            self.logger.info(
                f"Class [{class_name}] has [{number_examples}] data points.")

        self.logger.info(help_string)
        return False

    @staticmethod
    def write_dataframe(frame, name, directory=None):
        config = Config()
        if directory is None:
            path = os.path.join(config.artifacts_directory, f"{name}.csv")
        else:
            path = os.path.join(directory, f"{name}.csv")
        frame.to_csv(path, index=False, sep=config.delimiter)
        checksum = DataManager.checksum(path)
        return checksum

    @staticmethod
    def checksum(path):
        hasher = hashlib.md5()
        with open(path, 'rb') as infile:
            hasher.update(infile.read())
        return hasher.hexdigest()

    def get_classes(self):
        return self.raw_data[self.target_column_name].unique()

    def num_classes(self):
        return len(self.get_classes())

    def get_data(self):
        return self.raw_data

    def get_target_column(self):
        return self.target_column_name

    def get_feature_columns(self):
        self.feature_column_names = list(
            self.raw_data.drop(self.target_column_name, axis=1).columns)
        return self.feature_column_names

    def load_labelencoder(self):
        if os.path.isfile(
                os.path.join(self.config.cur_dir, "model",
                             "LabelEncoder.sav")):
            self.logger.info(f"Loading label encoder...")
            return joblib.load(
                os.path.join(self.config.cur_dir, "model", "LabelEncoder.sav"))
        else:
            return LabelEncoder()
class Model():
    def __init__(self, is_infer_only=False):
        self.config = Config()
        self.logger = Logger(__name__)
        self._model = self.load_model()
        self._label_encoder = self.load_labelencoder()

    def train(self, directory):

        dm = DataManager(directory)

        if not dm.validate():
            raise UiPathUsageException("No valid data to run this pipeline.")

        data_df = dm.get_data()
        X = data_df[dm.get_feature_columns()].values
        y = data_df[dm.get_target_column()].values

        help_string = "Warning: You have retrained a model which was generated by a TPOT optimization pipeline.\
        \nFor optimal results please run the TPOT optimization pipeline from scratch by training package version [1.0]."

        if not self.is_trained() or self.config.warm_start == True:
            self._model = self.build_model(X, y)
        else:
            self._model.fit(X, y)
            self.logger.info(f"Finished retraining model.")
            self.logger.info(help_string)

        joblib.dump(self._model,
                    os.path.join(self.config.cur_dir, "model", "Model.sav"))

    def evaluate(self, evaluation_directory):

        dm = DataManager(evaluation_directory)

        if not dm.validate(for_train=False):
            self.logger.info(
                "No valid test data to run this evaluation pipeline.")
            return None

        data_df = dm.get_data()
        X = data_df[dm.get_feature_columns()].values
        y = data_df[dm.get_target_column()].values

        if not self.is_trained():
            self.logger.info(_UNTRAINED_HELP)
        else:
            score = self._model.score(X, y)
            self.logger.info(f"Evaluation score = {score}")
            return score

    def process_data(self, directory):

        if not self.config.test_data_from_ui:
            dm = DataManager(directory)

            if not dm.validate():
                raise UiPathUsageException(
                    "No valid data to run this pipeline.")

            all_data = dm.get_data()

            # Split the data into train and test sets
            percentage = self.config.process_data_split_percentage

            train, test = train_test_split(
                all_data,
                test_size=percentage,
                random_state=self.config.seed,
            )

            # Write train.csv
            DataManager.write_dataframe(train, 'train',
                                        self.config.train_data_directory)

            # Write test.csv
            DataManager.write_dataframe(test, 'test',
                                        self.config.test_data_directory)
        else:
            dm = DataManager(directory)

            if not dm.validate():
                raise UiPathUsageException(
                    "No valid data to run this pipeline.")

            all_data = dm.get_data()

            # Write train.csv
            DataManager.write_dataframe(all_data, 'train',
                                        self.config.train_data_directory)
            self.logger.info(
                "Did not split data into train and test sets. Model will be evaluated on data selected from UI."
            )

    def build_model(self, X, y):
        # Perform missing value imputation as scikit-learn models can't handle NaN's
        nan_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
        X = nan_imputer.fit_transform(X)

        # The target column holds encoded class labels, so use TPOT's classifier
        pipeline_optimizer = TPOTClassifier(
            generations=self.config.generations,
            population_size=self.config.population_size,
            offspring_size=self.config.offspring_size,
            mutation_rate=self.config.mutation_rate,
            crossover_rate=self.config.crossover_rate,
            scoring=self.config.scoring,
            cv=self.config.cv,
            subsample=self.config.subsample,
            n_jobs=-1,
            max_time_mins=self.config.max_time_mins,
            max_eval_time_mins=self.config.max_eval_time_mins,
            random_state=self.config.seed,
            config_dict=self.config.classifier_config_dict,
            warm_start=self.config.warm_start,
            memory=self.config.artifacts_directory,
            verbosity=1)

        # Fit TPOT to data
        pipeline_optimizer.fit(X, y)
        self.logger.info(f"Finished running TPOT optimization pipeline.")

        # Export fitted pipeline to artifacts directory
        pipeline_path = os.path.join(self.config.artifacts_directory,
                                     "TPOT_pipeline.py")
        pipeline_optimizer.export(pipeline_path)
        self.logger.info(f"Saving best pipeline to {pipeline_path}")

        # Create new pipeline which contains nan_imputer
        pipe = Pipeline([
            ("nan_imputer", nan_imputer),
            ("tpot_pipeline", pipeline_optimizer.fitted_pipeline_),
        ])
        return pipe

    def predict(self, mlskill_input):

        data = pd.read_json(mlskill_input)
        predictions = self._model.predict(data.values)
        return json.dumps(predictions.tolist())

    def load_model(self):
        if os.path.isfile(
                os.path.join(self.config.cur_dir, "model", "Model.sav")):
            self.logger.info(f"Loading pre-trained model...")
            return joblib.load(
                os.path.join(self.config.cur_dir, "model", "Model.sav"))
        else:
            return None

    def load_labelencoder(self):
        if os.path.isfile(
                os.path.join(self.config.cur_dir, "model",
                             "LabelEncoder.sav")):
            self.logger.info(f"Loading label encoder...")
            return joblib.load(
                os.path.join(self.config.cur_dir, "model", "LabelEncoder.sav"))
        else:
            return None

    def is_trained(self):
        return self._model is not None
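
For orientation, the sketch below shows one way the DataManager and Model classes above might be wired together in a single entry point. The run_pipeline function is hypothetical and assumes Config exposes the same train_data_directory and test_data_directory fields that process_data already uses; it is not part of the original examples.

def run_pipeline(raw_data_dir):
    """Hypothetical driver combining the classes above (illustrative only)."""
    config = Config()
    model = Model()

    # Split the raw CSVs into train.csv / test.csv under the directories
    # configured in Config (or keep everything as training data when the
    # test set is selected from the UI).
    model.process_data(raw_data_dir)

    # Build a fresh TPOT pipeline, or refit the previously saved model.
    model.train(config.train_data_directory)

    # Score the fitted pipeline on the held-out data; returns None when no
    # valid test data is found.
    return model.evaluate(config.test_data_directory)
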
Example #6
class DataManager():
    def __init__(self, directory):
        self.config = Config()
        self.logger = Logger(__name__)
        self.logger.info(f"Loading data from {directory}...")
        self.target_column_name = self.config.target_column
        self.feature_column_names = None
        self.is_single_file = self.config.csv_name is not None

        self.raw_data = self.read_all_data(directory)
        if self.raw_data is None: return
        self.logger.info(f"Done read [{len(self.raw_data)}] points.")

    def read_all_data(self, directory):
        dataframe_from_csv = self.read_all_csv(directory)

        if self.is_single_file:
            if dataframe_from_csv is None:
                self.logger.info(
                    f"Not able to read any valid csv data in [{os.path.join(directory, self.config.csv_name)}]"
                )
                return None
            return dataframe_from_csv
        else:
            if dataframe_from_csv is None:
                self.logger.info(
                    f"Unable to read any valid data from *.csv files in [{directory}]"
                )
                return None

        return dataframe_from_csv

    def read_all_csv(self, directory):

        help_string = " The csv file must contain a header, a target column and and at least one feature column." \
            " The target column name is set by the <input_column> variable of this run. The default value is 'target'."

        paths = []
        if self.is_single_file:
            paths = [os.path.join(directory, self.config.csv_name)]
        else:
            paths = glob.glob(os.path.join(directory, "*.csv"), recursive=True)

        frames = []
        for path in paths:
            try:
                self.logger.verbose(
                    f"Attempting to read data from csv [{path}]"
                    f" with delimiter [{self.config.delimiter}]")
                frame = pd.read_csv(path,
                                    error_bad_lines=False,
                                    delimiter=self.config.delimiter,
                                    encoding=self.config.encoding)
            except Exception as e:
                self.logger.info(
                    f"Failed to read csv [{path}] exception:\n{e}")
                continue

            if self.target_column_name not in frame.columns:
                self.logger.info(
                    f"File [{path}] does not have name [{self.target_column_name}] in header"
                    f"{list(frame.columns)}', skipping this file." +
                    help_string)
                continue

            frames.append(frame)
            self.logger.verbose(
                f"Read [{len(frame)}] data points from [{path}]\n")

        if len(frames) == 0: return None
        coalesced = reduce(lambda a, b: a.append(b), frames[1:], frames[0])

        return coalesced

    def validate(self, for_train=True):
        return self.raw_data is not None

    @staticmethod
    def write_dataframe(frame, name, directory=None):
        config = Config()
        if directory is None:
            path = os.path.join(config.artifacts_directory, f"{name}.csv")
        else:
            path = os.path.join(directory, f"{name}.csv")
        frame.to_csv(path, index=False, sep=config.delimiter)
        checksum = DataManager.checksum(path)
        return checksum

    @staticmethod
    def checksum(path):
        hasher = hashlib.md5()
        with open(path, 'rb') as infile:
            hasher.update(infile.read())
        return hasher.hexdigest()

    def get_data(self):
        return self.raw_data

    def get_target_column(self):
        return self.target_column_name

    def get_feature_columns(self):
        self.feature_column_names = list(
            self.raw_data.drop(self.target_column_name, axis=1).columns)
        return self.feature_column_names
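
As a usage note, here is a minimal standalone sketch of this simplified DataManager variant. The ./data path and the snapshot name are assumptions made purely for illustration.

if __name__ == "__main__":
    # Hypothetical standalone usage of the DataManager defined above.
    dm = DataManager("./data")
    if dm.validate():
        features = dm.get_data()[dm.get_feature_columns()]
        print(f"Loaded {len(features)} rows with {len(features.columns)} feature columns")
        # Persist a copy of the loaded frame to the configured artifacts
        # directory and report its MD5 checksum.
        checksum = DataManager.write_dataframe(dm.get_data(), "snapshot")
        print(f"Wrote snapshot.csv with MD5 checksum {checksum}")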