Пример #1
0
def pycaret_model(get_pycaret_data) -> t.Any:
    """Train, tune and finalize a PyCaret ``"dt"`` model.

    ``get_pycaret_data`` is unpacked into exactly two items; the first is
    used as the training data and the second is ignored.

    Returns:
        The object returned by ``finalize_model`` for the tuned model.
    """
    training_frame, _unused = get_pycaret_data

    # silent=True skips the interactive confirmation of inferred data types.
    pycaret_setup(
        data=training_frame,
        target="default",
        session_id=123,
        silent=True,
    )

    # create -> tune -> finalize, evaluated innermost first.
    return finalize_model(tune_model(create_model("dt")))
 def _classify(self, combined_data):
     """Set up the classifier on ``combined_data``, train it and predict.

     The model identifier is read from ``self.task.pycaret_model``. When
     ``self.task.output_dir`` is truthy the trained classifier is passed
     to ``self._store_classifier``.

     Returns:
         The result of ``classification.predict_model`` called without
         explicit data (the original note describes this as predicting on
         the test set).
     """
     # TODO: check for ordinal and categorical features
     self._classifier_setup(combined_data)

     model_id = self.task.pycaret_model
     trained = classification.create_model(model_id, verbose=False)

     # Persist the classifier only when an output directory is configured.
     if self.task.output_dir:
         self._store_classifier(trained)

     # TODO: get raw_scores for AUC
     return classification.predict_model(trained, verbose=False)
Пример #3
0
    def fit(self,
            train: pd.DataFrame,
            test: pd.DataFrame,
            target: str = "name",
            finetune: bool = False,
            text_feature: str = "text",
            **kwargs) -> Pipeline:
        """Train (and optionally tune) an SVM model and save it to disk.

        Args:
            train (pd.DataFrame): training data passed to ``setup``.
            test (pd.DataFrame): dataset passed to ``setup`` as ``test_data``.
            target (str, optional): name of the target column.
                Defaults to "name".
            finetune (bool, optional): when True the model is tuned with
                Optuna (TPE, 200 iterations, 3 folds). Defaults to False.
            text_feature (str, optional): column routed through the
                CountVectorizer + TF-IDF pipeline. Defaults to "text".
            **kwargs: extra keyword arguments forwarded to ``setup``.

        Returns:
            Pipeline: first element of the pair returned by ``save_model``;
            it is also stored on ``self.pipeline``.
        """
        # Text column: token counts followed by TF-IDF weighting.
        text_branch = Pipeline([
            ('vect', CountVectorizer(lowercase=True)),
            ('tfidf', TfidfTransformer()),
        ])
        # Object-dtype columns: one-hot encoding that ignores unseen values.
        categorical_branch = OneHotEncoder(handle_unknown="ignore")
        object_columns = make_column_selector(dtype_include=object)
        column_transformer = make_column_transformer(
            (text_branch, text_feature),
            (categorical_branch, object_columns),
        )

        setup_options = dict(
            target=target,
            test_data=test,
            session_id=123,
            custom_pipeline=column_transformer,
            preprocess=False,
            numeric_features=["duration", "attendee_cnt"],
            silent=True,
        )
        self.clf = setup(train, **setup_options, **kwargs)

        estimator = create_model('svm', fold=3)
        if finetune:
            estimator = tune_model(
                estimator,
                search_library="optuna",
                search_algorithm="tpe",
                n_iter=200,
                fold=3,
            )

        saved = save_model(finalize_model(estimator), "trained_model")
        self.pipeline, self.filename = saved
        return self.pipeline
import pandas as pd
import numpy
from pycaret.classification import setup, create_model, tune_model, save_model

# Load the HR training records.
train_data = pd.read_csv("../data/HR_training_data.csv")

# Initialise the PyCaret environment; 'left' is the target column and the
# session id is fixed at 123.
employee_class = setup(
    data=train_data,
    target='left',
    session_id=123,
)

# Create a LightGBM model, then tune it by optimizing on AUC.
lightgbm = create_model('lightgbm')
tuned_lightgbm = tune_model(
    lightgbm,
    optimize='AUC',
)

# Save the tuned model.
save_model(
    tuned_lightgbm,
    '../model/employees_churn_model',
)
from pycaret.classification import compare_models
from pandas import read_csv

# Load the training table. A previous revision read it from
# 'data/envtrain_xv.csv' instead.
data = read_csv('data_2.0/envtrain_xv.csv')

# Dropping the 'Unnamed: 0' column is currently disabled:
# data = data.drop(['Unnamed: 0'], axis=1)

# Initialise the PyCaret experiment: 'pa' is the target, experiment logging
# is enabled under the name 'xv-21', and 'bclim14' is forced to be numeric.
exp_clf = setup(data,
                target='pa',
                log_experiment=True,
                experiment_name='xv-21',
                session_id=110,
                numeric_features=['bclim14'])

# create models
etrees = create_model('et')
xgboost = create_model('xgboost')
catboost = create_model('catboost')
rf = create_model('rf')
lgbm = create_model('lightgbm')
log = create_model('lr')

# save models as .pkl files
# finalize_model returns a new trained model and leaves its argument as it
# was, so the returned object is what must be saved. The original code
# discarded it and saved the non-finalized estimator.
# NOTE(review): finalize_model is not among the imports visible near this
# script - confirm it is imported elsewhere in the file.
final_etrees = finalize_model(etrees)
save_model(final_etrees, 'classifier_models(pkl)/xant_etrees')

final_xgboost = finalize_model(xgboost)
save_model(final_xgboost, 'classifier_models(pkl)/xant_xgb')

final_catboost = finalize_model(catboost)
save_model(final_catboost, 'classifier_models(pkl)/xant_cboost')
Пример #6
0
def app_main():
    """Render the Streamlit page of the machine-learning analysis platform.

    Each sidebar checkbox enables one section of the page:

    * 'Define Data Source' - choose a CSV file from a folder and a row
      limit (-1 is displayed as 'All').
    * 'Exploratory Analysis' - build a ProfileReport for the selected file.
    * 'Modeling' - choose task, model and target column, then run the
      PyCaret ``setup`` and ``create_model`` calls with experiment logging.
    * 'View System Log' - show the last lines of the training log.
    * Saved runs - list what ``mlflow.search_runs`` returns, load one model
      and predict on the selected file.

    Returns:
        None. All output goes to the Streamlit page.
    """
    st.title("Machine learning analysis platform")

    # Defaults for every path that ends without a selected file. Without
    # them an empty data folder left both names unbound and the later
    # sections raised NameError.
    file_selected_path = None
    nrows = 100
    if st.sidebar.checkbox('Define Data Source'):
        filesFolder = st.sidebar.text_input('folder', value="data")
        dataList = list_files(filesFolder, 'csv')
        if len(dataList) == 0:
            st.warning('No data set available')
        else:
            file_selected = st.sidebar.selectbox('Select a document', dataList)
            file_selected_path = concat_file_path(filesFolder, file_selected)
            nrows = st.sidebar.number_input('Number of lines', value=-1)
            n_rows_str = 'All' if nrows == -1 else str(nrows)
            # f-string so the placeholders are actually substituted.
            st.info(
                f'Selected file:{file_selected_path},The number of rows read is{n_rows_str}'
            )
    else:
        st.warning('The currently selected file is empty, please select:')

    if st.sidebar.checkbox('Exploratory Analysis'):
        if file_selected_path is not None:
            if st.sidebar.button('Report Generation'):
                df = load_csv(file_selected_path, nrows)
                pr = ProfileReport(df, explorative=True)
                st_profile_report(pr)
        else:
            st.info('No file selected, analysis cannot be performed')

    if st.sidebar.checkbox('Modeling'):
        if file_selected_path is not None:
            task = st.sidebar.selectbox('Select Task', ML_LIST)
            if task == 'Regression':
                model = st.sidebar.selectbox('Select Model', RG_LIST)
            elif task == 'Classification':
                # NOTE(review): this offers the regression model list for
                # classification as well - confirm whether a separate
                # classification list should be used here.
                model = st.sidebar.selectbox('Select Model', RG_LIST)
            df = load_csv(file_selected_path, nrows)
            try:
                cols = df.columns.to_list()
                target_col = st.sidebar.selectbox('Select Prediction Object',
                                                  cols)
            except Exception:
                # Exception rather than BaseException, so KeyboardInterrupt
                # and SystemExit are not swallowed.
                st.sidebar.warning('The data format cannot be read correctly')
                target_col = None

            if target_col is not None and st.sidebar.button('Training Model'):
                if task == 'Regression':
                    st.success('Data preprocessing...')
                    pc_rg.setup(df,
                                target=target_col,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('Data preprocessing is complete')
                    st.success('Training model. . .')
                    # finalize_model is not called; the return value of
                    # create_model is discarded.
                    pc_rg.create_model(model, verbose=False)
                    st.success('The model training is complete. . .')
                    st.success('Model has been created')
                elif task == 'Classification':
                    st.success('Data preprocessing. . .')
                    pc_cl.setup(df,
                                target=target_col,
                                fix_imbalance=True,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('Data preprocessing is complete.')
                    st.success('Training model. . .')
                    # finalize_model is not called; the return value of
                    # create_model is discarded.
                    pc_cl.create_model(model, verbose=False)
                    st.success('The model training is complete. . .')
                    st.success('Model has been created')

    if st.sidebar.checkbox('View System Log'):
        n_lines = st.sidebar.slider(label='Number of lines',
                                    min_value=3,
                                    max_value=50)
        if st.sidebar.button("Check View"):
            logs = get_model_training_logs(n_lines=n_lines)
            st.text('System log')
            st.write(logs)

    # Best effort: any failure while querying MLflow is treated as
    # "no runs available".
    try:
        allOfRuns = mlflow.search_runs(experiment_ids=0)
    except Exception:
        allOfRuns = []
    if len(allOfRuns) != 0:
        if st.sidebar.checkbox('Preview model'):
            ml_logs = 'http://kubernetes.docker.internal:5000/  -->Open mlflow, enter the command line: mlflow ui'
            st.markdown(ml_logs)
            st.dataframe(allOfRuns)
        if st.sidebar.checkbox('Choose a model'):
            # Only runs tagged as coming from create_model are offered.
            selected_run_id = st.sidebar.selectbox(
                'Choose from saved models',
                allOfRuns[allOfRuns['tags.Source'] ==
                          'create_model']['run_id'].tolist())
            selected_run_info = allOfRuns[(
                allOfRuns['run_id'] == selected_run_id)].iloc[0, :]
            st.code(selected_run_info)
            if st.sidebar.button('Forecast data'):
                if file_selected_path is None:
                    # Without a data file there is nothing to predict on.
                    st.info('No file selected, analysis cannot be performed')
                else:
                    model_uri = 'runs:/' + selected_run_id + '/model/'
                    model_loaded = mlflow.sklearn.load_model(model_uri)
                    # -1 means "all rows" in this UI; pandas expects None
                    # for that and rejects negative values.
                    row_limit = None if nrows == -1 else nrows
                    df = pd.read_csv(file_selected_path, nrows=row_limit)
                    pred = model_loaded.predict(df)
                    pred_df = pd.DataFrame(pred, columns=['Predictive Data'])
                    st.dataframe(pred_df)
                    # Pass the figure explicitly instead of relying on the
                    # global pyplot figure.
                    axes = pred_df.plot()
                    st.pyplot(axes.get_figure())
    else:
        st.sidebar.warning('Did not find a trained model')
Пример #7
0
import seaborn as sns
import plotly.express as px
import pycaret.classification as pyclf
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

st.set_page_config(layout="wide")

# Load the data: skip the first spreadsheet row, index by 'ID', then keep a
# random sample of 1000 rows.
df = pd.read_excel(
    'data/default of credit card clients.xls',
    skiprows=1,
    index_col='ID',
).sample(1000)

# Set up PyCaret, create a LightGBM classifier and tune it, keeping the
# tuner object as well.
setup = pyclf.setup(
    df,
    target='default payment next month',
    silent=True,
)
lgbm = pyclf.create_model('lightgbm')
lgbm, tuner = pyclf.tune_model(lgbm, return_tuner=True)

# NOTE(review): this averages mean_test_score over every entry in
# cv_results_, not only the selected one - confirm that is the figure
# meant to be shown as "CV Accuracy".
mean_test_scores = tuner.cv_results_['mean_test_score']
cv_acc = round(mean_test_scores.mean(), 3)
st.title(f"CV Accuracy is {cv_acc}")

# EDA plots: heatmap of the phik matrix and a grouped histogram of PAY_0
# coloured by the target.
phik_corr = df.phik_matrix()
correlogram = sns.heatmap(phik_corr)

barchart = px.histogram(
    df,
    x='PAY_0',
    color='default payment next month',
    barmode='group',
)

col1, col2 = st.columns(2)
Пример #8
0
def app_main():
    """Render the Streamlit page of the automated machine-learning platform.

    Each sidebar checkbox enables one section of the page:

    * data source - choose a CSV file from a folder and a row limit
      (-1 is displayed as "all").
    * exploratory analysis - build a ProfileReport for the selected file.
    * quick modeling - choose task, model and target column, then run the
      PyCaret ``setup`` and ``create_model`` calls with experiment logging.
    * system log - show the last lines of the training log.
    * saved runs - list what ``mlflow.search_runs`` returns, load one model
      and predict on the selected file.

    Returns:
        None. All output goes to the Streamlit page.
    """
    st.title("自动化机器学习平台")

    # Defaults for every path that ends without a selected file. Without
    # them an empty data folder left both names unbound and the later
    # sections raised NameError.
    file_selected_path = None
    nrows = 100
    if st.sidebar.checkbox('定义数据源'):
        file_folder = st.sidebar.text_input('文件夹', value="data")
        data_file_list = list_files(file_folder, 'csv')
        if len(data_file_list) == 0:
            st.warning('当路径无可用数据集')
        else:
            file_selected = st.sidebar.selectbox('选择文件', data_file_list)
            file_selected_path = concat_file_path(file_folder, file_selected)
            nrows = st.sidebar.number_input('行数', value=-1)
            n_rows_str = '全部' if nrows == -1 else str(nrows)
            st.info(f'已选择文件:{file_selected_path},读取行数为{n_rows_str}')
    else:
        st.warning('当前选择文件为空,请选择。')

    if st.sidebar.checkbox('探索性分析'):
        if file_selected_path is not None:
            if st.sidebar.button('一键生成报告'):
                df = load_csv(file_selected_path, nrows)
                pr = ProfileReport(df, explorative=True)
                st_profile_report(pr)
        else:
            st.info('没有选择文件,无法进行分析。')

    if st.sidebar.checkbox('快速建模'):
        if file_selected_path is not None:
            task = st.sidebar.selectbox('选择任务', ML_TASK_LIST)
            if task == '回归':
                model = st.sidebar.selectbox('选取模型', RG_MODEL_LIST)
            elif task == '分类':
                # NOTE(review): this offers the regression model list for
                # classification as well - confirm whether a separate
                # classification list should be used here.
                model = st.sidebar.selectbox('选取模型', RG_MODEL_LIST)
            df = load_csv(file_selected_path, nrows)
            try:
                cols = df.columns.to_list()
                target_col = st.sidebar.selectbox('选取预测对象', cols)
            except Exception:
                # Exception rather than BaseException, so KeyboardInterrupt
                # and SystemExit are not swallowed.
                st.sidebar.warning('数据格式无法正确读取')
                target_col = None

            if target_col is not None and st.sidebar.button('训练模型'):
                if task == '回归':
                    st.success('数据预处理。。。')
                    pc_rg.setup(df,
                                target=target_col,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('数据预处理完毕。')
                    st.success('训练模型。。。')
                    # finalize_model is not called; the return value of
                    # create_model is discarded.
                    pc_rg.create_model(model, verbose=False)
                    st.success('模型训练完毕。。。')
                    st.success('模型已经创建')
                elif task == '分类':
                    st.success('数据预处理。。。')
                    pc_cl.setup(df,
                                target=target_col,
                                fix_imbalance=True,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('数据预处理完毕。')
                    st.success('训练模型。。。')
                    # finalize_model is not called; the return value of
                    # create_model is discarded.
                    pc_cl.create_model(model, verbose=False)
                    st.success('模型训练完毕。。。')
                    st.success('模型已经创建')

    if st.sidebar.checkbox('查看系统日志'):
        n_lines = st.sidebar.slider(label='行数', min_value=3, max_value=50)
        if st.sidebar.button("查看"):
            logs = get_model_training_logs(n_lines=n_lines)
            st.text('系统日志')
            st.write(logs)

    # Best effort: any failure while querying MLflow is treated as
    # "no runs available".
    try:
        all_runs = mlflow.search_runs(experiment_ids=0)
    except Exception:
        all_runs = []
    if len(all_runs) != 0:
        if st.sidebar.checkbox('预览模型'):
            ml_logs = 'http://kubernetes.docker.internal:5000/  -->开启mlflow,命令行输入:mlflow ui'
            st.markdown(ml_logs)
            st.dataframe(all_runs)
        if st.sidebar.checkbox('选择模型'):
            # Only runs tagged as coming from create_model are offered.
            selected_run_id = st.sidebar.selectbox(
                '从已保存模型中选择', all_runs[all_runs['tags.Source'] ==
                                      'create_model']['run_id'].tolist())
            selected_run_info = all_runs[(
                all_runs['run_id'] == selected_run_id)].iloc[0, :]
            st.code(selected_run_info)
            if st.sidebar.button('预测数据'):
                if file_selected_path is None:
                    # Without a data file there is nothing to predict on.
                    st.info('没有选择文件,无法进行分析。')
                else:
                    model_uri = 'runs:/' + selected_run_id + '/model/'
                    model_loaded = mlflow.sklearn.load_model(model_uri)
                    # -1 means "all rows" in this UI; pandas expects None
                    # for that and rejects negative values.
                    row_limit = None if nrows == -1 else nrows
                    df = pd.read_csv(file_selected_path, nrows=row_limit)
                    pred = model_loaded.predict(df)
                    pred_df = pd.DataFrame(pred, columns=['预测值'])
                    st.dataframe(pred_df)
                    # Pass the figure explicitly instead of relying on the
                    # global pyplot figure.
                    axes = pred_df.plot()
                    st.pyplot(axes.get_figure())
    else:
        st.sidebar.warning('没有找到训练好的模型')
Пример #9
0
def classification_model(
    *,
    y_col,
    training_set,
    normalize,
    test_size,
    folds,
    metric,
    model_name,
    testing_set,
    imbalanced,
    seed,
    include_models,
    normalize_method,
):
    """
    Select, tune, finalize and save a classification model.

    The steps run in this order: PyCaret setup, model comparison, model
    creation from the best candidate, tuning, finalization, plot export,
    model export and, when an external test set is supplied, evaluation
    through ``test_classifier``.

    Parameters
    ----------
    y_col : str
        Name of the target column.
    training_set : pd.DataFrame
        Data used for training.
    normalize : bool
        Forwarded to ``setup`` as ``normalize``.
    test_size : float
        Share of the training set held out for testing, in [0.0, 1.0];
        ``1 - test_size`` is passed to ``setup`` as ``train_size``.
    folds : int
        Number of cross-validation folds.
    metric : str
        Metric used to rank and tune models; a falsy value selects 'AUC'.
    model_name : str
        Prefix for every file written by this function.
    testing_set : pd.DataFrame
        External data for evaluating the final model; skipped when its
        index is empty.
    imbalanced : bool
        Forwarded to ``setup`` as ``fix_imbalance``.
    seed : int
        Forwarded to ``setup`` as ``session_id``.
    include_models : List
        Models considered by ``compare_models``.
    normalize_method : str
        Forwarded to ``setup`` as ``normalize_method``.

    Returns
    -------
    The model returned by ``finalize_model``.

    """
    metric = metric or 'AUC'

    pycl.setup(
        data=training_set,
        target=y_col,
        train_size=1 - test_size,
        fold=folds,
        session_id=seed,
        normalize=normalize,
        normalize_method=normalize_method,
        fix_imbalance=imbalanced,
        silent=True,
    )

    # Rank the candidates and keep the comparison table.
    best_model = pycl.compare_models(sort=metric, include=include_models)
    comparison_table = pycl.pull()
    comparison_table.to_csv(model_name + '_compare_models.tsv',
                            sep='\t',
                            index=False)

    # Create the model from the best candidate, tune it, keep the scores.
    cl_tuned_model = pycl.tune_model(pycl.create_model(best_model),
                                     optimize=metric)
    tuning_table = pycl.pull()
    tuning_table.to_csv(model_name + '_tuned_model.tsv',
                        sep='\t',
                        index=False)

    final_model = pycl.finalize_model(cl_tuned_model)
    for plot_kind in ('pr', 'confusion_matrix', 'feature'):
        pycl.plot_model(final_model, plot=plot_kind, save=True)
    pycl.save_model(final_model, model_name)

    # Nothing more to do without external test rows.
    if len(testing_set.index) == 0:
        return final_model

    unseen_predictions = test_classifier(
        model_path=model_name + '.pkl',
        x_set=testing_set.drop(columns=[y_col]),
        y_col=testing_set[y_col],
        output=model_name)
    unseen_predictions.to_csv(model_name + '_external_testing_results.tsv',
                              sep='\t',
                              index=True)
    return final_model
Пример #10
0
from pycaret.classification import (
    automl,
    blend_models,
    create_model,
    interpret_model,
    plot_model,
    pull,
    tune_model,
)

# RECIPE #2 - tune each of the top 5 models
tuned_top5 = list(map(tune_model, top5))
print(len(tuned_top5))

# RECIPE #3 - blend the (untuned) top 5 models
blender = blend_models(top5, verbose=False)
print(blender)

pull()

# Finalize the best model: automl picks by MCC with use_holdout=True.
best_model = automl(optimize='MCC', use_holdout=True)
print(best_model)

# Elapsed time is measured here, before the plotting and interpretation
# steps below.
t1 = time.time()
tt = round(t1 - t0, 4)

plot_model(best_model, plot='confusion_matrix')

xgboost = create_model('xgboost', verbose=False)

interpret_model(xgboost)

print(f"Succesfully Completed in {tt} Seconds")
Пример #11
0
 def create_model(self, model, target):
     """Run the classification setup on ``self.data`` and create ``model``.

     ``setup`` is called with ``silent=True`` and ``html=False``.

     Returns:
         Whatever ``classification.create_model(model)`` returns.
     """
     setup_options = {
         "data": self.data,
         "target": target,
         "silent": True,
         "html": False,
     }
     classification.setup(**setup_options)
     created = classification.create_model(model)
     return created