# NOTE: these examples assume the repo's helpers are already imported:
# os, json, log/log2/log3, load, load_dataset, model_dict_load, preprocess,
# preprocess_load, predict, train, transform, cols_validate, mlflow_register.
def run_predict(config_name,
                config_path,
                n_sample=-1,
                path_data=None,
                path_output=None,
                pars=None,
                model_dict=None):

    model_dict = model_dict_load(model_dict,
                                 config_path,
                                 config_name,
                                 verbose=True)
    m = model_dict['global_pars']

    model_class = model_dict['model_pars']['model_class']
    path_data = m['path_pred_data'] if path_data is None else path_data
    path_pipeline = m['path_pred_pipeline']  # e.g. path_output + "/pipeline/"
    path_model = m['path_pred_model']

    path_output = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {  ### rebuilt from the model config; overrides the `pars` argument
        'cols_group': model_dict['data_pars']['cols_input_type'],
        'pipe_list': model_dict['model_pars']['pre_process_pars']['pipe_list']
    }

    ##########################################################################################
    colid = load(f'{path_pipeline}/colid.pkl')
    df = load_dataset(path_data,
                      path_data_y=None,
                      colid=colid,
                      n_sample=n_sample)

    from run_preprocess import preprocess_inference as preprocess
    dfX, cols_family = preprocess(df, path_pipeline, preprocess_pars=pars)
    ypred, yproba = predict(model_class, path_model, dfX, cols_family)

    log("############ Saving prediction  ###################################################"
        )
    log(ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df[cols_family["coly"] + "_pred"] = ypred
    if yproba is not None:
        df[cols_family["coly"] + "_pred_proba"] = yproba
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    log("###########  Export Specific ######################################################"
        )
    df[cols_family["coly"]] = ypred
    df[[cols_family["coly"]]].to_csv(f"{path_output}/pred_only.csv")
def run_data_check(path_data,
                   path_data_ref,
                   path_model,
                   path_output,
                   sample_ratio=0.5):
    """
     Calculate dataset shift (PSI) between reference and prediction data before predicting.
    """
    from run_preprocess import preprocess_inference as preprocess
    path_output = root + path_output  # 'root' is a module-level base path defined in the repo
    path_data = root + path_data
    path_data_ref = root + path_data_ref
    path_pipeline = root + path_model + "/pipeline/"

    os.makedirs(path_output, exist_ok=True)
    colid = load(f'{path_pipeline}/colid.pkl')

    df1 = load_dataset(path_data_ref, colid=colid)
    dfX1, cols_family1 = preprocess(df1, path_pipeline)

    df2 = load_dataset(path_data, colid=colid)
    dfX2, cols_family2 = preprocess(df2, path_pipeline)

    colsX = cols_family1["colnum_bin"] + cols_family1["colcat_bin"]
    dfX1 = dfX1[colsX]
    dfX2 = dfX2[colsX]

    from util_feature import pd_stat_dataset_shift
    nsample = int(min(len(dfX1), len(dfX2)) * sample_ratio)
    metrics_psi = pd_stat_dataset_shift(dfX2,
                                        dfX1,
                                        colsX,
                                        nsample=nsample,
                                        buckets=7,
                                        axis=0)
    metrics_psi.to_csv(f"{path_output}/prediction_features_metrics.csv")
    log(metrics_psi)
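
# For reference, a minimal self-contained sketch of the Population Stability
# Index that pd_stat_dataset_shift reports (the exact bucketing/smoothing in
# util_feature may differ; this is an illustration, not the repo's code):
import numpy as np

def psi_score(expected, actual, buckets=7):
    """PSI between two 1-D numeric samples, using quantile buckets of `expected`."""
    cuts = np.quantile(expected, np.linspace(0, 1, buckets + 1))
    cuts[0], cuts[-1] = -np.inf, np.inf          # catch values outside the reference range
    e_pct = np.histogram(expected, bins=cuts)[0] / len(expected)
    a_pct = np.histogram(actual, bins=cuts)[0] / len(actual)
    e_pct = np.clip(e_pct, 1e-6, None)           # avoid log(0) on empty buckets
    a_pct = np.clip(a_pct, 1e-6, None)
    return float(np.sum((a_pct - e_pct) * np.log(a_pct / e_pct)))

# e.g. psi_score(dfX1['colnum_bin_0'], dfX2['colnum_bin_0'])  # > 0.2 commonly read as drift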
Example #3
def run_train(config_name,
              path_data_train=None,
              path_output=None,
              config_path="source/config_model.py",
              n_sample=5000,
              mode="run_preprocess",
              model_dict=None):
    """
      Configuration of the model is in the config_model.py file.
    :param config_name:
    :param config_path:
    :param n_sample:
    :return:
    """
    model_dict = model_dict_load(model_dict,
                                 config_path,
                                 config_name,
                                 verbose=True)

    m = model_dict['global_pars']
    path_data_train = m['path_data_train']
    path_train_X = m.get('path_train_X',
                         path_data_train + "/features.zip")  #.zip
    path_train_y = m.get('path_train_y',
                         path_data_train + "/target.zip")  #.zip

    path_output = m['path_train_output']
    # path_model          = m.get('path_model',          path_output + "/model/" )
    path_pipeline = m.get('path_pipeline', path_output + "/pipeline/")
    path_features_store = m.get(
        'path_features_store', path_output + '/features_store/'
    )  #path_data_train replaced with path_output, because preprocessed files are stored there
    path_check_out = m.get('path_check_out', path_output + "/check/")
    log(path_output)

    log("#### load input column family  ##################################################"
        )
    try:
        cols_group = model_dict['data_pars'][
            'cols_input_type']  ### the model config file
    except:
        cols_group = json.load(
            open(path_data_train + "/cols_group.json", mode='r'))
    log(cols_group)
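
    # Hypothetical shape of cols_group / cols_group.json (a dict of column
    # families; the names here are made up):
    # { "colid":  "id",
    #   "coly":   "Survived",
    #   "colnum": ["age", "fare"],
    #   "colcat": ["sex", "embarked"] }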

    log("#### Preprocess  ################################################################"
        )
    preprocess_pars = model_dict['model_pars']['pre_process_pars']
    #filter_pars     = model_dict['data_pars']['filter_pars']

    if mode == "run_preprocess":
        dfXy, cols = preprocess(path_train_X,
                                path_train_y,
                                path_pipeline,
                                cols_group,
                                n_sample,
                                preprocess_pars,
                                path_features_store=path_features_store)

    elif mode == "load_preprocess":  #### Load existing data
        dfXy, cols = preprocess_load(path_train_X,
                                     path_train_y,
                                     path_pipeline,
                                     cols_group,
                                     n_sample,
                                     preprocess_pars,
                                     path_features_store=path_features_store)

    ### Actual columns for label y and input X (colnum, colcat)
    model_dict['data_pars']['coly'] = cols['coly']
    model_dict['data_pars']['cols_model'] = sum([
        cols[colgroup]
        for colgroup in model_dict['data_pars']['cols_model_group']
    ], [])

    log("#### Train model: #############################################################"
        )
    log(str(model_dict)[:1000])
    post_process_fun = model_dict['model_pars']['post_process_fun']
    dfXy, dfXytest = train(model_dict, dfXy, cols, post_process_fun)

    log("#### Export ##################################################################"
        )
    os.makedirs(path_check_out, exist_ok=True)
    colexport = [cols['colid'], cols['coly'], cols['coly'] + "_pred"]
    dfXy[colexport].reset_index().to_csv(path_check_out +
                                         "/pred_check.csv")  # Only results
    dfXy.to_parquet(path_check_out +
                    "/dfX.parquet")  # train input data generate parquet
    #dfXy.to_csv(path_check_out + "/dfX.csv")  # train input data generate csv
    dfXytest.to_parquet(
        path_check_out +
        "/dfXtest.parquet")  # Test input data  generate parquet
    #dfXytest.to_csv(path_check_out + "/dfXtest.csv")  # Test input data  generate csv
    log("######### Finish #############################################################")
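
# Usage sketch: the config name is hypothetical; the config module must define
# 'global_pars' with path_data_train and path_train_output.
run_train(config_name="titanic_lightgbm",
          config_path="source/config_model.py",
          n_sample=5000,
          mode="run_preprocess")
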
Example #4
def run_train(config_name,
              config_path="source/config_model.py",
              n_sample=5000,
              mode="run_preprocess",
              model_dict=None,
              return_mode='file',
              **kw):
    """
      Configuration of the model is in the config_model.py file.
    :param config_name:
    :param config_path:
    :param n_sample:
    :return:
    """
    model_dict = model_dict_load(model_dict,
                                 config_path,
                                 config_name,
                                 verbose=True)

    m = model_dict['global_pars']
    path_data_train = m['path_data_train']
    path_train_X = m.get('path_train_X',
                         path_data_train + "/features.zip")  #.zip
    path_train_y = m.get('path_train_y',
                         path_data_train + "/target.zip")  #.zip

    path_output = m['path_train_output']
    # path_model          = m.get('path_model',          path_output + "/model/" )
    path_pipeline = m.get('path_pipeline', path_output + "/pipeline/")
    path_features_store = m.get(
        'path_features_store', path_output + '/features_store/'
    )  #path_data_train replaced with path_output, because preprocessed files are stored there
    path_check_out = m.get('path_check_out', path_output + "/check/")
    log(path_output)

    log("#### load raw data column family, colum check  ###################################"
        )
    cols_validate(model_dict)
    cols_group = model_dict['data_pars']['cols_input_type']  ### Raw
    log2(cols_group)

    log("#### Preprocess  ################################################################"
        )
    preprocess_pars = model_dict['model_pars']['pre_process_pars']

    if mode == "run_preprocess":
        dfXy, cols = preprocess(
            path_train_X,
            path_train_y,
            path_pipeline,  ### path to save preprocessing pipeline
            cols_group,  ### dict of column family
            n_sample,
            preprocess_pars,
            path_features_store  ### Store intermediate dataframe
        )

    elif mode == "load_preprocess":  #### Load existing data
        dfXy, cols = preprocess_load(path_train_X,
                                     path_train_y,
                                     path_pipeline,
                                     cols_group,
                                     n_sample,
                                     preprocess_pars,
                                     path_features_store=path_features_store)

    log("#### Extract column names  #####################################################"
        )
    ### Actual column names for Model Input :  label y and Input X (colnum , colcat), remove duplicate names
    model_dict['data_pars']['coly'] = cols['coly']
    model_dict['data_pars']['cols_model'] = list(
        set(
            sum([
                cols[colgroup]
                for colgroup in model_dict['data_pars']['cols_model_group']
            ], [])))

    #### Flatten column groups by column type: sparse, continuous, ... (i.e. Neural Network feed input), removing duplicate names
    ##   e.g. 'coldense' = ['colnum'],  'colsparse' = ['colcat']
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type',
                                                       {}).items():
        model_dict['data_pars']['cols_model_type2'][colg] = list(
            set(sum([cols[colgroup] for colgroup in colg_list], [])))
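
    # Worked illustration of the flattening above (made-up names): with
    #   cols      = {'colnum': ['age', 'fare'], 'colcat': ['sex_bin']}
    #   colg_list = ['colnum', 'colcat']
    # sum([...], []) concatenates to ['age', 'fare', 'sex_bin'] and
    # list(set(...)) removes duplicate column names (order not preserved).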

    log("#### Train model: #############################################################"
        )
    log3(str(model_dict)[:1000])
    post_process_fun = model_dict['model_pars']['post_process_fun']
    dfXy, dfXytest, stats = train(model_dict, dfXy, cols, post_process_fun)

    log("#### Register model ##########################################################"
        )
    mlflow_pars = model_dict.get('compute_pars', {}).get('mlflow_pars', None)
    if mlflow_pars is not None:
        mlflow_register(dfXy, model_dict, stats, mlflow_pars)

    log("#### Export ##################################################################"
        )
    if return_mode == 'dict':
        return {'dfXy': dfXy, 'dfXytest': dfXytest, 'stats': stats}

    else:
        os.makedirs(path_check_out, exist_ok=True)
        colexport = [cols['colid'], cols['coly'], cols['coly'] + "_pred"]
        if cols['coly'] + '_proba' in dfXy.columns:
            colexport.append(cols['coly'] + '_proba')
        dfXy[colexport].to_csv(path_check_out + "/pred_check.csv",
                               sep="\t")  # Only results

        dfXy.to_parquet(path_check_out +
                        "/dfX.parquet")  # train input data generate parquet
        dfXytest.to_parquet(
            path_check_out +
            "/dfXtest.parquet")  # Test input data  generate parquet

        #dfXy.to_csv(path_check_out + "/dfX.csv")  # train input data generate csv
        #dfXytest.to_csv(path_check_out + "/dfXtest.csv")  # Test input data  generate csv
        log("######### Finish #############################################################")
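
# Usage sketch: return the trained frames in memory instead of writing files
# (config name is hypothetical):
res = run_train(config_name="titanic_lightgbm",
                config_path="source/config_model.py",
                n_sample=5000,
                return_mode='dict')
dfXy, stats = res['dfXy'], res['stats']
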
Example #5
def run_train(config_name, config_path="source/config_model.py", n_sample=5000,
              mode="run_preprocess", model_dict=None, return_mode='file', **kw):
    """
      Configuration of the model is in the config_model.py file.
    :param config_name:
    :param config_path:
    :param n_sample:
    :return:
    """
    model_dict  = model_dict_load(model_dict, config_path, config_name, verbose=True)

    mlflow_pars = model_dict.get('compute_pars', {}).get('mlflow_pars', None)


    m           = model_dict['global_pars']
    path_data_train   = m['path_data_train']
    path_train_X      = m.get('path_train_X', path_data_train + "/features.zip") #.zip
    path_train_y      = m.get('path_train_y', path_data_train + "/target.zip")   #.zip

    path_output         = m['path_train_output']
    # path_model          = m.get('path_model',          path_output + "/model/" )
    path_pipeline       = m.get('path_pipeline',       path_output + "/pipeline/" )
    path_features_store = m.get('path_features_store', path_output + '/features_store/' )  #path_data_train replaced with path_output, because preprocessed files are stored there
    path_check_out      = m.get('path_check_out',      path_output + "/check/" )
    log(path_output)


    log("#### load input column family  ##################################################")
    try :
        cols_group = model_dict['data_pars']['cols_input_type']  ### the model config file
    except KeyError :
        cols_group = json.load(open(path_data_train + "/cols_group.json", mode='r'))
    log(cols_group)


    log("#### Preprocess  ################################################################")        
    preprocess_pars = model_dict['model_pars']['pre_process_pars']
     
    if mode == "run_preprocess" :
        dfXy, cols      = preprocess(path_train_X, path_train_y,
                                     path_pipeline,    ### path to save preprocessing pipeline
                                     cols_group,       ### dict of column family
                                     n_sample,
                                     preprocess_pars,
                                     path_features_store  ### Store intermediate dataframe
                                     )

    elif mode == "load_preprocess"  :  #### Load existing data
        dfXy, cols      = preprocess_load(path_train_X, path_train_y, path_pipeline, cols_group, n_sample,
                                          preprocess_pars,  path_features_store=path_features_store)


    ### Actual column names for label y and Input X (colnum , colcat) 
    model_dict['data_pars']['coly']       = cols['coly']
    model_dict['data_pars']['cols_model'] = sum([  cols[colgroup] for colgroup in model_dict['data_pars']['cols_model_group'] ]   , [])


    #### Column groups mapped to model input type: sparse, continuous, ... (i.e. Neural Network feed)
    ##   e.g. 'coldense' = ['colnum'],  'colsparse' = ['colcat']
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type', {}).items() :
        model_dict['data_pars']['cols_model_type2'][colg] = sum([  cols[colgroup] for colgroup in colg_list ]   , [])


   
    log("#### Train model: #############################################################")
    log(str(model_dict)[:1000])
    post_process_fun      = model_dict['model_pars']['post_process_fun']
    dfXy, dfXytest, stats = train(model_dict, dfXy, cols, post_process_fun)

    if mlflow_pars is not None:
        log("#### Using mlflow #########################################################")
        # def register(run_name, params, metrics, signature, model_class, tracking_uri= "sqlite:///local.db"):
        from run_mlflow import register
        from mlflow.models.signature import infer_signature

        train_signature = dfXy[model_dict['data_pars']['cols_model']]
        y_signature     = dfXy[model_dict['data_pars']['coly']]
        signature       = infer_signature(train_signature, y_signature)

        register( run_name    = model_dict['global_pars']['config_name'],
                 params       = model_dict['global_pars'],
                 metrics      = stats["metrics_test"],
                 signature    = signature,
                 model_class  = model_dict['model_pars']["model_class"],
                 tracking_uri = mlflow_pars.get( 'tracking_db', "sqlite:///mlflow_local.db")
                )


    if return_mode == 'dict' :
        return { 'dfXy' : dfXy, 'dfXytest': dfXytest, 'stats' : stats   }

    else :
        log("#### Export ##################################################################")
        os.makedirs(path_check_out, exist_ok=True)
        colexport = [cols['colid'], cols['coly'], cols['coly'] + "_pred"]
        dfXy[colexport].reset_index().to_csv(path_check_out + "/pred_check.csv")  # Only results
        dfXy.to_parquet(path_check_out + "/dfX.parquet")  # train input data generate parquet
        #dfXy.to_csv(path_check_out + "/dfX.csv")  # train input data generate csv
        dfXytest.to_parquet(path_check_out + "/dfXtest.parquet")  # Test input data  generate parquet
        #dfXytest.to_csv(path_check_out + "/dfXtest.csv")  # Test input data  generate csv
        log("######### Finish #############################################################", )
Example #6
def run_predict(config_name,
                config_path,
                n_sample=-1,
                path_data=None,
                path_output=None,
                pars=None,
                model_dict=None):

    log("#### Run predict  ###############################################################"
        )
    model_dict = model_dict_load(model_dict,
                                 config_path,
                                 config_name,
                                 verbose=True)
    model_class = model_dict['model_pars']['model_class']

    m = model_dict['global_pars']
    path_data = m['path_pred_data'] if path_data is None else path_data
    path_pipeline = m['path_pred_pipeline']  # e.g. path_output + "/pipeline/"
    path_model = m['path_pred_model']
    path_output = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {
        'cols_group': model_dict['data_pars']['cols_input_type'],
        'pipe_list': model_dict['model_pars']['pre_process_pars']['pipe_list']
    }

    log("#### Run preprocess  ###########################################################"
        )
    from run_preprocess import preprocess_inference as preprocess
    colid = load(f'{path_pipeline}/colid.pkl')
    df = load_dataset(path_data,
                      path_data_y=None,
                      colid=colid,
                      n_sample=n_sample)
    dfX, cols = preprocess(df, path_pipeline, preprocess_pars=pars)
    coly = cols["coly"]

    log("#### Extract column names  #########################################################"
        )
    ### Actual column names for Model Input :  label y and Input X (colnum , colcat), remove duplicate names
    ###  [  'colcat', 'colnum'
    model_dict['data_pars']['coly'] = cols['coly']
    model_dict['data_pars']['cols_model'] = list(
        set(
            sum([
                cols[colgroup]
                for colgroup in model_dict['data_pars']['cols_model_group']
            ], [])))

    #### Flatten Col Group by column type : Sparse, continuous, .... (ie Neural Network feed Input, remove duplicate names
    ## 'coldense' = [ 'colnum' ]     'colsparse' = ['colcat' ]
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type',
                                                       {}).items():
        model_dict['data_pars']['cols_model_type2'][colg] = list(
            set(sum([cols[colgroup] for colgroup in colg_list], [])))

    log("############ Prediction  ##########################################################"
        )
    ypred, yproba = predict(model_class, path_model, dfX, cols, model_dict)

    post_process_fun = model_dict['model_pars']['post_process_fun']
    df[coly + "_pred"] = ypred
    df[coly + "_pred"] = df[coly + "_pred"].apply(post_process_fun)
    if yproba is not None:
        df[coly + "_pred_proba"] = yproba

    log("############ Saving prediction  ###################################################"
        )
    log(ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    log("###########  Export Specific ######################################################"
        )
    df[cols["coly"]] = ypred
    df[[cols["coly"]]].to_csv(f"{path_output}/pred_only.csv")
Example #7
def run_transform(config_name,
                  config_path,
                  n_sample=1,
                  path_data=None,
                  path_output=None,
                  pars=None,
                  model_dict=None,
                  return_mode=""):

    log("##### Run transform ###############################################################"
        )
    model_dict = model_dict_load(model_dict,
                                 config_path,
                                 config_name,
                                 verbose=True)
    model_class = model_dict['model_pars']['model_class']

    m = model_dict['global_pars']
    path_data = m['path_pred_data'] if path_data is None else path_data
    path_pipeline = m['path_pred_pipeline']  # e.g. path_output + "/pipeline/"
    path_model = m['path_pred_model']

    model_file = m.get('model_file', "")  ### New

    path_output = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {
        'cols_group': model_dict['data_pars']['cols_input_type'],
        'pipe_list': model_dict['model_pars']['pre_process_pars']['pipe_list']
    }

    log("##### Load Preprocess ############################################################"
        )
    from run_preprocess import preprocess_inference as preprocess
    colid = load(f'{path_pipeline}/colid.pkl')
    if model_class in SUPERVISED_MODELS:  # SUPERVISED_MODELS is a module-level list defined in the repo
        path_pred_X = m.get('path_pred_X', path_data + "/features.zip")  #.zip
        path_pred_y = m.get('path_pred_y', path_data + "/target.zip")  #.zip
        df = load_dataset(path_pred_X, path_pred_y, colid, n_sample=n_sample)
    else:
        df = load_dataset(path_data, None, colid, n_sample=n_sample)

    dfX, cols = preprocess(df, path_pipeline, preprocess_pars=pars)
    coly = cols["coly"]

    log("#### Extract column names  #######################################################"
        )
    ### Actual column names for Model Input :  label y and Input X (colnum , colcat), remove duplicate names
    model_dict['data_pars']['coly'] = cols['coly']
    model_dict['data_pars']['cols_model'] = list(
        set(
            sum([
                cols[colgroup]
                for colgroup in model_dict['data_pars']['cols_model_group']
            ], [])))

    ####    Col Group by column type : Sparse, continuous, .... (ie Neural Network feed Input, remove duplicate names
    ####   'coldense' = [ 'colnum' ]     'colsparse' = ['colcat' ]
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type',
                                                       {}).items():
        model_dict['data_pars']['cols_model_type2'][colg] = list(
            set(sum([cols[colgroup] for colgroup in colg_list], [])))

    log("############ Task Inference   ###################################################"
        )
    task_type = model_dict['compute_pars'].get('task_inference', 'transform')
    if model_class in SUPERVISED_MODELS:
        dfXy = transform(
            model_file,
            path_model,
            (dfX[[c for c in dfX.columns if c != coly]], df[coly]),  # drop the label column from X (assumes coly is a single column name)
            model_dict,
            task_type=task_type)
    else:
        dfXy = transform(model_file,
                         path_model,
                         dfX,
                         model_dict,
                         task_type=task_type)

    post_process_fun = model_dict['model_pars']['post_process_fun']

    if return_mode == 'dict':
        return {'dfXy': dfXy}

    else:
        log("#### Export ##################################################################"
            )
        path_check_out = m.get('path_check_out', path_output + "/check/")
        os.makedirs(path_check_out, exist_ok=True)
        dfX.to_parquet(path_check_out +
                       "/dfX.parquet")  # train input data generate parquet
        log("######### Finish #############################################################")
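
# Usage sketch: run the transform in memory and collect the output frame
# (config name is hypothetical; 'task_inference' in compute_pars defaults to
# 'transform'):
out = run_transform(config_name="titanic_autoencoder",
                    config_path="source/config_model.py",
                    n_sample=1000,
                    return_mode='dict')
dfXy = out['dfXy']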